diff --git a/Cargo.lock b/Cargo.lock index f5cfcd9c28..2d881fe9b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -182,6 +191,23 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "rustversion" version = "1.0.22" @@ -391,6 +417,7 @@ dependencies = [ "elain", "itertools", "rand", + "regex", "rustversion", "static_assertions", "testutil", diff --git a/Cargo.toml b/Cargo.toml index 83abf80e86..56f4fa11b4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -125,6 +125,7 @@ zerocopy-derive = { version = "=0.8.47", path = "zerocopy-derive" } elain = "0.3.0" itertools = "0.11" rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } +regex = "1.0" rustversion = "1.0" static_assertions = "1.1" testutil = { path = "testutil" } diff --git a/benches/extend_vec_zeroed.rs b/benches/extend_vec_zeroed.rs new file mode 100644 index 0000000000..1fbf772d1e --- /dev/null +++ b/benches/extend_vec_zeroed.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_static_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_extend_vec_zeroed(v: &mut Vec, additional: usize) -> Option<()> { + FromZeros::extend_vec_zeroed(v, additional).ok() +} diff --git a/benches/extend_vec_zeroed.x86-64 b/benches/extend_vec_zeroed.x86-64 new file mode 100644 index 0000000000..831b2a075f --- /dev/null +++ b/benches/extend_vec_zeroed.x86-64 @@ -0,0 +1,60 @@ +bench_extend_vec_zeroed: + push r15 + push r14 + push r13 + push r12 + push rbx + sub rsp, 32 + mov rbx, rdi + mov rax, qword ptr [rdi] + mov r12, qword ptr [rdi + 16] + mov rcx, rax + sub rcx, r12 + cmp rsi, rcx + jbe .LBB6_3 + mov r15, r12 + add r15, rsi + jae .LBB6_6 +.LBB6_2: + xor eax, eax + jmp .LBB6_5 +.LBB6_3: + mov rax, qword ptr [rbx + 8] + lea r15, [r12 + rsi] +.LBB6_4: + lea rcx, [r12 + 2*r12] + lea rdi, [rax + 2*rcx] + add rsi, rsi + lea rdx, [rsi + 2*rsi] + xor esi, esi + call qword ptr [rip + memset@GOTPCREL] + mov qword ptr [rbx + 16], r15 + mov al, 1 +.LBB6_5: + add rsp, 32 + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.LBB6_6: + mov r13, rsi + lea rcx, [rax + rax] + cmp r15, rcx + cmova rcx, r15 + cmp rcx, 5 + mov r14d, 4 + cmovae r14, rcx + mov rdx, qword ptr [rbx + 8] + lea rdi, [rsp + 8] + mov rsi, rax + mov rcx, r14 + call ::finish_grow + cmp dword ptr [rsp + 8], 1 + je .LBB6_2 + mov rax, qword ptr [rsp + 16] + mov qword ptr [rbx + 8], rax + mov qword ptr [rbx], r14 + mov rsi, r13 + jmp .LBB6_4 diff --git a/benches/extend_vec_zeroed.x86-64.mca b/benches/extend_vec_zeroed.x86-64.mca new file mode 100644 index 0000000000..cfab1eea8f --- /dev/null +++ b/benches/extend_vec_zeroed.x86-64.mca @@ -0,0 +1,147 @@ +Iterations: 100 +Instructions: 5400 +Total Cycles: 6595 +Total uOps: 6800 + +Dispatch Width: 4 +uOps Per Cycle: 1.03 +IPC: 0.82 +Block RThroughput: 17.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 2 5 1.00 * push r15 + 2 5 1.00 * push r14 + 2 5 1.00 * push r13 + 2 5 1.00 * push r12 + 2 5 1.00 * push rbx + 1 1 0.33 sub rsp, 32 + 1 1 0.33 mov rbx, rdi + 1 5 0.50 * mov rax, qword ptr [rdi] + 1 5 0.50 * mov r12, qword ptr [rdi + 16] + 1 1 0.33 mov rcx, rax + 1 1 0.33 sub rcx, r12 + 1 1 0.33 cmp rsi, rcx + 1 1 1.00 jbe .LBB6_3 + 1 1 0.33 mov r15, r12 + 1 1 0.33 add r15, rsi + 1 1 1.00 jae .LBB6_6 + 1 0 0.25 xor eax, eax + 1 1 1.00 jmp .LBB6_5 + 1 5 0.50 * mov rax, qword ptr [rbx + 8] + 1 1 0.50 lea r15, [r12 + rsi] + 1 1 0.50 lea rcx, [r12 + 2*r12] + 1 1 0.50 lea rdi, [rax + 2*rcx] + 1 1 0.33 add rsi, rsi + 1 1 0.50 lea rdx, [rsi + 2*rsi] + 1 0 0.25 xor esi, esi + 4 7 1.00 * call qword ptr [rip + memset@GOTPCREL] + 1 1 1.00 * mov qword ptr [rbx + 16], r15 + 1 1 0.33 mov al, 1 + 1 1 0.33 add rsp, 32 + 1 6 0.50 * pop rbx + 1 6 0.50 * pop r12 + 1 6 0.50 * pop r13 + 1 6 0.50 * pop r14 + 1 6 0.50 * pop r15 + 1 1 1.00 U ret + 1 1 0.33 mov r13, rsi + 1 1 0.50 lea rcx, [rax + rax] + 1 1 0.33 cmp r15, rcx + 3 3 1.00 cmova rcx, r15 + 1 1 0.33 cmp rcx, 5 + 1 1 0.33 mov r14d, 4 + 2 2 0.67 cmovae r14, rcx + 1 5 0.50 * mov rdx, qword ptr [rbx + 8] + 1 1 0.50 lea rdi, [rsp + 8] + 1 1 0.33 mov rsi, rax + 1 1 0.33 mov rcx, r14 + 3 5 1.00 call ::finish_grow + 2 6 0.50 * cmp dword ptr [rsp + 8], 1 + 1 1 1.00 je .LBB6_2 + 1 5 0.50 * mov rax, qword ptr [rsp + 16] + 1 1 1.00 * mov qword ptr [rbx + 8], rax + 1 1 1.00 * mov qword ptr [rbx], r14 + 1 1 0.33 mov rsi, r13 + 1 1 1.00 jmp .LBB6_4 + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 12.00 12.00 10.00 13.00 11.00 11.00 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - 0.49 0.51 push r15 + - - - - 1.00 - 0.51 0.49 push r14 + - - - - 1.00 - 0.50 0.50 push r13 + - - - - 1.00 - 0.50 0.50 push r12 + - - - - 1.00 - 0.50 0.50 push rbx + - - 0.01 0.99 - - - - sub rsp, 32 + - - - - - 1.00 - - mov rbx, rdi + - - - - - - 0.50 0.50 mov rax, qword ptr [rdi] + - - - - - - 0.50 0.50 mov r12, qword ptr [rdi + 16] + - - - 1.00 - - - - mov rcx, rax + - - - 0.99 - 0.01 - - sub rcx, r12 + - - - - - 1.00 - - cmp rsi, rcx + - - - - - 1.00 - - jbe .LBB6_3 + - - 0.01 0.98 - 0.01 - - mov r15, r12 + - - 0.99 0.01 - - - - add r15, rsi + - - - - - 1.00 - - jae .LBB6_6 + - - - - - - - - xor eax, eax + - - - - - 1.00 - - jmp .LBB6_5 + - - - - - - 0.50 0.50 mov rax, qword ptr [rbx + 8] + - - 1.00 - - - - - lea r15, [r12 + rsi] + - - 0.98 0.02 - - - - lea rcx, [r12 + 2*r12] + - - 0.99 0.01 - - - - lea rdi, [rax + 2*rcx] + - - - 1.00 - - - - add rsi, rsi + - - 0.99 0.01 - - - - lea rdx, [rsi + 2*rsi] + - - - - - - - - xor esi, esi + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + memset@GOTPCREL] + - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx + 16], r15 + - - 0.01 0.99 - - - - mov al, 1 + - - 1.00 - - - - - add rsp, 32 + - - - - - - 0.50 0.50 pop rbx + - - - - - - 0.50 0.50 pop r12 + - - - - - - 0.50 0.50 pop r13 + - - - - - - 0.50 0.50 pop r14 + - - - - - - 0.50 0.50 pop r15 + - - - - - 1.00 - - ret + - - 1.00 - - - - - mov r13, rsi + - - 0.01 0.99 - - - - lea rcx, [rax + rax] + - - 0.99 0.01 - - - - cmp r15, rcx + - - 2.00 0.01 - 0.99 - - cmova rcx, r15 + - - 0.01 0.99 - - - - cmp rcx, 5 + - - 0.01 0.99 - - - - mov r14d, 4 + - - 1.00 0.01 - 0.99 - - cmovae r14, rcx + - - - - - - 0.50 0.50 mov rdx, qword ptr [rbx + 8] + - - 0.01 0.99 - - - - lea rdi, [rsp + 8] + - - - 1.00 - - - - mov rsi, rax + - - - 0.01 - 0.99 - - mov rcx, r14 + - - - - 1.00 1.00 0.50 0.50 call ::finish_grow + - - - 0.99 - 0.01 0.50 0.50 cmp dword ptr [rsp + 8], 1 + - - - - - 1.00 - - je .LBB6_2 + - - - - - - 0.50 0.50 mov rax, qword ptr [rsp + 16] + - - - - 1.00 - 0.49 0.51 mov qword ptr [rbx + 8], rax + - - - - 1.00 - 0.51 0.49 mov qword ptr [rbx], r14 + - - 0.99 0.01 - - - - mov rsi, r13 + - - - - - 1.00 - - jmp .LBB6_4 diff --git a/benches/insert_vec_zeroed.rs b/benches/insert_vec_zeroed.rs new file mode 100644 index 0000000000..a5d685c2b0 --- /dev/null +++ b/benches/insert_vec_zeroed.rs @@ -0,0 +1,13 @@ +use zerocopy::*; + +#[path = "formats/coco_static_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_insert_vec_zeroed( + v: &mut Vec, + position: usize, + additional: usize, +) -> Option<()> { + FromZeros::insert_vec_zeroed(v, position, additional).ok() +} diff --git a/benches/insert_vec_zeroed.x86-64 b/benches/insert_vec_zeroed.x86-64 new file mode 100644 index 0000000000..9db87403cb --- /dev/null +++ b/benches/insert_vec_zeroed.x86-64 @@ -0,0 +1,79 @@ +bench_insert_vec_zeroed: + push rbp + push r15 + push r14 + push r13 + push r12 + push rbx + sub rsp, 24 + mov r12, qword ptr [rdi + 16] + mov r13, r12 + sub r13, rsi + jb .LBB6_10 + mov rbx, rdi + mov rax, qword ptr [rdi] + mov rcx, rax + sub rcx, r12 + cmp rdx, rcx + jbe .LBB6_4 + add r12, rdx + jae .LBB6_7 +.LBB6_3: + xor eax, eax + jmp .LBB6_6 +.LBB6_4: + mov rax, qword ptr [rbx + 8] + add r12, rdx +.LBB6_5: + lea rcx, [rsi + 2*rsi] + lea r14, [rax + 2*rcx] + add rdx, rdx + lea r15, [rdx + 2*rdx] + lea rdi, [r14 + r15] + add r13, r13 + lea rdx, [2*r13] + add rdx, r13 + mov rsi, r14 + call qword ptr [rip + memmove@GOTPCREL] + mov rdi, r14 + xor esi, esi + mov rdx, r15 + call qword ptr [rip + memset@GOTPCREL] + mov qword ptr [rbx + 16], r12 + mov al, 1 +.LBB6_6: + add rsp, 24 + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + pop rbp + ret +.LBB6_7: + mov r15, rsi + mov rbp, rdx + lea rcx, [rax + rax] + cmp r12, rcx + cmova rcx, r12 + cmp rcx, 5 + mov r14d, 4 + cmovae r14, rcx + mov rdx, qword ptr [rbx + 8] + mov rdi, rsp + mov rsi, rax + mov rcx, r14 + call ::finish_grow + cmp dword ptr [rsp], 1 + je .LBB6_3 + mov rax, qword ptr [rsp + 8] + mov qword ptr [rbx + 8], rax + mov qword ptr [rbx], r14 + mov rdx, rbp + mov rsi, r15 + jmp .LBB6_5 +.LBB6_10: + lea rdi, [rip + .Lanon.HASH.1] + lea rdx, [rip + .Lanon.HASH.3] + mov esi, 37 + call qword ptr [rip + core::panicking::panic@GOTPCREL] diff --git a/benches/insert_vec_zeroed.x86-64.mca b/benches/insert_vec_zeroed.x86-64.mca new file mode 100644 index 0000000000..6652406678 --- /dev/null +++ b/benches/insert_vec_zeroed.x86-64.mca @@ -0,0 +1,183 @@ +Iterations: 100 +Instructions: 7200 +Total Cycles: 7648 +Total uOps: 9300 + +Dispatch Width: 4 +uOps Per Cycle: 1.22 +IPC: 0.94 +Block RThroughput: 23.3 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 2 5 1.00 * push rbp + 2 5 1.00 * push r15 + 2 5 1.00 * push r14 + 2 5 1.00 * push r13 + 2 5 1.00 * push r12 + 2 5 1.00 * push rbx + 1 1 0.33 sub rsp, 24 + 1 5 0.50 * mov r12, qword ptr [rdi + 16] + 1 1 0.33 mov r13, r12 + 1 1 0.33 sub r13, rsi + 1 1 1.00 jb .LBB6_10 + 1 1 0.33 mov rbx, rdi + 1 5 0.50 * mov rax, qword ptr [rdi] + 1 1 0.33 mov rcx, rax + 1 1 0.33 sub rcx, r12 + 1 1 0.33 cmp rdx, rcx + 1 1 1.00 jbe .LBB6_4 + 1 1 0.33 add r12, rdx + 1 1 1.00 jae .LBB6_7 + 1 0 0.25 xor eax, eax + 1 1 1.00 jmp .LBB6_6 + 1 5 0.50 * mov rax, qword ptr [rbx + 8] + 1 1 0.33 add r12, rdx + 1 1 0.50 lea rcx, [rsi + 2*rsi] + 1 1 0.50 lea r14, [rax + 2*rcx] + 1 1 0.33 add rdx, rdx + 1 1 0.50 lea r15, [rdx + 2*rdx] + 1 1 0.50 lea rdi, [r14 + r15] + 1 1 0.33 add r13, r13 + 1 1 0.50 lea rdx, [2*r13] + 1 1 0.33 add rdx, r13 + 1 1 0.33 mov rsi, r14 + 4 7 1.00 * call qword ptr [rip + memmove@GOTPCREL] + 1 1 0.33 mov rdi, r14 + 1 0 0.25 xor esi, esi + 1 1 0.33 mov rdx, r15 + 4 7 1.00 * call qword ptr [rip + memset@GOTPCREL] + 1 1 1.00 * mov qword ptr [rbx + 16], r12 + 1 1 0.33 mov al, 1 + 1 1 0.33 add rsp, 24 + 1 6 0.50 * pop rbx + 1 6 0.50 * pop r12 + 1 6 0.50 * pop r13 + 1 6 0.50 * pop r14 + 1 6 0.50 * pop r15 + 1 6 0.50 * pop rbp + 1 1 1.00 U ret + 1 1 0.33 mov r15, rsi + 1 1 0.33 mov rbp, rdx + 1 1 0.50 lea rcx, [rax + rax] + 1 1 0.33 cmp r12, rcx + 3 3 1.00 cmova rcx, r12 + 1 1 0.33 cmp rcx, 5 + 1 1 0.33 mov r14d, 4 + 2 2 0.67 cmovae r14, rcx + 1 5 0.50 * mov rdx, qword ptr [rbx + 8] + 1 1 0.33 mov rdi, rsp + 1 1 0.33 mov rsi, rax + 1 1 0.33 mov rcx, r14 + 3 5 1.00 call ::finish_grow + 2 6 0.50 * cmp dword ptr [rsp], 1 + 1 1 1.00 je .LBB6_3 + 1 5 0.50 * mov rax, qword ptr [rsp + 8] + 1 1 1.00 * mov qword ptr [rbx + 8], rax + 1 1 1.00 * mov qword ptr [rbx], r14 + 1 1 0.33 mov rdx, rbp + 1 1 0.33 mov rsi, r15 + 1 1 1.00 jmp .LBB6_5 + 1 1 0.50 lea rdi, [rip + .Lanon.HASH.1] + 1 1 0.50 lea rdx, [rip + .Lanon.HASH.3] + 1 1 0.33 mov esi, 37 + 4 7 1.00 * call qword ptr [rip + core::panicking::panic@GOTPCREL] + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 17.02 16.50 13.00 19.48 14.00 14.00 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - 0.98 0.02 push rbp + - - - - 1.00 - 0.02 0.98 push r15 + - - - - 1.00 - 0.99 0.01 push r14 + - - - - 1.00 - 0.01 0.99 push r13 + - - - - 1.00 - 0.99 0.01 push r12 + - - - - 1.00 - 0.01 0.99 push rbx + - - 0.49 0.51 - - - - sub rsp, 24 + - - - - - - 0.04 0.96 mov r12, qword ptr [rdi + 16] + - - 0.49 0.50 - 0.01 - - mov r13, r12 + - - 0.48 0.51 - 0.01 - - sub r13, rsi + - - - - - 1.00 - - jb .LBB6_10 + - - 0.49 0.49 - 0.02 - - mov rbx, rdi + - - - - - - 0.97 0.03 mov rax, qword ptr [rdi] + - - 0.51 0.49 - - - - mov rcx, rax + - - 0.49 0.02 - 0.49 - - sub rcx, r12 + - - 0.49 0.50 - 0.01 - - cmp rdx, rcx + - - - - - 1.00 - - jbe .LBB6_4 + - - 0.02 0.49 - 0.49 - - add r12, rdx + - - - - - 1.00 - - jae .LBB6_7 + - - - - - - - - xor eax, eax + - - - - - 1.00 - - jmp .LBB6_6 + - - - - - - 0.97 0.03 mov rax, qword ptr [rbx + 8] + - - 0.51 0.49 - - - - add r12, rdx + - - 0.49 0.51 - - - - lea rcx, [rsi + 2*rsi] + - - 0.50 0.50 - - - - lea r14, [rax + 2*rcx] + - - 0.51 0.49 - - - - add rdx, rdx + - - 0.50 0.50 - - - - lea r15, [rdx + 2*rdx] + - - 0.49 0.51 - - - - lea rdi, [r14 + r15] + - - 0.50 0.49 - 0.01 - - add r13, r13 + - - 0.51 0.49 - - - - lea rdx, [2*r13] + - - 0.01 0.01 - 0.98 - - add rdx, r13 + - - 0.01 - - 0.99 - - mov rsi, r14 + - - - - 1.00 1.00 1.98 0.02 call qword ptr [rip + memmove@GOTPCREL] + - - 0.49 0.50 - 0.01 - - mov rdi, r14 + - - - - - - - - xor esi, esi + - - 0.50 0.50 - - - - mov rdx, r15 + - - - - 1.00 1.00 1.96 0.04 call qword ptr [rip + memset@GOTPCREL] + - - - - 1.00 - 0.01 0.99 mov qword ptr [rbx + 16], r12 + - - 0.50 - - 0.50 - - mov al, 1 + - - 0.51 0.49 - - - - add rsp, 24 + - - - - - - 0.02 0.98 pop rbx + - - - - - - 0.03 0.97 pop r12 + - - - - - - 0.03 0.97 pop r13 + - - - - - - 0.97 0.03 pop r14 + - - - - - - 0.03 0.97 pop r15 + - - - - - - 0.01 0.99 pop rbp + - - - - - 1.00 - - ret + - - 0.49 0.51 - - - - mov r15, rsi + - - 0.51 0.49 - - - - mov rbp, rdx + - - 0.49 0.51 - - - - lea rcx, [rax + rax] + - - 0.49 0.50 - 0.01 - - cmp r12, rcx + - - 1.04 0.50 - 1.46 - - cmova rcx, r12 + - - 0.49 0.49 - 0.02 - - cmp rcx, 5 + - - 0.50 - - 0.50 - - mov r14d, 4 + - - 0.50 0.51 - 0.99 - - cmovae r14, rcx + - - - - - - 0.97 0.03 mov rdx, qword ptr [rbx + 8] + - - - 0.51 - 0.49 - - mov rdi, rsp + - - 0.01 0.50 - 0.49 - - mov rsi, rax + - - 0.49 0.50 - 0.01 - - mov rcx, r14 + - - - - 1.00 1.00 0.99 0.01 call ::finish_grow + - - 0.51 0.49 - - 0.50 0.50 cmp dword ptr [rsp], 1 + - - - - - 1.00 - - je .LBB6_3 + - - - - - - 0.50 0.50 mov rax, qword ptr [rsp + 8] + - - - - 1.00 - 0.99 0.01 mov qword ptr [rbx + 8], rax + - - - - 1.00 - 0.01 0.99 mov qword ptr [rbx], r14 + - - 0.49 0.50 - 0.01 - - mov rdx, rbp + - - 0.50 0.01 - 0.49 - - mov rsi, r15 + - - - - - 1.00 - - jmp .LBB6_5 + - - 0.01 0.99 - - - - lea rdi, [rip + .Lanon.HASH.1] + - - 0.99 0.01 - - - - lea rdx, [rip + .Lanon.HASH.3] + - - 0.02 0.49 - 0.49 - - mov esi, 37 + - - - - 1.00 1.00 0.02 1.98 call qword ptr [rip + core::panicking::panic@GOTPCREL] diff --git a/benches/new_box_zeroed.rs b/benches/new_box_zeroed.rs new file mode 100644 index 0000000000..aa9a66cce3 --- /dev/null +++ b/benches/new_box_zeroed.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_static_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_new_box_zeroed() -> Option> { + FromZeros::new_box_zeroed().ok() +} diff --git a/benches/new_box_zeroed.x86-64 b/benches/new_box_zeroed.x86-64 new file mode 100644 index 0000000000..ef74ea5388 --- /dev/null +++ b/benches/new_box_zeroed.x86-64 @@ -0,0 +1,7 @@ +bench_new_box_zeroed: + push rax + call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + mov edi, 6 + mov esi, 2 + pop rax + jmp qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] diff --git a/benches/new_box_zeroed.x86-64.mca b/benches/new_box_zeroed.x86-64.mca new file mode 100644 index 0000000000..05afa7feb0 --- /dev/null +++ b/benches/new_box_zeroed.x86-64.mca @@ -0,0 +1,51 @@ +Iterations: 100 +Instructions: 600 +Total Cycles: 1197 +Total uOps: 1100 + +Dispatch Width: 4 +uOps Per Cycle: 0.92 +IPC: 0.50 +Block RThroughput: 2.8 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 2 5 1.00 * push rax + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + 1 1 0.33 mov edi, 6 + 1 1 0.33 mov esi, 2 + 1 6 0.50 * pop rax + 2 6 1.00 * jmp qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 0.99 1.00 2.00 2.01 2.07 2.93 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - 0.93 0.07 push rax + - - - - 1.00 1.00 0.12 1.88 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + - - 0.99 - - 0.01 - - mov edi, 6 + - - - 1.00 - - - - mov esi, 2 + - - - - - - 0.94 0.06 pop rax + - - - - - 1.00 0.08 0.92 jmp qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] diff --git a/benches/new_box_zeroed_with_elems_dynamic_padding.rs b/benches/new_box_zeroed_with_elems_dynamic_padding.rs new file mode 100644 index 0000000000..0afde999bf --- /dev/null +++ b/benches/new_box_zeroed_with_elems_dynamic_padding.rs @@ -0,0 +1,11 @@ +use zerocopy::*; + +#[path = "formats/coco_dynamic_padding.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_new_box_zeroed_with_elems_dynamic_padding( + count: usize, +) -> Option> { + FromZeros::new_box_zeroed_with_elems(count).ok() +} diff --git a/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 new file mode 100644 index 0000000000..0ab9f379a2 --- /dev/null +++ b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 @@ -0,0 +1,36 @@ +bench_new_box_zeroed_with_elems_dynamic_padding: + push r14 + push rbx + push rax + mov rbx, rdi + mov ecx, 3 + mov rax, rdi + mul rcx + jo .LBB5_6 + mov r14, rax + cmp rax, -10 + ja .LBB5_6 + lea rax, [r14 + 9] + not r14d + and r14d, 3 + add r14, rax + setb al + movabs rcx, 9223372036854775803 + cmp r14, rcx + seta cl + or cl, al + je .LBB5_4 +.LBB5_6: + xor eax, eax + jmp .LBB5_5 +.LBB5_4: + call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + mov esi, 4 + mov rdi, r14 + call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] +.LBB5_5: + mov rdx, rbx + add rsp, 8 + pop rbx + pop r14 + ret diff --git a/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca new file mode 100644 index 0000000000..f666a03ce9 --- /dev/null +++ b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca @@ -0,0 +1,103 @@ +Iterations: 100 +Instructions: 3200 +Total Cycles: 2989 +Total uOps: 4300 + +Dispatch Width: 4 +uOps Per Cycle: 1.44 +IPC: 1.07 +Block RThroughput: 10.8 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 2 5 1.00 * push r14 + 2 5 1.00 * push rbx + 2 5 1.00 * push rax + 1 1 0.33 mov rbx, rdi + 1 1 0.33 mov ecx, 3 + 1 1 0.33 mov rax, rdi + 2 4 1.00 mul rcx + 1 1 1.00 jo .LBB5_6 + 1 1 0.33 mov r14, rax + 1 1 0.33 cmp rax, -10 + 1 1 1.00 ja .LBB5_6 + 1 1 0.50 lea rax, [r14 + 9] + 1 1 0.33 not r14d + 1 1 0.33 and r14d, 3 + 1 1 0.33 add r14, rax + 1 1 0.50 setb al + 1 1 0.33 movabs rcx, 9223372036854775803 + 1 1 0.33 cmp r14, rcx + 2 2 1.00 seta cl + 1 1 0.33 or cl, al + 1 1 1.00 je .LBB5_4 + 1 0 0.25 xor eax, eax + 1 1 1.00 jmp .LBB5_5 + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + 1 1 0.33 mov esi, 4 + 1 1 0.33 mov rdi, r14 + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + 1 1 0.33 mov rdx, rbx + 1 1 0.33 add rsp, 8 + 1 6 0.50 * pop rbx + 1 6 0.50 * pop r14 + 1 1 1.00 U ret + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 8.99 8.98 5.00 10.03 4.49 4.51 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - 0.49 0.51 push r14 + - - - - 1.00 - 0.51 0.49 push rbx + - - - - 1.00 - 0.49 0.51 push rax + - - 0.95 0.04 - 0.01 - - mov rbx, rdi + - - - 0.97 - 0.03 - - mov ecx, 3 + - - 0.02 0.02 - 0.96 - - mov rax, rdi + - - 1.00 1.00 - - - - mul rcx + - - - - - 1.00 - - jo .LBB5_6 + - - 0.02 0.97 - 0.01 - - mov r14, rax + - - 0.97 0.03 - - - - cmp rax, -10 + - - - - - 1.00 - - ja .LBB5_6 + - - 0.99 0.01 - - - - lea rax, [r14 + 9] + - - 0.01 0.99 - - - - not r14d + - - 0.97 0.03 - - - - and r14d, 3 + - - 0.01 0.98 - 0.01 - - add r14, rax + - - 1.00 - - - - - setb al + - - 0.02 - - 0.98 - - movabs rcx, 9223372036854775803 + - - - 0.97 - 0.03 - - cmp r14, rcx + - - 2.00 - - - - - seta cl + - - 0.03 0.03 - 0.94 - - or cl, al + - - - - - 1.00 - - je .LBB5_4 + - - - - - - - - xor eax, eax + - - - - - 1.00 - - jmp .LBB5_5 + - - - - 1.00 1.00 1.02 0.98 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + - - 0.03 0.97 - - - - mov esi, 4 + - - 0.96 0.01 - 0.03 - - mov rdi, r14 + - - - - 1.00 1.00 0.98 1.02 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + - - - 0.97 - 0.03 - - mov rdx, rbx + - - 0.01 0.99 - - - - add rsp, 8 + - - - - - - 0.50 0.50 pop rbx + - - - - - - 0.50 0.50 pop r14 + - - - - - 1.00 - - ret diff --git a/benches/new_box_zeroed_with_elems_dynamic_size.rs b/benches/new_box_zeroed_with_elems_dynamic_size.rs new file mode 100644 index 0000000000..1b12ca2206 --- /dev/null +++ b/benches/new_box_zeroed_with_elems_dynamic_size.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_dynamic_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_new_box_zeroed_with_elems_dynamic_size(count: usize) -> Option> { + FromZeros::new_box_zeroed_with_elems(count).ok() +} diff --git a/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 new file mode 100644 index 0000000000..175fff0fd3 --- /dev/null +++ b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 @@ -0,0 +1,22 @@ +bench_new_box_zeroed_with_elems_dynamic_size: + push r14 + push rbx + push rax + mov rbx, rdi + movabs rax, 4611686018427387900 + cmp rdi, rax + jbe .LBB5_2 + xor eax, eax + jmp .LBB5_3 +.LBB5_2: + lea r14, [2*rbx + 4] + call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + mov esi, 2 + mov rdi, r14 + call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] +.LBB5_3: + mov rdx, rbx + add rsp, 8 + pop rbx + pop r14 + ret diff --git a/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca new file mode 100644 index 0000000000..88b5f84b98 --- /dev/null +++ b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca @@ -0,0 +1,77 @@ +Iterations: 100 +Instructions: 1900 +Total Cycles: 2990 +Total uOps: 2800 + +Dispatch Width: 4 +uOps Per Cycle: 0.94 +IPC: 0.64 +Block RThroughput: 7.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 2 5 1.00 * push r14 + 2 5 1.00 * push rbx + 2 5 1.00 * push rax + 1 1 0.33 mov rbx, rdi + 1 1 0.33 movabs rax, 4611686018427387900 + 1 1 0.33 cmp rdi, rax + 1 1 1.00 jbe .LBB5_2 + 1 0 0.25 xor eax, eax + 1 1 1.00 jmp .LBB5_3 + 1 1 0.50 lea r14, [2*rbx + 4] + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + 1 1 0.33 mov esi, 2 + 1 1 0.33 mov rdi, r14 + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + 1 1 0.33 mov rdx, rbx + 1 1 0.33 add rsp, 8 + 1 6 0.50 * pop rbx + 1 6 0.50 * pop r14 + 1 1 1.00 U ret + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 3.97 3.97 5.00 5.06 4.50 4.50 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - 0.50 0.50 push r14 + - - - - 1.00 - 0.50 0.50 push rbx + - - - - 1.00 - 0.50 0.50 push rax + - - 0.94 0.05 - 0.01 - - mov rbx, rdi + - - 0.05 0.95 - - - - movabs rax, 4611686018427387900 + - - 0.95 - - 0.05 - - cmp rdi, rax + - - - - - 1.00 - - jbe .LBB5_2 + - - - - - - - - xor eax, eax + - - - - - 1.00 - - jmp .LBB5_3 + - - - 1.00 - - - - lea r14, [2*rbx + 4] + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + - - 0.06 0.94 - - - - mov esi, 2 + - - 0.94 0.06 - - - - mov rdi, r14 + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + - - 0.05 0.95 - - - - mov rdx, rbx + - - 0.98 0.02 - - - - add rsp, 8 + - - - - - - 0.50 0.50 pop rbx + - - - - - - 0.50 0.50 pop r14 + - - - - - 1.00 - - ret diff --git a/benches/new_vec_zeroed.rs b/benches/new_vec_zeroed.rs new file mode 100644 index 0000000000..3d95b2b24d --- /dev/null +++ b/benches/new_vec_zeroed.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_static_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_new_vec_zeroed(len: usize) -> Option> { + FromZeros::new_vec_zeroed(len).ok() +} diff --git a/benches/new_vec_zeroed.x86-64 b/benches/new_vec_zeroed.x86-64 new file mode 100644 index 0000000000..264fa4a852 --- /dev/null +++ b/benches/new_vec_zeroed.x86-64 @@ -0,0 +1,44 @@ +bench_new_vec_zeroed: + push r15 + push r14 + push r12 + push rbx + push rax + mov rbx, rdi + movabs r12, 9223372036854775805 + mov ecx, 6 + mov rax, rsi + mul rcx + jo .LBB5_6 + cmp rax, r12 + jbe .LBB5_2 +.LBB5_6: + add r12, 3 + mov qword ptr [rbx], r12 + jmp .LBB5_7 +.LBB5_2: + mov r14, rsi + test rax, rax + je .LBB5_3 + mov r15, rax + call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + mov esi, 2 + mov rdi, r15 + call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + test rax, rax + jne .LBB5_5 + jmp .LBB5_6 +.LBB5_3: + mov eax, 2 +.LBB5_5: + mov qword ptr [rbx], r14 + mov qword ptr [rbx + 8], rax + mov qword ptr [rbx + 16], r14 +.LBB5_7: + mov rax, rbx + add rsp, 8 + pop rbx + pop r12 + pop r14 + pop r15 + ret diff --git a/benches/new_vec_zeroed.x86-64.mca b/benches/new_vec_zeroed.x86-64.mca new file mode 100644 index 0000000000..093bbde096 --- /dev/null +++ b/benches/new_vec_zeroed.x86-64.mca @@ -0,0 +1,115 @@ +Iterations: 100 +Instructions: 3800 +Total Cycles: 5277 +Total uOps: 5000 + +Dispatch Width: 4 +uOps Per Cycle: 0.95 +IPC: 0.72 +Block RThroughput: 12.5 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 2 5 1.00 * push r15 + 2 5 1.00 * push r14 + 2 5 1.00 * push r12 + 2 5 1.00 * push rbx + 2 5 1.00 * push rax + 1 1 0.33 mov rbx, rdi + 1 1 0.33 movabs r12, 9223372036854775805 + 1 1 0.33 mov ecx, 6 + 1 1 0.33 mov rax, rsi + 2 4 1.00 mul rcx + 1 1 1.00 jo .LBB5_6 + 1 1 0.33 cmp rax, r12 + 1 1 1.00 jbe .LBB5_2 + 1 1 0.33 add r12, 3 + 1 1 1.00 * mov qword ptr [rbx], r12 + 1 1 1.00 jmp .LBB5_7 + 1 1 0.33 mov r14, rsi + 1 1 0.33 test rax, rax + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov r15, rax + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + 1 1 0.33 mov esi, 2 + 1 1 0.33 mov rdi, r15 + 4 7 1.00 * call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + 1 1 0.33 test rax, rax + 1 1 1.00 jne .LBB5_5 + 1 1 1.00 jmp .LBB5_6 + 1 1 0.33 mov eax, 2 + 1 1 1.00 * mov qword ptr [rbx], r14 + 1 1 1.00 * mov qword ptr [rbx + 8], rax + 1 1 1.00 * mov qword ptr [rbx + 16], r14 + 1 1 0.33 mov rax, rbx + 1 1 0.33 add rsp, 8 + 1 6 0.50 * pop rbx + 1 6 0.50 * pop r12 + 1 6 0.50 * pop r14 + 1 6 0.50 * pop r15 + 1 1 1.00 U ret + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 7.01 7.98 11.00 11.01 8.50 8.50 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - 0.50 0.50 push r15 + - - - - 1.00 - 0.50 0.50 push r14 + - - - - 1.00 - 0.50 0.50 push r12 + - - - - 1.00 - 0.50 0.50 push rbx + - - - - 1.00 - 0.50 0.50 push rax + - - 0.98 0.01 - 0.01 - - mov rbx, rdi + - - 0.01 0.99 - - - - movabs r12, 9223372036854775805 + - - 0.02 - - 0.98 - - mov ecx, 6 + - - - 0.98 - 0.02 - - mov rax, rsi + - - 1.00 1.00 - - - - mul rcx + - - - - - 1.00 - - jo .LBB5_6 + - - 0.99 0.01 - - - - cmp rax, r12 + - - - - - 1.00 - - jbe .LBB5_2 + - - - - - 1.00 - - add r12, 3 + - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx], r12 + - - - - - 1.00 - - jmp .LBB5_7 + - - 0.98 0.02 - - - - mov r14, rsi + - - 0.01 0.99 - - - - test rax, rax + - - - - - 1.00 - - je .LBB5_3 + - - 0.99 0.01 - - - - mov r15, rax + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + - - 0.01 0.99 - - - - mov esi, 2 + - - 0.99 0.01 - - - - mov rdi, r15 + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + - - 0.01 0.99 - - - - test rax, rax + - - - - - 1.00 - - jne .LBB5_5 + - - - - - 1.00 - - jmp .LBB5_6 + - - 0.02 0.98 - - - - mov eax, 2 + - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx], r14 + - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx + 8], rax + - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx + 16], r14 + - - 0.97 0.03 - - - - mov rax, rbx + - - 0.03 0.97 - - - - add rsp, 8 + - - - - - - 0.50 0.50 pop rbx + - - - - - - 0.50 0.50 pop r12 + - - - - - - 0.50 0.50 pop r14 + - - - - - - 0.50 0.50 pop r15 + - - - - - 1.00 - - ret diff --git a/benches/new_zeroed.rs b/benches/new_zeroed.rs new file mode 100644 index 0000000000..b49f62edb1 --- /dev/null +++ b/benches/new_zeroed.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_static_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_new_zeroed() -> format::LocoPacket { + FromZeros::new_zeroed() +} diff --git a/benches/new_zeroed.x86-64 b/benches/new_zeroed.x86-64 new file mode 100644 index 0000000000..b4d305e41f --- /dev/null +++ b/benches/new_zeroed.x86-64 @@ -0,0 +1,3 @@ +bench_new_zeroed: + xor eax, eax + ret diff --git a/benches/new_zeroed.x86-64.mca b/benches/new_zeroed.x86-64.mca new file mode 100644 index 0000000000..44583ca308 --- /dev/null +++ b/benches/new_zeroed.x86-64.mca @@ -0,0 +1,43 @@ +Iterations: 100 +Instructions: 200 +Total Cycles: 103 +Total uOps: 200 + +Dispatch Width: 4 +uOps Per Cycle: 1.94 +IPC: 1.94 +Block RThroughput: 1.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 0 0.25 xor eax, eax + 1 1 1.00 U ret + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - - - - 1.00 - - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - - - - - xor eax, eax + - - - - - 1.00 - - ret diff --git a/benches/zero_dynamic_padding.rs b/benches/zero_dynamic_padding.rs new file mode 100644 index 0000000000..8eda0953d1 --- /dev/null +++ b/benches/zero_dynamic_padding.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_dynamic_padding.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_zero_dynamic_padding(source: &mut format::LocoPacket) { + source.zero() +} diff --git a/benches/zero_dynamic_padding.x86-64 b/benches/zero_dynamic_padding.x86-64 new file mode 100644 index 0000000000..7dccf1745f --- /dev/null +++ b/benches/zero_dynamic_padding.x86-64 @@ -0,0 +1,7 @@ +bench_zero_dynamic_padding: + lea rax, [rsi + 2*rsi] + movabs rdx, 9223372036854775804 + and rdx, rax + add rdx, 12 + xor esi, esi + jmp qword ptr [rip + memset@GOTPCREL] diff --git a/benches/zero_dynamic_padding.x86-64.mca b/benches/zero_dynamic_padding.x86-64.mca new file mode 100644 index 0000000000..098fc10787 --- /dev/null +++ b/benches/zero_dynamic_padding.x86-64.mca @@ -0,0 +1,51 @@ +Iterations: 100 +Instructions: 600 +Total Cycles: 209 +Total uOps: 700 + +Dispatch Width: 4 +uOps Per Cycle: 3.35 +IPC: 2.87 +Block RThroughput: 1.8 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.50 lea rax, [rsi + 2*rsi] + 1 1 0.33 movabs rdx, 9223372036854775804 + 1 1 0.33 and rdx, rax + 1 1 0.33 add rdx, 12 + 1 0 0.25 xor esi, esi + 2 6 1.00 * jmp qword ptr [rip + memset@GOTPCREL] + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 1.66 1.66 - 1.68 0.50 0.50 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - 0.33 0.67 - - - - lea rax, [rsi + 2*rsi] + - - 0.98 - - 0.02 - - movabs rdx, 9223372036854775804 + - - 0.01 0.66 - 0.33 - - and rdx, rax + - - 0.34 0.33 - 0.33 - - add rdx, 12 + - - - - - - - - xor esi, esi + - - - - - 1.00 0.50 0.50 jmp qword ptr [rip + memset@GOTPCREL] diff --git a/benches/zero_dynamic_size.rs b/benches/zero_dynamic_size.rs new file mode 100644 index 0000000000..536d800ebc --- /dev/null +++ b/benches/zero_dynamic_size.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_dynamic_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_zero_dynamic_size(source: &mut format::LocoPacket) { + source.zero() +} diff --git a/benches/zero_dynamic_size.x86-64 b/benches/zero_dynamic_size.x86-64 new file mode 100644 index 0000000000..2b31ed644e --- /dev/null +++ b/benches/zero_dynamic_size.x86-64 @@ -0,0 +1,5 @@ +bench_zero_dynamic_size: + lea rdx, [2*rsi + 5] + and rdx, -2 + xor esi, esi + jmp qword ptr [rip + memset@GOTPCREL] diff --git a/benches/zero_dynamic_size.x86-64.mca b/benches/zero_dynamic_size.x86-64.mca new file mode 100644 index 0000000000..0b086a29b0 --- /dev/null +++ b/benches/zero_dynamic_size.x86-64.mca @@ -0,0 +1,47 @@ +Iterations: 100 +Instructions: 400 +Total Cycles: 142 +Total uOps: 500 + +Dispatch Width: 4 +uOps Per Cycle: 3.52 +IPC: 2.82 +Block RThroughput: 1.3 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.50 lea rdx, [2*rsi + 5] + 1 1 0.33 and rdx, -2 + 1 0 0.25 xor esi, esi + 2 6 1.00 * jmp qword ptr [rip + memset@GOTPCREL] + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - 0.99 1.00 - 1.01 0.50 0.50 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - 0.99 0.01 - - - - lea rdx, [2*rsi + 5] + - - - 0.99 - 0.01 - - and rdx, -2 + - - - - - - - - xor esi, esi + - - - - - 1.00 0.50 0.50 jmp qword ptr [rip + memset@GOTPCREL] diff --git a/benches/zero_static_size.rs b/benches/zero_static_size.rs new file mode 100644 index 0000000000..fa7fa0839c --- /dev/null +++ b/benches/zero_static_size.rs @@ -0,0 +1,9 @@ +use zerocopy::*; + +#[path = "formats/coco_static_size.rs"] +mod format; + +#[unsafe(no_mangle)] +fn bench_zero_static_size(source: &mut format::LocoPacket) { + source.zero() +} diff --git a/benches/zero_static_size.x86-64 b/benches/zero_static_size.x86-64 new file mode 100644 index 0000000000..ced8e18949 --- /dev/null +++ b/benches/zero_static_size.x86-64 @@ -0,0 +1,4 @@ +bench_zero_static_size: + mov word ptr [rdi + 4], 0 + mov dword ptr [rdi], 0 + ret diff --git a/benches/zero_static_size.x86-64.mca b/benches/zero_static_size.x86-64.mca new file mode 100644 index 0000000000..042897ef2d --- /dev/null +++ b/benches/zero_static_size.x86-64.mca @@ -0,0 +1,45 @@ +Iterations: 100 +Instructions: 300 +Total Cycles: 203 +Total uOps: 300 + +Dispatch Width: 4 +uOps Per Cycle: 1.48 +IPC: 1.48 +Block RThroughput: 2.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 1 1.00 * mov word ptr [rdi + 4], 0 + 1 1 1.00 * mov dword ptr [rdi], 0 + 1 1 1.00 U ret + + +Resources: +[0] - SBDivider +[1] - SBFPDivider +[2] - SBPort0 +[3] - SBPort1 +[4] - SBPort4 +[5] - SBPort5 +[6.0] - SBPort23 +[6.1] - SBPort23 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] + - - - - 2.00 1.00 1.00 1.00 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: + - - - - 1.00 - - 1.00 mov word ptr [rdi + 4], 0 + - - - - 1.00 - 1.00 - mov dword ptr [rdi], 0 + - - - - - 1.00 - - ret diff --git a/src/lib.rs b/src/lib.rs index f7ff18a2f9..cf6fb4518c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3458,6 +3458,29 @@ pub unsafe trait FromZeros: TryFromBytes { /// assert_eq!(header.length, [0, 0]); /// assert_eq!(header.checksum, [0, 0]); /// ``` + /// + #[doc = codegen_section!( + header = "h5", + bench = "zero", + format = "coco", + arity = 3, + [ + open + @index 1 + @title "Sized" + @variant "static_size" + ], + [ + @index 2 + @title "Unsized" + @variant "dynamic_size" + ], + [ + @index 3 + @title "Dynamically Padded" + @variant "dynamic_padding" + ] + )] #[inline(always)] fn zero(&mut self) { let slf: *mut Self = self; @@ -3498,6 +3521,12 @@ pub unsafe trait FromZeros: TryFromBytes { /// assert_eq!(header.length, [0, 0]); /// assert_eq!(header.checksum, [0, 0]); /// ``` + /// + #[doc = codegen_section!( + header = "h5", + bench = "new_zeroed", + format = "coco_static_size", + )] #[must_use = "has no side effects"] #[inline(always)] fn new_zeroed() -> Self @@ -3524,6 +3553,12 @@ pub unsafe trait FromZeros: TryFromBytes { /// /// Returns an error on allocation failure. Allocation failure is guaranteed /// never to cause a panic or an abort. + /// + #[doc = codegen_section!( + header = "h5", + bench = "new_box_zeroed", + format = "coco_static_size", + )] #[must_use = "has no side effects (other than allocation)"] #[cfg(any(feature = "alloc", test))] #[cfg_attr(doc_cfg, doc(cfg(feature = "alloc")))] @@ -3589,6 +3624,24 @@ pub unsafe trait FromZeros: TryFromBytes { /// /// Returns an error on allocation failure. Allocation failure is /// guaranteed never to cause a panic or an abort. + /// + #[doc = codegen_section!( + header = "h5", + bench = "new_box_zeroed_with_elems", + format = "coco", + arity = 2, + [ + open + @index 1 + @title "Unsized" + @variant "dynamic_size" + ], + [ + @index 2 + @title "Dynamically Padded" + @variant "dynamic_padding" + ] + )] #[must_use = "has no side effects (other than allocation)"] #[cfg(feature = "alloc")] #[cfg_attr(doc_cfg, doc(cfg(feature = "alloc")))] @@ -3637,6 +3690,12 @@ pub unsafe trait FromZeros: TryFromBytes { /// /// Returns an error on allocation failure. Allocation failure is /// guaranteed never to cause a panic or an abort. + /// + #[doc = codegen_section!( + header = "h5", + bench = "new_vec_zeroed", + format = "coco_static_size", + )] #[must_use = "has no side effects (other than allocation)"] #[cfg(feature = "alloc")] #[cfg_attr(doc_cfg, doc(cfg(feature = "alloc")))] @@ -3650,6 +3709,12 @@ pub unsafe trait FromZeros: TryFromBytes { /// Extends a `Vec` by pushing `additional` new items onto the end of /// the vector. The new items are initialized with zeros. + /// + #[doc = codegen_section!( + header = "h5", + bench = "extend_vec_zeroed", + format = "coco_static_size", + )] #[cfg(not(no_zerocopy_panic_in_const_and_vec_try_reserve_1_57_0))] #[cfg(feature = "alloc")] #[cfg_attr(doc_cfg, doc(cfg(all(rust = "1.57.0", feature = "alloc"))))] @@ -3669,6 +3734,12 @@ pub unsafe trait FromZeros: TryFromBytes { /// # Panics /// /// Panics if `position > v.len()`. + /// + #[doc = codegen_section!( + header = "h5", + bench = "insert_vec_zeroed", + format = "coco_static_size", + )] #[cfg(not(no_zerocopy_panic_in_const_and_vec_try_reserve_1_57_0))] #[cfg(feature = "alloc")] #[cfg_attr(doc_cfg, doc(cfg(all(rust = "1.57.0", feature = "alloc"))))] diff --git a/tests/codegen.rs b/tests/codegen.rs index 4a9dfd6a1d..44c204099b 100644 --- a/tests/codegen.rs +++ b/tests/codegen.rs @@ -10,6 +10,8 @@ use std::{panic, path::PathBuf, process::Command, thread}; +use regex::Regex; + enum Directive { Asm, Mca, @@ -49,6 +51,7 @@ fn run_codegen_test(bench_name: &str, target_cpu: &str, bless: bool) { manifest_path, "--target-dir", target_dir, + "--all-features", "--bench", bench_name, "--target-cpu", @@ -61,16 +64,15 @@ fn run_codegen_test(bench_name: &str, target_cpu: &str, bless: bool) { .expect("failed to execute process") }; + let re = Regex::new(r"(\.Lanon\.)[0-z]+(\.\d+)").unwrap(); + let test_directive = |directive: Directive| { let output = cargo_asm(&directive); - let actual_result = output.stdout; + let actual_result = String::from_utf8_lossy(&output.stdout); + let actual_result = re.replace_all(&actual_result, "${1}HASH${2}"); if !(output.status.success()) { - panic!( - "{}\n{}", - String::from_utf8_lossy(&actual_result), - String::from_utf8_lossy(&output.stderr) - ); + panic!("{}\n{}", &actual_result, String::from_utf8_lossy(&output.stderr)); } let expected_file_path = { @@ -83,10 +85,10 @@ fn run_codegen_test(bench_name: &str, target_cpu: &str, bless: bool) { }; if bless { - std::fs::write(expected_file_path, &actual_result).unwrap(); + std::fs::write(expected_file_path, actual_result.as_bytes()).unwrap(); } else { let expected_result = std::fs::read(expected_file_path).unwrap_or_default(); - if actual_result != expected_result { + if actual_result.as_bytes() != expected_result { let expected = String::from_utf8_lossy(&expected_result[..]); panic!("Bless codegen tests with BLESS=1\nGot unexpected output:\n{}", expected); } diff --git a/vendor/aho-corasick/.cargo-checksum.json b/vendor/aho-corasick/.cargo-checksum.json new file mode 100644 index 0000000000..8eac3049f6 --- /dev/null +++ b/vendor/aho-corasick/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"c9b1b15e299ba4e6ed0d6f25cde30b26b13b6068a7fbd980000c37bca19b0104","DESIGN.md":"64ff45ea2a89d4c32b29af91acb7743a861fcac417cb94fde8e6559405d603b2","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"5999e5768f5da8ab9b50c016766b5185b4c79936c56bef6d311ddcb0a38c4b94","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/ahocorasick.rs":"b92c9a65c4ee8029ff5a710aa1514caf838e73072c177dff5375463769f0b1ce","src/automaton.rs":"931af0aad03079bc4f6400d573fce832ce1edeeaf196815a16750d57b54b2183","src/buffer.rs":"dae7ee7c1f846ca9cf115ba4949484000e1837b4fb7311f8d8c9a35011c9c26f","src/byte_frequencies.rs":"2fb85b381c038c1e44ce94294531cdcd339dca48b1e61f41455666e802cbbc9e","src/classes.rs":"99a53a2ed8eea8c13699def90e31dfdff9d0b90572b1db3cb534e3396e7a0ed0","src/dfa.rs":"25e4455b3e179a7e192108d05f3683993456b36e3ebed99f827558c52525b7e6","src/error.rs":"d34c2c9c815df5d9dedc46b4b3ce109cd2cee07825de643f0c574ec960367beb","src/lib.rs":"7a47d4c87f83e0e7ddf0777a71e4858904e73477ce18404cb89e656070e86aef","src/nfa.rs":"3b817b4aa85540e8c0d35aff7ed7cfbab70ec7d2aaa779d63b4f5369bff31ce1","src/packed/api.rs":"df42e7500c94c9de1ac44145a0dd99ea02047e6bba229da12f2575337beebcf0","src/packed/mod.rs":"ad2f8e18996737347a1181a4457387276d139315bcabfc5e34494af0c0319701","src/packed/pattern.rs":"3abf3835d4c4f8a43753c52936a894d819f713f233fc046e19de5ef95200dcce","src/packed/rabinkarp.rs":"ad7d4533f96aed336e29c5553657ae57b0d733ace9c707a6cf4d08d8fc6edee5","src/packed/teddy/README.md":"b4b83fb5afafbbea6cb76fe70f49cc8ced888f682d98abe5ea5773e95d9ec2b0","src/packed/teddy/compile.rs":"aad40b3f93d2c388b409b31fb2795d414a365237789d5b1a7510d97ceb8ce260","src/packed/teddy/mod.rs":"83b52bd80272970ad17234d0db293d17c1710ec582302bf516b203c8edec037e","src/packed/teddy/runtime.rs":"836146e90b320b14fa2c65fe4af7915a41f6fb04408aac5fac731c22ff46adae","src/packed/tests.rs":"b8dc4d3281ecd6d0fa2bf7ef16cf292a467dfdce64e470c7921e983bfa60fee2","src/packed/vector.rs":"ab3c0535fca5f09198d58cbfae44c292aeb3ce44bc92bca36d30dc72963639fc","src/prefilter.rs":"82a3eb6d5c0c3f10bc8d5f57d55d6d14cf4cf21c475bb5253e1921084063b8d7","src/state_id.rs":"519ec8c7bf3fa72103d4c561c193759759f535dca924c9853efe630f406d2029","src/tests.rs":"ee9b85f3c27cb2fba5796e5be8019aafecc13ee9a4f614553f2bc8953f51c6de"},"package":"cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"} \ No newline at end of file diff --git a/vendor/aho-corasick/COPYING b/vendor/aho-corasick/COPYING new file mode 100644 index 0000000000..bb9c20a094 --- /dev/null +++ b/vendor/aho-corasick/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/vendor/aho-corasick/Cargo.toml b/vendor/aho-corasick/Cargo.toml new file mode 100644 index 0000000000..ee8e94e4c0 --- /dev/null +++ b/vendor/aho-corasick/Cargo.toml @@ -0,0 +1,50 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "aho-corasick" +version = "0.7.20" +authors = ["Andrew Gallant "] +exclude = ["/aho-corasick-debug"] +autotests = false +description = "Fast multiple substring searching." +homepage = "https://github.com/BurntSushi/aho-corasick" +readme = "README.md" +keywords = [ + "string", + "search", + "text", + "aho", + "multi", +] +categories = ["text-processing"] +license = "Unlicense OR MIT" +repository = "https://github.com/BurntSushi/aho-corasick" + +[profile.bench] +debug = true + +[profile.release] +debug = true + +[lib] +name = "aho_corasick" + +[dependencies.memchr] +version = "2.4.0" +default-features = false + +[dev-dependencies] + +[features] +default = ["std"] +std = ["memchr/std"] diff --git a/vendor/aho-corasick/DESIGN.md b/vendor/aho-corasick/DESIGN.md new file mode 100644 index 0000000000..0e15ad0d87 --- /dev/null +++ b/vendor/aho-corasick/DESIGN.md @@ -0,0 +1,483 @@ +This document describes the internal design of this crate, which is an object +lesson in what happens when you take a fairly simple old algorithm like +Aho-Corasick and make it fast and production ready. + +The target audience of this document is Rust programmers that have some +familiarity with string searching, however, one does not need to know the +Aho-Corasick algorithm in order to read this (it is explained below). One +should, however, know what a trie is. (If you don't, go read its Wikipedia +article.) + +The center-piece of this crate is an implementation of Aho-Corasick. On its +own, Aho-Corasick isn't that complicated. The complex pieces come from the +different variants of Aho-Corasick implemented in this crate. Specifically, +they are: + +* Aho-Corasick as an NFA, using dense transitions near the root with sparse + transitions elsewhere. +* Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct + and uses less memory.) + * A DFA with pre-multiplied state identifiers. This saves a multiplication + instruction in the core search loop. + * A DFA with equivalence classes of bytes as the alphabet, instead of the + traditional 256-byte alphabet. This shrinks the size of the DFA in memory, + but adds an extra lookup in the core search loop to map the input byte to + an equivalent class. +* The option to choose how state identifiers are represented, via one of + u8, u16, u32, u64 or usize. This permits creating compact automatons when + matching a small number of patterns. +* Supporting "standard" match semantics, along with its overlapping variant, + in addition to leftmost-first and leftmost-longest semantics. The "standard" + semantics are typically what you see in a textbook description of + Aho-Corasick. However, Aho-Corasick is also useful as an optimization in + regex engines, which often use leftmost-first or leftmost-longest semantics. + Thus, it is useful to implement those semantics here. The "standard" and + "leftmost" search algorithms are subtly different, and also require slightly + different construction algorithms. +* Support for ASCII case insensitive matching. +* Support for accelerating searches when the patterns all start with a small + number of fixed bytes. Or alternatively, when the patterns all contain a + small number of rare bytes. (Searching for these bytes uses SIMD vectorized + code courtesy of `memchr`.) +* Transparent support for alternative SIMD vectorized search routines for + smaller number of literals, such as the Teddy algorithm. We called these + "packed" search routines because they use SIMD. They can often be an order of + magnitude faster than just Aho-Corasick, but don't scale as well. +* Support for searching streams. This can reuse most of the underlying code, + but does require careful buffering support. +* Support for anchored searches, which permit efficient `is_prefix` checks for + a large number of patterns. + +When you combine all of this together along with trying to make everything as +fast as possible, what you end up with is enitrely too much code with too much +`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead, +we will explain it. + + +# Basics + +The fundamental problem this crate is trying to solve is to determine the +occurrences of possibly many patterns in a haystack. The naive way to solve +this is to look for a match for each pattern at each position in the haystack: + + for i in 0..haystack.len(): + for p in patterns.iter(): + if haystack[i..].starts_with(p.bytes()): + return Match(p.id(), i, i + p.bytes().len()) + +Those four lines are effectively all this crate does. The problem with those +four lines is that they are very slow, especially when you're searching for a +large number of patterns. + +While there are many different algorithms available to solve this, a popular +one is Aho-Corasick. It's a common solution because it's not too hard to +implement, scales quite well even when searching for thousands of patterns and +is generally pretty fast. Aho-Corasick does well here because, regardless of +the number of patterns you're searching for, it always visits each byte in the +haystack exactly once. This means, generally speaking, adding more patterns to +an Aho-Corasick automaton does not make it slower. (Strictly speaking, however, +this is not true, since a larger automaton will make less effective use of the +CPU's cache.) + +Aho-Corasick can be succinctly described as a trie with state transitions +between some of the nodes that efficiently instruct the search algorithm to +try matching alternative keys in the automaton. The trick is that these state +transitions are arranged such that each byte of input needs to be inspected +only once. These state transitions are typically called "failure transitions," +because they instruct the searcher (the thing traversing the automaton while +reading from the haystack) what to do when a byte in the haystack does not +correspond to a valid transition in the current state of the trie. + +More formally, a failure transition points to a state in the automaton that may +lead to a match whose prefix is a proper suffix of the path traversed through +the trie so far. (If no such proper suffix exists, then the failure transition +points back to the start state of the trie, effectively restarting the search.) +This is perhaps simpler to explain pictorally. For example, let's say we built +an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The +trie looks like this: + + a - S1 - b - S2 - c - S3 - d - S4* + / + S0 - c - S5 - e - S6 - f - S7* + +where states marked with a `*` are match states (meaning, the search algorithm +should stop and report a match to the caller). + +So given this trie, it should be somewhat straight-forward to see how it can +be used to determine whether any particular haystack *starts* with either +`abcd` or `cef`. It's easy to express this in code: + + fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool { + let mut state_id = trie.start(); + // If the empty pattern is in trie, then state_id is a match state. + if trie.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + state_id = match trie.next_state(state_id, b) { + Some(id) => id, + // If there was no transition for this state and byte, then we know + // the haystack does not start with one of the patterns in our trie. + None => return false, + }; + if trie.is_match(state_id) { + return true; + } + } + false + } + +And that's pretty much it. All we do is move through the trie starting with the +bytes at the beginning of the haystack. If we find ourselves in a position +where we can't move, or if we've looked through the entire haystack without +seeing a match state, then we know the haystack does not start with any of the +patterns in the trie. + +The meat of the Aho-Corasick algorithm is in how we add failure transitions to +our trie to keep searching efficient. Specifically, it permits us to not only +check whether a haystack *starts* with any one of a number of patterns, but +rather, whether the haystack contains any of a number of patterns *anywhere* in +the haystack. + +As mentioned before, failure transitions connect a proper suffix of the path +traversed through the trie before, with a path that leads to a match that has a +prefix corresponding to that proper suffix. So in our case, for patterns `abcd` +and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from +the diagram above) from `S3` upon seeing that the byte following `c` is not +`d`. Namely, the proper suffix in this example is `c`, which is a prefix of +`cef`. So the modified diagram looks like this: + + + a - S1 - b - S2 - c - S3 - d - S4* + / / + / ---------------- + / / + S0 - c - S5 - e - S6 - f - S7* + +One thing that isn't shown in this diagram is that *all* states have a failure +transition, but only `S3` has a *non-trivial* failure transition. That is, all +other states have a failure transition back to the start state. So if our +haystack was `abzabcd`, then the searcher would transition back to `S0` after +seeing `z`, which effectively restarts the search. (Because there is no pattern +in our trie that has a prefix of `bz` or `z`.) + +The code for traversing this *automaton* or *finite state machine* (it is no +longer just a trie) is not that much different from the `has_prefix` code +above: + + fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool { + let mut state_id = fsm.start(); + // If the empty pattern is in fsm, then state_id is a match state. + if fsm.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + // While the diagram above doesn't show this, we may wind up needing + // to follow multiple failure transitions before we land on a state + // in which we can advance. Therefore, when searching for the next + // state, we need to loop until we don't see a failure transition. + // + // This loop terminates because the start state has no empty + // transitions. Every transition from the start state either points to + // another state, or loops back to the start state. + loop { + match fsm.next_state(state_id, b) { + Some(id) => { + state_id = id; + break; + } + // Unlike our code above, if there was no transition for this + // state, then we don't quit. Instead, we look for this state's + // failure transition and follow that instead. + None => { + state_id = fsm.next_fail_state(state_id); + } + }; + } + if fsm.is_match(state_id) { + return true; + } + } + false + } + +Other than the complication around traversing failure transitions, this code +is still roughly "traverse the automaton with bytes from the haystack, and quit +when a match is seen." + +And that concludes our section on the basics. While we didn't go deep into +how the automaton is built (see `src/nfa.rs`, which has detailed comments about +that), the basic structure of Aho-Corasick should be reasonably clear. + + +# NFAs and DFAs + +There are generally two types of finite automata: non-deterministic finite +automata (NFA) and deterministic finite automata (DFA). The difference between +them is, principally, that an NFA can be in multiple states at once. This is +typically accomplished by things called _epsilon_ transitions, where one could +move to a new state without consuming any bytes from the input. (The other +mechanism by which NFAs can be in more than one state is where the same byte in +a particular state transitions to multiple distinct states.) In contrast, a DFA +can only ever be in one state at a time. A DFA has no epsilon transitions, and +for any given state, a byte transitions to at most one other state. + +By this formulation, the Aho-Corasick automaton described in the previous +section is an NFA. This is because failure transitions are, effectively, +epsilon transitions. That is, whenever the automaton is in state `S`, it is +actually in the set of states that are reachable by recursively following +failure transitions from `S`. (This means that, for example, the start state +is always active since the start state is reachable via failure transitions +from any state in the automaton.) + +NFAs have a lot of nice properties. They tend to be easier to construct, and +also tend to use less memory. However, their primary downside is that they are +typically slower to execute. For example, the code above showing how to search +with an Aho-Corasick automaton needs to potentially iterate through many +failure transitions for every byte of input. While this is a fairly small +amount of overhead, this can add up, especially if the automaton has a lot of +overlapping patterns with a lot of failure transitions. + +A DFA's search code, by contrast, looks like this: + + fn contains(dfa: &DFA, haystack: &[u8]) -> bool { + let mut state_id = dfa.start(); + // If the empty pattern is in dfa, then state_id is a match state. + if dfa.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + // An Aho-Corasick DFA *never* has a missing state that requires + // failure transitions to be followed. One byte of input advances the + // automaton by one state. Always. + state_id = dfa.next_state(state_id, b); + if dfa.is_match(state_id) { + return true; + } + } + false + } + +The search logic here is much simpler than for the NFA, and this tends to +translate into significant performance benefits as well, since there's a lot +less work being done for each byte in the haystack. How is this accomplished? +It's done by pre-following all failure transitions for all states for all bytes +in the alphabet, and then building a single state transition table. Building +this DFA can be much more costly than building the NFA, and use much more +memory, but the better performance can be worth it. + +Users of this crate can actually choose between using an NFA or a DFA. By +default, an NFA is used, because it typically strikes the best balance between +space usage and search performance. But the DFA option is available for cases +where a little extra memory and upfront time building the automaton is okay. +For example, the `AhoCorasick::auto_configure` and +`AhoCorasickBuilder::auto_configure` methods will enable the DFA setting if +there are a small number of patterns. + + +# More DFA tricks + +As described in the previous section, one of the downsides of using a DFA +is that it uses more memory and can take longer to build. One small way of +mitigating these concerns is to map the alphabet used by the automaton into +a smaller space. Typically, the alphabet of a DFA has 256 elements in it: +one element for each possible value that fits into a byte. However, in many +cases, one does not need the full alphabet. For example, if all patterns in an +Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct +bytes. As far as the automaton is concerned, the rest of the 204 bytes are +indistinguishable from one another: they will never disrciminate between a +match or a non-match. Therefore, in cases like that, the alphabet can be shrunk +to just 53 elements. One for each ASCII letter, and then another to serve as a +placeholder for every other unused byte. + +In practice, this library doesn't quite compute the optimal set of equivalence +classes, but it's close enough in most cases. The key idea is that this then +allows the transition table for the DFA to be potentially much smaller. The +downside of doing this, however, is that since the transition table is defined +in terms of this smaller alphabet space, every byte in the haystack must be +re-mapped to this smaller space. This requires an additional 256-byte table. +In practice, this can lead to a small search time hit, but it can be difficult +to measure. Moreover, it can sometimes lead to faster search times for bigger +automata, since it could be difference between more parts of the automaton +staying in the CPU cache or not. + +One other trick for DFAs employed by this crate is the notion of premultiplying +state identifiers. Specifically, the normal way to compute the next transition +in a DFA is via the following (assuming that the transition table is laid out +sequentially in memory, in row-major order, where the rows are states): + + next_state_id = dfa.transitions[current_state_id * 256 + current_byte] + +However, since the value `256` is a fixed constant, we can actually premultiply +the state identifiers in the table when we build the table initially. Then, the +next transition computation simply becomes: + + next_state_id = dfa.transitions[current_state_id + current_byte] + +This doesn't seem like much, but when this is being executed for every byte of +input that you're searching, saving that extra multiplication instruction can +add up. + +The same optimization works even when equivalence classes are enabled, as +described above. The only difference is that the premultiplication is by the +total number of equivalence classes instead of 256. + +There isn't much downside to premultiplying state identifiers, other than the +fact that you may need to choose a bigger integer representation than you would +otherwise. For example, if you don't premultiply state identifiers, then an +automaton that uses `u8` as a state identifier can hold up to 256 states. +However, if they are premultiplied, then it can only hold up to +`floor(256 / len(alphabet))` states. Thus premultiplication impacts how compact +your DFA can be. In practice, it's pretty rare to use `u8` as a state +identifier, so premultiplication is usually a good thing to do. + +Both equivalence classes and premultiplication are tuneable parameters via the +`AhoCorasickBuilder` type, and both are enabled by default. + + +# Match semantics + +One of the more interesting things about this implementation of Aho-Corasick +that (as far as this author knows) separates it from other implementations, is +that it natively supports leftmost-first and leftmost-longest match semantics. +Briefly, match semantics refer to the decision procedure by which searching +will disambiguate matches when there are multiple to choose from: + +* **standard** match semantics emits matches as soon as they are detected by + the automaton. This is typically equivalent to the textbook non-overlapping + formulation of Aho-Corasick. +* **leftmost-first** match semantics means that 1) the next match is the match + starting at the leftmost position and 2) among multiple matches starting at + the same leftmost position, the match corresponding to the pattern provided + first by the caller is reported. +* **leftmost-longest** is like leftmost-first, except when there are multiple + matches starting at the same leftmost position, the pattern corresponding to + the longest match is returned. + +(The crate API documentation discusses these differences, with examples, in +more depth on the `MatchKind` type.) + +The reason why supporting these match semantics is important is because it +gives the user more control over the match procedure. For example, +leftmost-first permits users to implement match priority by simply putting the +higher priority patterns first. Leftmost-longest, on the other hand, permits +finding the longest possible match, which might be useful when trying to find +words matching a dictionary. Additionally, regex engines often want to use +Aho-Corasick as an optimization when searching for an alternation of literals. +In order to preserve correct match semantics, regex engines typically can't use +the standard textbook definition directly, since regex engines will implement +either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics. + +Supporting leftmost semantics requires a couple key changes: + +* Constructing the Aho-Corasick automaton changes a bit in both how the trie is + constructed and how failure transitions are found. Namely, only a subset of + the failure transitions are added. Specifically, only the failure transitions + that either do not occur after a match or do occur after a match but preserve + that match are kept. (More details on this can be found in `src/nfa.rs`.) +* The search algorithm changes slightly. Since we are looking for the leftmost + match, we cannot quit as soon as a match is detected. Instead, after a match + is detected, we must keep searching until either the end of the input or + until a dead state is seen. (Dead states are not used for standard match + semantics. Dead states mean that searching should stop after a match has been + found.) + +Other implementations of Aho-Corasick do support leftmost match semantics, but +they do it with more overhead at search time, or even worse, with a queue of +matches and sophisticated hijinks to disambiguate the matches. While our +construction algorithm becomes a bit more complicated, the correct match +semantics fall out from the structure of the automaton itself. + + +# Overlapping matches + +One of the nice properties of an Aho-Corasick automaton is that it can report +all possible matches, even when they overlap with one another. In this mode, +the match semantics don't matter, since all possible matches are reported. +Overlapping searches work just like regular searches, except the state +identifier at which the previous search left off is carried over to the next +search, so that it can pick up where it left off. If there are additional +matches at that state, then they are reported before resuming the search. + +Enabling leftmost-first or leftmost-longest match semantics causes the +automaton to use a subset of all failure transitions, which means that +overlapping searches cannot be used. Therefore, if leftmost match semantics are +used, attempting to do an overlapping search will panic. Thus, to get +overlapping searches, the caller must use the default standard match semantics. +This behavior was chosen because there are only two alternatives, which were +deemed worse: + +* Compile two automatons internally, one for standard semantics and one for + the semantics requested by the caller (if not standard). +* Create a new type, distinct from the `AhoCorasick` type, which has different + capabilities based on the configuration options. + +The first is untenable because of the amount of memory used by the automaton. +The second increases the complexity of the API too much by adding too many +types that do similar things. It is conceptually much simpler to keep all +searching isolated to a single type. Callers may query whether the automaton +supports overlapping searches via the `AhoCorasick::supports_overlapping` +method. + + +# Stream searching + +Since Aho-Corasick is an automaton, it is possible to do partial searches on +partial parts of the haystack, and then resume that search on subsequent pieces +of the haystack. This is useful when the haystack you're trying to search is +not stored contiguously in memory, or if one does not want to read the entire +haystack into memory at once. + +Currently, only standard semantics are supported for stream searching. This is +some of the more complicated code in this crate, and is something I would very +much like to improve. In particular, it currently has the restriction that it +must buffer at least enough of the haystack in memory in order to fit the +longest possible match. The difficulty in getting stream searching right is +that the implementation choices (such as the buffer size) often impact what the +API looks like and what it's allowed to do. + + +# Prefilters + +In some cases, Aho-Corasick is not the fastest way to find matches containing +multiple patterns. Sometimes, the search can be accelerated using highly +optimized SIMD routines. For example, consider searching the following +patterns: + + Sherlock + Moriarty + Watson + +It is plausible that it would be much faster to quickly look for occurrences of +the leading bytes, `S`, `M` or `W`, before trying to start searching via the +automaton. Indeed, this is exactly what this crate will do. + +When there are more than three distinct starting bytes, then this crate will +look for three distinct bytes occurring at any position in the patterns, while +preferring bytes that are heuristically determined to be rare over others. For +example: + + Abuzz + Sanchez + Vasquez + Topaz + Waltz + +Here, we have more than 3 distinct starting bytes, but all of the patterns +contain `z`, which is typically a rare byte. In this case, the prefilter will +scan for `z`, back up a bit, and then execute the Aho-Corasick automaton. + +If all of that fails, then a packed multiple substring algorithm will be +attempted. Currently, the only algorithm available for this is Teddy, but more +may be added in the future. Teddy is unlike the above prefilters in that it +confirms its own matches, so when Teddy is active, it might not be necessary +for Aho-Corasick to run at all. (See `Automaton::leftmost_find_at_no_state_imp` +in `src/automaton.rs`.) However, the current Teddy implementation only works +in `x86_64` and when SSSE3 or AVX2 are available, and moreover, only works +_well_ when there are a small number of patterns (say, less than 100). Teddy +also requires the haystack to be of a certain length (more than 16-34 bytes). +When the haystack is shorter than that, Rabin-Karp is used instead. (See +`src/packed/rabinkarp.rs`.) + +There is a more thorough description of Teddy at +[`src/packed/teddy/README.md`](src/packed/teddy/README.md). diff --git a/vendor/aho-corasick/LICENSE-MIT b/vendor/aho-corasick/LICENSE-MIT new file mode 100644 index 0000000000..3b0a5dc09c --- /dev/null +++ b/vendor/aho-corasick/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/aho-corasick/README.md b/vendor/aho-corasick/README.md new file mode 100644 index 0000000000..f033e01834 --- /dev/null +++ b/vendor/aho-corasick/README.md @@ -0,0 +1,187 @@ +aho-corasick +============ +A library for finding occurrences of many patterns at once with SIMD +acceleration in some cases. This library provides multiple pattern +search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a finite state machine for executing searches in linear time. +Features include case insensitive matching, overlapping matches, fast searching +via SIMD and optional full DFA construction and search & replace in streams. + +[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) +[![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick) + +Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). + + +### Documentation + +https://docs.rs/aho-corasick + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +aho-corasick = "0.7" +``` + + +### Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + + +### Example: case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +```rust +use aho_corasick::AhoCorasickBuilder; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + + +### Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. +let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns); +ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) + .expect("stream_replace_all failed"); +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +``` + + +### Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. + +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. For example, here's the standard approach: + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +```rust +use aho_corasick::{AhoCorasickBuilder, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See `MatchKind` in the docs for more details. + + +### Minimum Rust version policy + +This crate's minimum supported `rustc` version is `1.41.1`. + +The current policy is that the minimum Rust version required to use this crate +can be increased in minor version updates. For example, if `crate 1.0` requires +Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust +1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum +version of Rust. + +In general, this crate will be conservative with respect to the minimum +supported version of Rust. + + +### FFI bindings + +* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/) +is a Python wrapper for this library. + + +### Future work + +Here are some plans for the future: + +* Assuming the current API is sufficient, I'd like to commit to it and release + a `1.0` version of this crate some time in the next 6-12 months. +* Support stream searching with leftmost match semantics. Currently, only + standard match semantics are supported. Getting this right seems possible, + but is tricky since the match state needs to be propagated through multiple + searches. (With standard semantics, as soon as a match is seen the search + ends.) diff --git a/vendor/aho-corasick/UNLICENSE b/vendor/aho-corasick/UNLICENSE new file mode 100644 index 0000000000..68a49daad8 --- /dev/null +++ b/vendor/aho-corasick/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/vendor/aho-corasick/rustfmt.toml b/vendor/aho-corasick/rustfmt.toml new file mode 100644 index 0000000000..aa37a218b9 --- /dev/null +++ b/vendor/aho-corasick/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/vendor/aho-corasick/src/ahocorasick.rs b/vendor/aho-corasick/src/ahocorasick.rs new file mode 100644 index 0000000000..cfd74fde34 --- /dev/null +++ b/vendor/aho-corasick/src/ahocorasick.rs @@ -0,0 +1,2141 @@ +use std::io; + +use crate::automaton::Automaton; +use crate::buffer::Buffer; +use crate::dfa::{self, DFA}; +use crate::error::Result; +use crate::nfa::{self, NFA}; +use crate::packed; +use crate::prefilter::{Prefilter, PrefilterState}; +use crate::state_id::StateID; +use crate::Match; + +/// An automaton for searching multiple strings in linear time. +/// +/// The `AhoCorasick` type supports a few basic ways of constructing an +/// automaton, including +/// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new) +/// and +/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). +/// However, there are a fair number of configurable options that can be set +/// by using +/// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) +/// instead. Such options include, but are not limited to, how matches are +/// determined, simple case insensitivity, whether to use a DFA or not and +/// various knobs for controlling the space-vs-time trade offs taken when +/// building the automaton. +/// +/// If you aren't sure where to start, try beginning with +/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). +/// +/// # Resource usage +/// +/// Aho-Corasick automatons are always constructed in `O(p)` time, where `p` +/// is the combined length of all patterns being searched. With that said, +/// building an automaton can be fairly costly because of high constant +/// factors, particularly when enabling the +/// [DFA](struct.AhoCorasickBuilder.html#method.dfa) +/// option (which is disabled by default). For this reason, it's generally a +/// good idea to build an automaton once and reuse it as much as possible. +/// +/// Aho-Corasick automatons can also use a fair bit of memory. To get a +/// concrete idea of how much memory is being used, try using the +/// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes) +/// method. +/// +/// # Examples +/// +/// This example shows how to search for occurrences of multiple patterns +/// simultaneously in a case insensitive fashion. Each match includes the +/// pattern that matched along with the byte offsets of the match. +/// +/// ``` +/// use aho_corasick::AhoCorasickBuilder; +/// +/// let patterns = &["apple", "maple", "snapple"]; +/// let haystack = "Nobody likes maple in their apple flavored Snapple."; +/// +/// let ac = AhoCorasickBuilder::new() +/// .ascii_case_insensitive(true) +/// .build(patterns); +/// let mut matches = vec![]; +/// for mat in ac.find_iter(haystack) { +/// matches.push((mat.pattern(), mat.start(), mat.end())); +/// } +/// assert_eq!(matches, vec![ +/// (1, 13, 18), +/// (0, 28, 33), +/// (2, 43, 50), +/// ]); +/// ``` +/// +/// This example shows how to replace matches with some other string: +/// +/// ``` +/// use aho_corasick::AhoCorasick; +/// +/// let patterns = &["fox", "brown", "quick"]; +/// let haystack = "The quick brown fox."; +/// let replace_with = &["sloth", "grey", "slow"]; +/// +/// let ac = AhoCorasick::new(patterns); +/// let result = ac.replace_all(haystack, replace_with); +/// assert_eq!(result, "The slow grey sloth."); +/// ``` +#[derive(Clone, Debug)] +pub struct AhoCorasick { + imp: Imp, + match_kind: MatchKind, +} + +impl AhoCorasick { + /// Create a new Aho-Corasick automaton using the default configuration. + /// + /// The default configuration optimizes for less space usage, but at the + /// expense of longer search times. To change the configuration, use + /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) + /// for fine-grained control, or + /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) + /// for automatic configuration if you aren't sure which settings to pick. + /// + /// This uses the default + /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) + /// match semantics, which reports a match as soon as it is found. This + /// corresponds to the standard match semantics supported by textbook + /// descriptions of the Aho-Corasick algorithm. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "baz", + /// ]); + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// ``` + pub fn new(patterns: I) -> AhoCorasick + where + I: IntoIterator, + P: AsRef<[u8]>, + { + AhoCorasickBuilder::new().build(patterns) + } + + /// Build an Aho-Corasick automaton with an automatically determined + /// configuration. + /// + /// Specifically, this requires a slice of patterns instead of an iterator + /// since the configuration is determined by looking at the patterns before + /// constructing the automaton. The idea here is to balance space and time + /// automatically. That is, when searching a small number of patterns, this + /// will attempt to use the fastest possible configuration since the total + /// space required will be small anyway. As the number of patterns grows, + /// this will fall back to slower configurations that use less space. + /// + /// If you want auto configuration but with match semantics different from + /// the default `MatchKind::Standard`, then use + /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure). + /// + /// # Examples + /// + /// Basic usage is just like `new`, except you must provide a slice: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new_auto_configured(&[ + /// "foo", "bar", "baz", + /// ]); + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// ``` + pub fn new_auto_configured(patterns: &[B]) -> AhoCorasick + where + B: AsRef<[u8]>, + { + AhoCorasickBuilder::new().auto_configure(patterns).build(patterns) + } +} + +impl AhoCorasick { + /// Returns true if and only if this automaton matches the haystack at any + /// position. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "quux", "baz", + /// ]); + /// assert!(ac.is_match("xxx bar xxx")); + /// assert!(!ac.is_match("xxx qux xxx")); + /// ``` + pub fn is_match>(&self, haystack: B) -> bool { + self.earliest_find(haystack).is_some() + } + + /// Returns the location of the first detected match in `haystack`. + /// + /// This method has the same behavior regardless of the + /// [`MatchKind`](enum.MatchKind.html) + /// of this automaton. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "abc", "b", + /// ]); + /// let mat = ac.earliest_find("abcd").expect("should have match"); + /// assert_eq!(1, mat.pattern()); + /// assert_eq!((1, 2), (mat.start(), mat.end())); + /// ``` + pub fn earliest_find>(&self, haystack: B) -> Option { + let mut prestate = PrefilterState::new(self.max_pattern_len()); + let mut start = self.imp.start_state(); + self.imp.earliest_find_at( + &mut prestate, + haystack.as_ref(), + 0, + &mut start, + ) + } + + /// Returns the location of the first match according to the match + /// semantics that this automaton was constructed with. + /// + /// When using `MatchKind::Standard`, this corresponds precisely to the + /// same behavior as + /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find). + /// Otherwise, match semantics correspond to either + /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) + /// or + /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest). + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Now with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// And finally, leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); + /// ``` + pub fn find>(&self, haystack: B) -> Option { + let mut prestate = PrefilterState::new(self.max_pattern_len()); + self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0) + } + + /// Returns an iterator of non-overlapping matches, using the match + /// semantics that this automaton was constructed with. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns); + /// let matches: Vec = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![2, 2, 2], matches); + /// ``` + /// + /// Now with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let matches: Vec = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0, 2, 0], matches); + /// ``` + /// + /// And finally, leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns); + /// let matches: Vec = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0, 2, 1], matches); + /// ``` + pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindIter<'a, 'b, S> { + FindIter::new(self, haystack.as_ref()) + } + + /// Returns an iterator of overlapping matches in the given `haystack`. + /// + /// Overlapping matches can _only_ be detected using + /// `MatchKind::Standard` semantics. If this automaton was constructed with + /// leftmost semantics, then this method will panic. To determine whether + /// this will panic at runtime, use the + /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping) + /// method. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Panics + /// + /// This panics when `AhoCorasick::supports_overlapping` returns `false`. + /// That is, this panics when this automaton's match semantics are not + /// `MatchKind::Standard`. + /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns); + /// let matches: Vec = ac + /// .find_overlapping_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches); + /// ``` + pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindOverlappingIter<'a, 'b, S> { + FindOverlappingIter::new(self, haystack.as_ref()) + } + + /// Replace all matches with a corresponding value in the `replace_with` + /// slice given. Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// # Panics + /// + /// This panics when `replace_with.len()` does not equal the total number + /// of patterns that are matched by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let result = ac.replace_all(haystack, &["x", "y", "z"]); + /// assert_eq!("x the z to the xage", result); + /// ``` + pub fn replace_all(&self, haystack: &str, replace_with: &[B]) -> String + where + B: AsRef, + { + assert_eq!( + replace_with.len(), + self.pattern_count(), + "replace_all requires a replacement for every pattern \ + in the automaton" + ); + let mut dst = String::with_capacity(haystack.len()); + self.replace_all_with(haystack, &mut dst, |mat, _, dst| { + dst.push_str(replace_with[mat.pattern()].as_ref()); + true + }); + dst + } + + /// Replace all matches using raw bytes with a corresponding value in the + /// `replace_with` slice given. Matches correspond to the same matches as + /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// # Panics + /// + /// This panics when `replace_with.len()` does not equal the total number + /// of patterns that are matched by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); + /// assert_eq!(b"x the z to the xage".to_vec(), result); + /// ``` + pub fn replace_all_bytes( + &self, + haystack: &[u8], + replace_with: &[B], + ) -> Vec + where + B: AsRef<[u8]>, + { + assert_eq!( + replace_with.len(), + self.pattern_count(), + "replace_all_bytes requires a replacement for every pattern \ + in the automaton" + ); + let mut dst = Vec::with_capacity(haystack.len()); + self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { + dst.extend(replace_with[mat.pattern()].as_ref()); + true + }); + dst + } + + /// Replace all matches using a closure called on each match. + /// Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a string buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. If the closure returns `false`, then searching is stopped. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let mut result = String::new(); + /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().to_string()); + /// true + /// }); + /// assert_eq!("0 the 2 to the 0age", result); + /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = "append the app to the appendage"; + /// # let ac = AhoCorasickBuilder::new() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns); + /// let mut result = String::new(); + /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { + /// dst.push_str(&mat.pattern().to_string()); + /// mat.pattern() != 2 + /// }); + /// assert_eq!("0 the 2 to the appendage", result); + /// ``` + pub fn replace_all_with( + &self, + haystack: &str, + dst: &mut String, + mut replace_with: F, + ) where + F: FnMut(&Match, &str, &mut String) -> bool, + { + let mut last_match = 0; + for mat in self.find_iter(haystack) { + dst.push_str(&haystack[last_match..mat.start()]); + last_match = mat.end(); + if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { + break; + }; + } + dst.push_str(&haystack[last_match..]); + } + + /// Replace all matches using raw bytes with a closure called on each + /// match. Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a byte buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. If the closure returns `false`, then searching is stopped. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let mut result = vec![]; + /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().to_string().bytes()); + /// true + /// }); + /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); + /// ``` + /// + /// Stopping the replacement by returning `false` (continued from the + /// example above): + /// + /// ``` + /// # use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// # let patterns = &["append", "appendage", "app"]; + /// # let haystack = b"append the app to the appendage"; + /// # let ac = AhoCorasickBuilder::new() + /// # .match_kind(MatchKind::LeftmostFirst) + /// # .build(patterns); + /// let mut result = vec![]; + /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { + /// dst.extend(mat.pattern().to_string().bytes()); + /// mat.pattern() != 2 + /// }); + /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); + /// ``` + pub fn replace_all_with_bytes( + &self, + haystack: &[u8], + dst: &mut Vec, + mut replace_with: F, + ) where + F: FnMut(&Match, &[u8], &mut Vec) -> bool, + { + let mut last_match = 0; + for mat in self.find_iter(haystack) { + dst.extend(&haystack[last_match..mat.start()]); + last_match = mat.end(); + if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) { + break; + }; + } + dst.extend(&haystack[last_match..]); + } + + /// Returns an iterator of non-overlapping matches in the given + /// stream. Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// The matches yielded by this iterator use absolute position offsets in + /// the stream given, where the first byte has index `0`. Matches are + /// yieled until the stream is exhausted. + /// + /// Each item yielded by the iterator is an `io::Result`, where an + /// error is yielded if there was a problem reading from the reader given. + /// + /// When searching a stream, an internal buffer is used. Therefore, callers + /// should avoiding providing a buffered reader, if possible. + /// + /// Searching a stream requires that the automaton was built with + /// `MatchKind::Standard` semantics. If this automaton was constructed + /// with leftmost semantics, then this method will panic. To determine + /// whether this will panic at runtime, use the + /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) + /// method. + /// + /// # Memory usage + /// + /// In general, searching streams will use a constant amount of memory for + /// its internal buffer. The one requirement is that the internal buffer + /// must be at least the size of the longest possible match. In most use + /// cases, the default buffer size will be much larger than any individual + /// match. + /// + /// # Panics + /// + /// This panics when `AhoCorasick::supports_stream` returns `false`. + /// That is, this panics when this automaton's match semantics are not + /// `MatchKind::Standard`. This restriction may be lifted in the future. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// # fn example() -> Result<(), ::std::io::Error> { + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns); + /// let mut matches = vec![]; + /// for result in ac.stream_find_iter(haystack.as_bytes()) { + /// let mat = result?; + /// matches.push(mat.pattern()); + /// } + /// assert_eq!(vec![2, 2, 2], matches); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn stream_find_iter<'a, R: io::Read>( + &'a self, + rdr: R, + ) -> StreamFindIter<'a, R, S> { + StreamFindIter::new(self, rdr) + } + + /// Search for and replace all matches of this automaton in + /// the given reader, and write the replacements to the given + /// writer. Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// After all matches are replaced, the writer is _not_ flushed. + /// + /// If there was a problem reading from the given reader or writing to the + /// given writer, then the corresponding `io::Error` is returned and all + /// replacement is stopped. + /// + /// When searching a stream, an internal buffer is used. Therefore, callers + /// should avoiding providing a buffered reader, if possible. However, + /// callers may want to provide a buffered writer. + /// + /// Searching a stream requires that the automaton was built with + /// `MatchKind::Standard` semantics. If this automaton was constructed + /// with leftmost semantics, then this method will panic. To determine + /// whether this will panic at runtime, use the + /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) + /// method. + /// + /// # Memory usage + /// + /// In general, searching streams will use a constant amount of memory for + /// its internal buffer. The one requirement is that the internal buffer + /// must be at least the size of the longest possible match. In most use + /// cases, the default buffer size will be much larger than any individual + /// match. + /// + /// # Panics + /// + /// This panics when `AhoCorasick::supports_stream` returns `false`. + /// That is, this panics when this automaton's match semantics are not + /// `MatchKind::Standard`. This restriction may be lifted in the future. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// # fn example() -> Result<(), ::std::io::Error> { + /// let patterns = &["fox", "brown", "quick"]; + /// let haystack = "The quick brown fox."; + /// let replace_with = &["sloth", "grey", "slow"]; + /// + /// let ac = AhoCorasick::new(patterns); + /// let mut result = vec![]; + /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?; + /// assert_eq!(b"The slow grey sloth.".to_vec(), result); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn stream_replace_all( + &self, + rdr: R, + wtr: W, + replace_with: &[B], + ) -> io::Result<()> + where + R: io::Read, + W: io::Write, + B: AsRef<[u8]>, + { + assert_eq!( + replace_with.len(), + self.pattern_count(), + "stream_replace_all requires a replacement for every pattern \ + in the automaton" + ); + self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| { + wtr.write_all(replace_with[mat.pattern()].as_ref()) + }) + } + + /// Search the given reader and replace all matches of this automaton + /// using the given closure. The result is written to the given + /// writer. Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and the writer with which to write the replaced text (if any). + /// + /// After all matches are replaced, the writer is _not_ flushed. + /// + /// If there was a problem reading from the given reader or writing to the + /// given writer, then the corresponding `io::Error` is returned and all + /// replacement is stopped. + /// + /// When searching a stream, an internal buffer is used. Therefore, callers + /// should avoiding providing a buffered reader, if possible. However, + /// callers may want to provide a buffered writer. + /// + /// Searching a stream requires that the automaton was built with + /// `MatchKind::Standard` semantics. If this automaton was constructed + /// with leftmost semantics, then this method will panic. To determine + /// whether this will panic at runtime, use the + /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) + /// method. + /// + /// # Memory usage + /// + /// In general, searching streams will use a constant amount of memory for + /// its internal buffer. The one requirement is that the internal buffer + /// must be at least the size of the longest possible match. In most use + /// cases, the default buffer size will be much larger than any individual + /// match. + /// + /// # Panics + /// + /// This panics when `AhoCorasick::supports_stream` returns `false`. + /// That is, this panics when this automaton's match semantics are not + /// `MatchKind::Standard`. This restriction may be lifted in the future. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::io::Write; + /// use aho_corasick::AhoCorasick; + /// + /// # fn example() -> Result<(), ::std::io::Error> { + /// let patterns = &["fox", "brown", "quick"]; + /// let haystack = "The quick brown fox."; + /// + /// let ac = AhoCorasick::new(patterns); + /// let mut result = vec![]; + /// ac.stream_replace_all_with( + /// haystack.as_bytes(), + /// &mut result, + /// |mat, _, wtr| { + /// wtr.write_all(mat.pattern().to_string().as_bytes()) + /// }, + /// )?; + /// assert_eq!(b"The 2 1 0.".to_vec(), result); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn stream_replace_all_with( + &self, + rdr: R, + mut wtr: W, + mut replace_with: F, + ) -> io::Result<()> + where + R: io::Read, + W: io::Write, + F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>, + { + let mut it = StreamChunkIter::new(self, rdr); + while let Some(result) = it.next() { + let chunk = result?; + match chunk { + StreamChunk::NonMatch { bytes, .. } => { + wtr.write_all(bytes)?; + } + StreamChunk::Match { bytes, mat } => { + replace_with(&mat, bytes, &mut wtr)?; + } + } + } + Ok(()) + } + + /// Returns the match kind used by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, MatchKind}; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "quux", "baz", + /// ]); + /// assert_eq!(&MatchKind::Standard, ac.match_kind()); + /// ``` + pub fn match_kind(&self) -> &MatchKind { + self.imp.match_kind() + } + + /// Returns the length of the longest pattern matched by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "quux", "baz", + /// ]); + /// assert_eq!(4, ac.max_pattern_len()); + /// ``` + pub fn max_pattern_len(&self) -> usize { + self.imp.max_pattern_len() + } + + /// Return the total number of patterns matched by this automaton. + /// + /// This includes patterns that may never participate in a match. For + /// example, if + /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst) + /// match semantics are used, and the patterns `Sam` and `Samwise` were + /// used to build the automaton, then `Samwise` can never participate in a + /// match because `Sam` will always take priority. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "baz", + /// ]); + /// assert_eq!(3, ac.pattern_count()); + /// ``` + pub fn pattern_count(&self) -> usize { + self.imp.pattern_count() + } + + /// Returns true if and only if this automaton supports reporting + /// overlapping matches. + /// + /// If this returns false and overlapping matches are requested, then it + /// will result in a panic. + /// + /// Since leftmost matching is inherently incompatible with overlapping + /// matches, only + /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) + /// supports overlapping matches. This is unlikely to change in the future. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) + /// .build(&["foo", "bar", "baz"]); + /// assert!(ac.supports_overlapping()); + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(&["foo", "bar", "baz"]); + /// assert!(!ac.supports_overlapping()); + /// ``` + pub fn supports_overlapping(&self) -> bool { + self.match_kind.supports_overlapping() + } + + /// Returns true if and only if this automaton supports stream searching. + /// + /// If this returns false and stream searching (or replacing) is attempted, + /// then it will result in a panic. + /// + /// Currently, only + /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) + /// supports streaming. This may be expanded in the future. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) + /// .build(&["foo", "bar", "baz"]); + /// assert!(ac.supports_stream()); + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(&["foo", "bar", "baz"]); + /// assert!(!ac.supports_stream()); + /// ``` + pub fn supports_stream(&self) -> bool { + self.match_kind.supports_stream() + } + + /// Returns the approximate total amount of heap used by this automaton, in + /// units of bytes. + /// + /// # Examples + /// + /// This example shows the difference in heap usage between a few + /// configurations: + /// + /// ```ignore + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let ac = AhoCorasickBuilder::new() + /// .dfa(false) // default + /// .build(&["foo", "bar", "baz"]); + /// assert_eq!(10_336, ac.heap_bytes()); + /// + /// let ac = AhoCorasickBuilder::new() + /// .dfa(false) // default + /// .ascii_case_insensitive(true) + /// .build(&["foo", "bar", "baz"]); + /// assert_eq!(10_384, ac.heap_bytes()); + /// + /// let ac = AhoCorasickBuilder::new() + /// .dfa(true) + /// .ascii_case_insensitive(true) + /// .build(&["foo", "bar", "baz"]); + /// assert_eq!(1_248, ac.heap_bytes()); + /// ``` + pub fn heap_bytes(&self) -> usize { + match self.imp { + Imp::NFA(ref nfa) => nfa.heap_bytes(), + Imp::DFA(ref dfa) => dfa.heap_bytes(), + } + } +} + +/// The internal implementation of Aho-Corasick, which is either an NFA or +/// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses +/// more memory. +#[derive(Clone, Debug)] +enum Imp { + NFA(NFA), + DFA(DFA), +} + +impl Imp { + /// Returns the type of match semantics implemented by this automaton. + fn match_kind(&self) -> &MatchKind { + match *self { + Imp::NFA(ref nfa) => nfa.match_kind(), + Imp::DFA(ref dfa) => dfa.match_kind(), + } + } + + /// Returns the identifier of the start state. + fn start_state(&self) -> S { + match *self { + Imp::NFA(ref nfa) => nfa.start_state(), + Imp::DFA(ref dfa) => dfa.start_state(), + } + } + + /// The length, in bytes, of the longest pattern in this automaton. This + /// information is useful for maintaining correct buffer sizes when + /// searching on streams. + fn max_pattern_len(&self) -> usize { + match *self { + Imp::NFA(ref nfa) => nfa.max_pattern_len(), + Imp::DFA(ref dfa) => dfa.max_pattern_len(), + } + } + + /// The total number of patterns added to this automaton. This includes + /// patterns that may never match. The maximum matching pattern that can be + /// reported is exactly one less than this number. + fn pattern_count(&self) -> usize { + match *self { + Imp::NFA(ref nfa) => nfa.pattern_count(), + Imp::DFA(ref dfa) => dfa.pattern_count(), + } + } + + /// Returns the prefilter object, if one exists, for the underlying + /// automaton. + fn prefilter(&self) -> Option<&dyn Prefilter> { + match *self { + Imp::NFA(ref nfa) => nfa.prefilter(), + Imp::DFA(ref dfa) => dfa.prefilter(), + } + } + + /// Returns true if and only if we should attempt to use a prefilter. + fn use_prefilter(&self) -> bool { + let p = match self.prefilter() { + None => return false, + Some(p) => p, + }; + !p.looks_for_non_start_of_match() + } + + #[inline(always)] + fn overlapping_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut S, + match_index: &mut usize, + ) -> Option { + match *self { + Imp::NFA(ref nfa) => nfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + Imp::DFA(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + } + } + + #[inline(always)] + fn earliest_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut S, + ) -> Option { + match *self { + Imp::NFA(ref nfa) => { + nfa.earliest_find_at(prestate, haystack, at, state_id) + } + Imp::DFA(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + } + } + + #[inline(always)] + fn find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + match *self { + Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at), + Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at), + } + } +} + +/// An iterator of non-overlapping matches in a particular haystack. +/// +/// This iterator yields matches according to the +/// [`MatchKind`](enum.MatchKind.html) +/// used by this automaton. +/// +/// This iterator is constructed via the +/// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter) +/// method. +/// +/// The type variable `S` refers to the representation used for state +/// identifiers. (By default, this is `usize`.) +/// +/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. +/// +/// The lifetime `'b` refers to the lifetime of the haystack being searched. +#[derive(Debug)] +pub struct FindIter<'a, 'b, S: StateID> { + fsm: &'a Imp, + prestate: PrefilterState, + haystack: &'b [u8], + pos: usize, +} + +impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> { + fn new(ac: &'a AhoCorasick, haystack: &'b [u8]) -> FindIter<'a, 'b, S> { + let prestate = PrefilterState::new(ac.max_pattern_len()); + FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 } + } +} + +impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> { + type Item = Match; + + fn next(&mut self) -> Option { + if self.pos > self.haystack.len() { + return None; + } + let result = self.fsm.find_at_no_state( + &mut self.prestate, + self.haystack, + self.pos, + ); + let mat = match result { + None => return None, + Some(mat) => mat, + }; + if mat.end() == self.pos { + // If the automaton can match the empty string and if we found an + // empty match, then we need to forcefully move the position. + self.pos += 1; + } else { + self.pos = mat.end(); + } + Some(mat) + } +} + +/// An iterator of overlapping matches in a particular haystack. +/// +/// This iterator will report all possible matches in a particular haystack, +/// even when the matches overlap. +/// +/// This iterator is constructed via the +/// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter) +/// method. +/// +/// The type variable `S` refers to the representation used for state +/// identifiers. (By default, this is `usize`.) +/// +/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. +/// +/// The lifetime `'b` refers to the lifetime of the haystack being searched. +#[derive(Debug)] +pub struct FindOverlappingIter<'a, 'b, S: StateID> { + fsm: &'a Imp, + prestate: PrefilterState, + haystack: &'b [u8], + pos: usize, + state_id: S, + match_index: usize, +} + +impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> { + fn new( + ac: &'a AhoCorasick, + haystack: &'b [u8], + ) -> FindOverlappingIter<'a, 'b, S> { + assert!( + ac.supports_overlapping(), + "automaton does not support overlapping searches" + ); + let prestate = PrefilterState::new(ac.max_pattern_len()); + FindOverlappingIter { + fsm: &ac.imp, + prestate, + haystack, + pos: 0, + state_id: ac.imp.start_state(), + match_index: 0, + } + } +} + +impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> { + type Item = Match; + + fn next(&mut self) -> Option { + let result = self.fsm.overlapping_find_at( + &mut self.prestate, + self.haystack, + self.pos, + &mut self.state_id, + &mut self.match_index, + ); + match result { + None => return None, + Some(m) => { + self.pos = m.end(); + Some(m) + } + } + } +} + +/// An iterator that reports Aho-Corasick matches in a stream. +/// +/// This iterator yields elements of type `io::Result`, where an error +/// is reported if there was a problem reading from the underlying stream. +/// The iterator terminates only when the underlying stream reaches `EOF`. +/// +/// This iterator is constructed via the +/// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter) +/// method. +/// +/// The type variable `R` refers to the `io::Read` stream that is being read +/// from. +/// +/// The type variable `S` refers to the representation used for state +/// identifiers. (By default, this is `usize`.) +/// +/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. +#[derive(Debug)] +pub struct StreamFindIter<'a, R, S: StateID> { + it: StreamChunkIter<'a, R, S>, +} + +impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> { + fn new(ac: &'a AhoCorasick, rdr: R) -> StreamFindIter<'a, R, S> { + StreamFindIter { it: StreamChunkIter::new(ac, rdr) } + } +} + +impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> { + type Item = io::Result; + + fn next(&mut self) -> Option> { + loop { + match self.it.next() { + None => return None, + Some(Err(err)) => return Some(Err(err)), + Some(Ok(StreamChunk::NonMatch { .. })) => {} + Some(Ok(StreamChunk::Match { mat, .. })) => { + return Some(Ok(mat)); + } + } + } + } +} + +/// An iterator over chunks in an underlying reader. Each chunk either +/// corresponds to non-matching bytes or matching bytes, but all bytes from +/// the underlying reader are reported in sequence. There may be an arbitrary +/// number of non-matching chunks before seeing a matching chunk. +/// +/// N.B. This does not actually implement Iterator because we need to borrow +/// from the underlying reader. But conceptually, it's still an iterator. +#[derive(Debug)] +struct StreamChunkIter<'a, R, S: StateID> { + /// The AC automaton. + fsm: &'a Imp, + /// State associated with this automaton's prefilter. It is a heuristic + /// for stopping the prefilter if it's deemed ineffective. + prestate: PrefilterState, + /// The source of bytes we read from. + rdr: R, + /// A fixed size buffer. This is what we actually search. There are some + /// invariants around the buffer's size, namely, it must be big enough to + /// contain the longest possible match. + buf: Buffer, + /// The ID of the FSM state we're currently in. + state_id: S, + /// The current position at which to start the next search in `buf`. + search_pos: usize, + /// The absolute position of `search_pos`, where `0` corresponds to the + /// position of the first byte read from `rdr`. + absolute_pos: usize, + /// The ending position of the last StreamChunk that was returned to the + /// caller. This position is used to determine whether we need to emit + /// non-matching bytes before emitting a match. + report_pos: usize, + /// A match that should be reported on the next call. + pending_match: Option, + /// Enabled only when the automaton can match the empty string. When + /// enabled, we need to execute one final search after consuming the + /// reader to find the trailing empty match. + has_empty_match_at_end: bool, +} + +/// A single chunk yielded by the stream chunk iterator. +/// +/// The `'r` lifetime refers to the lifetime of the stream chunk iterator. +#[derive(Debug)] +enum StreamChunk<'r> { + /// A chunk that does not contain any matches. + NonMatch { bytes: &'r [u8] }, + /// A chunk that precisely contains a match. + Match { bytes: &'r [u8], mat: Match }, +} + +impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { + fn new(ac: &'a AhoCorasick, rdr: R) -> StreamChunkIter<'a, R, S> { + assert!( + ac.supports_stream(), + "stream searching is only supported for Standard match semantics" + ); + + let prestate = if ac.imp.use_prefilter() { + PrefilterState::new(ac.max_pattern_len()) + } else { + PrefilterState::disabled() + }; + let buf = Buffer::new(ac.imp.max_pattern_len()); + let state_id = ac.imp.start_state(); + StreamChunkIter { + fsm: &ac.imp, + prestate, + rdr, + buf, + state_id, + absolute_pos: 0, + report_pos: 0, + search_pos: 0, + pending_match: None, + has_empty_match_at_end: ac.is_match(""), + } + } + + fn next(&mut self) -> Option> { + loop { + if let Some(mut mat) = self.pending_match.take() { + let bytes = &self.buf.buffer()[mat.start()..mat.end()]; + self.report_pos = mat.end(); + mat = mat.increment(self.absolute_pos); + return Some(Ok(StreamChunk::Match { bytes, mat })); + } + if self.search_pos >= self.buf.len() { + if let Some(end) = self.unreported() { + let bytes = &self.buf.buffer()[self.report_pos..end]; + self.report_pos = end; + return Some(Ok(StreamChunk::NonMatch { bytes })); + } + if self.buf.len() >= self.buf.min_buffer_len() { + // This is the point at which we roll our buffer, which we + // only do if our buffer has at least the minimum amount of + // bytes in it. Before rolling, we update our various + // positions to be consistent with the buffer after it has + // been rolled. + + self.report_pos -= + self.buf.len() - self.buf.min_buffer_len(); + self.absolute_pos += + self.search_pos - self.buf.min_buffer_len(); + self.search_pos = self.buf.min_buffer_len(); + self.buf.roll(); + } + match self.buf.fill(&mut self.rdr) { + Err(err) => return Some(Err(err)), + Ok(false) => { + // We've hit EOF, but if there are still some + // unreported bytes remaining, return them now. + if self.report_pos < self.buf.len() { + let bytes = &self.buf.buffer()[self.report_pos..]; + self.report_pos = self.buf.len(); + + let chunk = StreamChunk::NonMatch { bytes }; + return Some(Ok(chunk)); + } else { + // We've reported everything, but there might still + // be a match at the very last position. + if !self.has_empty_match_at_end { + return None; + } + // fallthrough for another search to get trailing + // empty matches + self.has_empty_match_at_end = false; + } + } + Ok(true) => {} + } + } + let result = self.fsm.earliest_find_at( + &mut self.prestate, + self.buf.buffer(), + self.search_pos, + &mut self.state_id, + ); + match result { + None => { + self.search_pos = self.buf.len(); + } + Some(mat) => { + self.state_id = self.fsm.start_state(); + if mat.end() == self.search_pos { + // If the automaton can match the empty string and if + // we found an empty match, then we need to forcefully + // move the position. + self.search_pos += 1; + } else { + self.search_pos = mat.end(); + } + self.pending_match = Some(mat.clone()); + if self.report_pos < mat.start() { + let bytes = + &self.buf.buffer()[self.report_pos..mat.start()]; + self.report_pos = mat.start(); + + let chunk = StreamChunk::NonMatch { bytes }; + return Some(Ok(chunk)); + } + } + } + } + } + + fn unreported(&self) -> Option { + let end = self.search_pos.saturating_sub(self.buf.min_buffer_len()); + if self.report_pos < end { + Some(end) + } else { + None + } + } +} + +/// A builder for configuring an Aho-Corasick automaton. +#[derive(Clone, Debug)] +pub struct AhoCorasickBuilder { + nfa_builder: nfa::Builder, + dfa_builder: dfa::Builder, + dfa: bool, +} + +impl Default for AhoCorasickBuilder { + fn default() -> AhoCorasickBuilder { + AhoCorasickBuilder::new() + } +} + +impl AhoCorasickBuilder { + /// Create a new builder for configuring an Aho-Corasick automaton. + /// + /// If you don't need fine grained configuration or aren't sure which knobs + /// to set, try using + /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) + /// instead. + pub fn new() -> AhoCorasickBuilder { + AhoCorasickBuilder { + nfa_builder: nfa::Builder::new(), + dfa_builder: dfa::Builder::new(), + dfa: false, + } + } + + /// Build an Aho-Corasick automaton using the configuration set on this + /// builder. + /// + /// A builder may be reused to create more automatons. + /// + /// This method will use the default for representing internal state + /// identifiers, which is `usize`. This guarantees that building the + /// automaton will succeed and is generally a good default, but can make + /// the size of the automaton 2-8 times bigger than it needs to be, + /// depending on your target platform. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["foo", "bar", "baz"]; + /// let ac = AhoCorasickBuilder::new() + /// .build(patterns); + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// ``` + pub fn build(&self, patterns: I) -> AhoCorasick + where + I: IntoIterator, + P: AsRef<[u8]>, + { + // The builder only returns an error if the chosen state ID + // representation is too small to fit all of the given patterns. In + // this case, since we fix the representation to usize, it will always + // work because it's impossible to overflow usize since the underlying + // storage would OOM long before that happens. + self.build_with_size::(patterns) + .expect("usize state ID type should always work") + } + + /// Build an Aho-Corasick automaton using the configuration set on this + /// builder with a specific state identifier representation. This only has + /// an effect when the `dfa` option is enabled. + /// + /// Generally, the choices for a state identifier representation are + /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default. + /// The advantage of choosing a smaller state identifier representation + /// is that the automaton produced will be smaller. This might be + /// beneficial for just generally using less space, or might even allow it + /// to fit more of the automaton in your CPU's cache, leading to overall + /// better search performance. + /// + /// Unlike the standard `build` method, this can report an error if the + /// state identifier representation cannot support the size of the + /// automaton. + /// + /// Note that the state identifier representation is determined by the + /// `S` type variable. This requires a type hint of some sort, either + /// by specifying the return type or using the turbofish, e.g., + /// `build_with_size::(...)`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder}; + /// + /// # fn example() -> Result<(), ::aho_corasick::Error> { + /// let patterns = &["foo", "bar", "baz"]; + /// let ac: AhoCorasick = AhoCorasickBuilder::new() + /// .build_with_size(patterns)?; + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// Or alternatively, with turbofish: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// # fn example() -> Result<(), ::aho_corasick::Error> { + /// let patterns = &["foo", "bar", "baz"]; + /// let ac = AhoCorasickBuilder::new() + /// .build_with_size::(patterns)?; + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn build_with_size( + &self, + patterns: I, + ) -> Result> + where + S: StateID, + I: IntoIterator, + P: AsRef<[u8]>, + { + let nfa = self.nfa_builder.build(patterns)?; + let match_kind = nfa.match_kind().clone(); + let imp = if self.dfa { + let dfa = self.dfa_builder.build(&nfa)?; + Imp::DFA(dfa) + } else { + Imp::NFA(nfa) + }; + Ok(AhoCorasick { imp, match_kind }) + } + + /// Automatically configure the settings on this builder according to the + /// patterns that will be used to construct the automaton. + /// + /// The idea here is to balance space and time automatically. That is, when + /// searching a small number of patterns, this will attempt to use the + /// fastest possible configuration since the total space required will be + /// small anyway. As the number of patterns grows, this will fall back to + /// slower configurations that use less space. + /// + /// This is guaranteed to never set `match_kind`, but any other option may + /// be overridden. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["foo", "bar", "baz"]; + /// let ac = AhoCorasickBuilder::new() + /// .auto_configure(patterns) + /// .build(patterns); + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// ``` + pub fn auto_configure>( + &mut self, + patterns: &[B], + ) -> &mut AhoCorasickBuilder { + // N.B. Currently we only use the length of `patterns` to make a + // decision here, and could therefore ask for an `ExactSizeIterator` + // instead. But it's conceivable that we might adapt this to look at + // the total number of bytes, which would requires a second pass. + // + // The logic here is fairly rudimentary at the moment, but probably + // OK. The idea here is to use the fastest thing possible for a small + // number of patterns. That is, a DFA with no byte classes, since byte + // classes require an extra indirection for every byte searched. With a + // moderate number of patterns, we still want a DFA, but save on both + // space and compilation time by enabling byte classes. Finally, fall + // back to the slower but smaller NFA. + if patterns.len() <= 100 { + // N.B. Using byte classes can actually be faster by improving + // locality, but this only really applies for multi-megabyte + // automata (i.e., automata that don't fit in your CPU's cache). + self.dfa(true); + } else if patterns.len() <= 5000 { + self.dfa(true); + } + self + } + + /// Set the desired match semantics. + /// + /// The default is `MatchKind::Standard`, which corresponds to the match + /// semantics supported by the standard textbook description of the + /// Aho-Corasick algorithm. Namely, matches are reported as soon as they + /// are found. Moreover, this is the only way to get overlapping matches + /// or do stream searching. + /// + /// The other kinds of match semantics that are supported are + /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former + /// corresponds to the match you would get if you were to try to match + /// each pattern at each position in the haystack in the same order that + /// you give to the automaton. That is, it returns the leftmost match + /// corresponding the earliest pattern given to the automaton. The latter + /// corresponds to finding the longest possible match among all leftmost + /// matches. + /// + /// For more details on match semantics, see the + /// [documentation for `MatchKind`](enum.MatchKind.html). + /// + /// # Examples + /// + /// In these examples, we demonstrate the differences between match + /// semantics for a particular set of patterns in a specific order: + /// `b`, `abc`, `abcd`. + /// + /// Standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); + /// ``` + pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { + self.nfa_builder.match_kind(kind); + self + } + + /// Enable anchored mode, which requires all matches to start at the + /// first position in a haystack. + /// + /// This option is disabled by default. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["foo", "bar"]; + /// let haystack = "foobar"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .anchored(true) + /// .build(patterns); + /// assert_eq!(1, ac.find_iter(haystack).count()); + /// ``` + /// + /// When searching for overlapping matches, all matches that start at + /// the beginning of a haystack will be reported: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["foo", "foofoo"]; + /// let haystack = "foofoo"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .anchored(true) + /// .build(patterns); + /// assert_eq!(2, ac.find_overlapping_iter(haystack).count()); + /// // A non-anchored search would return 3 matches. + /// ``` + pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.nfa_builder.anchored(yes); + self + } + + /// Enable ASCII-aware case insensitive matching. + /// + /// When this option is enabled, searching will be performed without + /// respect to case for ASCII letters (`a-z` and `A-Z`) only. + /// + /// Enabling this option does not change the search algorithm, but it may + /// increase the size of the automaton. + /// + /// **NOTE:** It is unlikely that support for Unicode case folding will + /// be added in the future. The ASCII case works via a simple hack to the + /// underlying automaton, but full Unicode handling requires a fair bit of + /// sophistication. If you do need Unicode handling, you might consider + /// using the [`regex` crate](https://docs.rs/regex) or the lower level + /// [`regex-automata` crate](https://docs.rs/regex-automata). + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["FOO", "bAr", "BaZ"]; + /// let haystack = "foo bar baz"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .ascii_case_insensitive(true) + /// .build(patterns); + /// assert_eq!(3, ac.find_iter(haystack).count()); + /// ``` + pub fn ascii_case_insensitive( + &mut self, + yes: bool, + ) -> &mut AhoCorasickBuilder { + self.nfa_builder.ascii_case_insensitive(yes); + self + } + + /// Set the limit on how many NFA states use a dense representation for + /// their transitions. + /// + /// A dense representation uses more space, but supports faster access to + /// transitions at search time. Thus, this setting permits the control of a + /// space vs time trade off when using the NFA variant of Aho-Corasick. + /// + /// This limit is expressed in terms of the depth of a state, i.e., the + /// number of transitions from the starting state of the NFA. The idea is + /// that most of the time searching will be spent near the starting state + /// of the automaton, so states near the start state should use a dense + /// representation. States further away from the start state would then use + /// a sparse representation, which uses less space but is slower to access + /// transitions at search time. + /// + /// By default, this is set to a low but non-zero number. + /// + /// This setting has no effect if the `dfa` option is enabled. + pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder { + self.nfa_builder.dense_depth(depth); + self + } + + /// Compile the standard Aho-Corasick automaton into a deterministic finite + /// automaton (DFA). + /// + /// When this is disabled (which is the default), then a non-deterministic + /// finite automaton (NFA) is used instead. + /// + /// The main benefit to a DFA is that it can execute searches more quickly + /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the + /// DFA uses more space and can take much longer to build. + /// + /// Enabling this option does not change the time complexity for + /// constructing the Aho-Corasick automaton (which is `O(p)` where + /// `p` is the total number of patterns being compiled). Enabling this + /// option does however reduce the time complexity of non-overlapping + /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the + /// haystack. + /// + /// In general, it's a good idea to enable this if you're searching a + /// small number of fairly short patterns (~1000), or if you want the + /// fastest possible search without regard to compilation time or space + /// usage. + pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.dfa = yes; + self + } + + /// Enable heuristic prefilter optimizations. + /// + /// When enabled, searching will attempt to quickly skip to match + /// candidates using specialized literal search routines. A prefilter + /// cannot always be used, and is generally treated as a heuristic. It + /// can be useful to disable this if the prefilter is observed to be + /// sub-optimal for a particular workload. + /// + /// This is enabled by default. + pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.nfa_builder.prefilter(yes); + self + } + + /// Shrink the size of the transition alphabet by mapping bytes to their + /// equivalence classes. This only has an effect when the `dfa` option is + /// enabled. + /// + /// When enabled, each a DFA will use a map from all possible bytes + /// to their corresponding equivalence class. Each equivalence class + /// represents a set of bytes that does not discriminate between a match + /// and a non-match in the DFA. For example, the patterns `bar` and `baz` + /// have at least five equivalence classes: singleton sets of `b`, `a`, `r` + /// and `z`, and a final set that contains every other byte. + /// + /// The advantage of this map is that the size of the transition table can + /// be reduced drastically from `#states * 256 * sizeof(id)` to + /// `#states * k * sizeof(id)` where `k` is the number of equivalence + /// classes. As a result, total space usage can decrease substantially. + /// Moreover, since a smaller alphabet is used, compilation becomes faster + /// as well. + /// + /// The disadvantage of this map is that every byte searched must be + /// passed through this map before it can be used to determine the next + /// transition. This has a small match time performance cost. However, if + /// the DFA is otherwise very large without byte classes, then using byte + /// classes can greatly improve memory locality and thus lead to better + /// overall performance. + /// + /// This option is enabled by default. + #[deprecated( + since = "0.7.16", + note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" + )] + pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.dfa_builder.byte_classes(yes); + self + } + + /// Premultiply state identifiers in the transition table. This only has + /// an effect when the `dfa` option is enabled. + /// + /// When enabled, state identifiers are premultiplied to point to their + /// corresponding row in the transition table. That is, given the `i`th + /// state, its corresponding premultiplied identifier is `i * k` where `k` + /// is the alphabet size of the automaton. (The alphabet size is at most + /// 256, but is in practice smaller if byte classes is enabled.) + /// + /// When state identifiers are not premultiplied, then the identifier of + /// the `i`th state is `i`. + /// + /// The advantage of premultiplying state identifiers is that is saves a + /// multiplication instruction per byte when searching with a DFA. This has + /// been observed to lead to a 20% performance benefit in micro-benchmarks. + /// + /// The primary disadvantage of premultiplying state identifiers is + /// that they require a larger integer size to represent. For example, + /// if the DFA has 200 states, then its premultiplied form requires 16 + /// bits to represent every possible state identifier, where as its + /// non-premultiplied form only requires 8 bits. + /// + /// This option is enabled by default. + #[deprecated( + since = "0.7.16", + note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" + )] + pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.dfa_builder.premultiply(yes); + self + } +} + +/// A knob for controlling the match semantics of an Aho-Corasick automaton. +/// +/// There are two generally different ways that Aho-Corasick automatons can +/// report matches. The first way is the "standard" approach that results from +/// implementing most textbook explanations of Aho-Corasick. The second way is +/// to report only the leftmost non-overlapping matches. The leftmost approach +/// is in turn split into two different ways of resolving ambiguous matches: +/// leftmost-first and leftmost-longest. +/// +/// The `Standard` match kind is the default and is the only one that supports +/// overlapping matches and stream searching. (Trying to find overlapping +/// or streaming matches using leftmost match semantics will result in a +/// panic.) The `Standard` match kind will report matches as they are seen. +/// When searching for overlapping matches, then all possible matches are +/// reported. When searching for non-overlapping matches, the first match seen +/// is reported. For example, for non-overlapping matches, given the patterns +/// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is +/// reported since it is detected first. The `abcd` match is never reported +/// since it overlaps with the `b` match. +/// +/// In contrast, the leftmost match kind always prefers the leftmost match +/// among all possible matches. Given the same example as above with `abcd` and +/// `b` as patterns and `abcdef` as the subject string, the leftmost match is +/// `abcd` since it begins before the `b` match, even though the `b` match is +/// detected before the `abcd` match. In this case, the `b` match is not +/// reported at all since it overlaps with the `abcd` match. +/// +/// The difference between leftmost-first and leftmost-longest is in how they +/// resolve ambiguous matches when there are multiple leftmost matches to +/// choose from. Leftmost-first always chooses the pattern that was provided +/// earliest, where as leftmost-longest always chooses the longest matching +/// pattern. For example, given the patterns `a` and `ab` and the subject +/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match +/// is `ab`. Conversely, if the patterns were given in reverse order, i.e., +/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches +/// would be `ab`. Stated differently, the leftmost-first match depends on the +/// order in which the patterns were given to the Aho-Corasick automaton. +/// Because of that, when leftmost-first matching is used, if a pattern `A` +/// that appears before a pattern `B` is a prefix of `B`, then it is impossible +/// to ever observe a match of `B`. +/// +/// If you're not sure which match kind to pick, then stick with the standard +/// kind, which is the default. In particular, if you need overlapping or +/// streaming matches, then you _must_ use the standard kind. The leftmost +/// kinds are useful in specific circumstances. For example, leftmost-first can +/// be very useful as a way to implement match priority based on the order of +/// patterns given and leftmost-longest can be useful for dictionary searching +/// such that only the longest matching words are reported. +/// +/// # Relationship with regular expression alternations +/// +/// Understanding match semantics can be a little tricky, and one easy way +/// to conceptualize non-overlapping matches from an Aho-Corasick automaton +/// is to think about them as a simple alternation of literals in a regular +/// expression. For example, let's say we wanted to match the strings +/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It +/// turns out that regular expression engines have two different ways of +/// matching this alternation. The first way, leftmost-longest, is commonly +/// found in POSIX compatible implementations of regular expressions (such as +/// `grep`). The second way, leftmost-first, is commonly found in backtracking +/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's +/// regex engine do not use backtracking, but still implement leftmost-first +/// semantics in an effort to match the behavior of dominant backtracking +/// regex engines such as those found in Perl, Ruby, Python, Javascript and +/// PHP.) +/// +/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex +/// will match `Samwise` because it is the longest possible match, but a +/// Perl-like regex will match `Sam` since it appears earlier in the +/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine +/// will never match `Samwise` since `Sam` will always have higher priority. +/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to +/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is +/// still longest match, but it also appears earlier than `Sam`. +/// +/// The "standard" match semantics of Aho-Corasick generally don't correspond +/// to the match semantics of any large group of regex implementations, so +/// there's no direct analogy that can be made here. Standard match semantics +/// are generally useful for overlapping matches, or if you just want to see +/// matches as they are detected. +/// +/// The main conclusion to draw from this section is that the match semantics +/// can be tweaked to precisely match either Perl-like regex alternations or +/// POSIX regex alternations. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Use standard match semantics, which support overlapping matches. When + /// used with non-overlapping matches, matches are reported as they are + /// seen. + Standard, + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will panic. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will panic. + LeftmostLongest, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +/// The default match kind is `MatchKind::Standard`. +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::Standard + } +} + +impl MatchKind { + fn supports_overlapping(&self) -> bool { + self.is_standard() + } + + fn supports_stream(&self) -> bool { + // TODO: It may be possible to support this. It's hard. + // + // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838 + self.is_standard() + } + + pub(crate) fn is_standard(&self) -> bool { + *self == MatchKind::Standard + } + + pub(crate) fn is_leftmost(&self) -> bool { + *self == MatchKind::LeftmostFirst + || *self == MatchKind::LeftmostLongest + } + + pub(crate) fn is_leftmost_first(&self) -> bool { + *self == MatchKind::LeftmostFirst + } + + /// Convert this match kind into a packed match kind. If this match kind + /// corresponds to standard semantics, then this returns None, since + /// packed searching does not support standard semantics. + pub(crate) fn as_packed(&self) -> Option { + match *self { + MatchKind::Standard => None, + MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst), + MatchKind::LeftmostLongest => { + Some(packed::MatchKind::LeftmostLongest) + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn oibits() { + use std::panic::{RefUnwindSafe, UnwindSafe}; + + fn assert_send() {} + fn assert_sync() {} + fn assert_unwind_safe() {} + + assert_send::(); + assert_sync::(); + assert_unwind_safe::(); + assert_send::(); + assert_sync::(); + assert_unwind_safe::(); + } +} diff --git a/vendor/aho-corasick/src/automaton.rs b/vendor/aho-corasick/src/automaton.rs new file mode 100644 index 0000000000..d88743a2fe --- /dev/null +++ b/vendor/aho-corasick/src/automaton.rs @@ -0,0 +1,573 @@ +use crate::ahocorasick::MatchKind; +use crate::prefilter::{self, Candidate, Prefilter, PrefilterState}; +use crate::state_id::{dead_id, fail_id, StateID}; +use crate::Match; + +// NOTE: This trait essentially started as a copy of the same trait from from +// regex-automata, with some wording changed since we use this trait for +// NFAs in addition to DFAs in this crate. Additionally, we do not export +// this trait. It's only used internally to reduce code duplication. The +// regex-automata crate needs to expose it because its Regex type is generic +// over implementations of this trait. In this crate, we encapsulate everything +// behind the AhoCorasick type. +// +// This trait is a bit of a mess, but it's not quite clear how to fix it. +// Basically, there are several competing concerns: +// +// * We need performance, so everything effectively needs to get monomorphized. +// * There are several variations on searching Aho-Corasick automatons: +// overlapping, standard and leftmost. Overlapping and standard are somewhat +// combined together below, but there is no real way to combine standard with +// leftmost. Namely, leftmost requires continuing a search even after a match +// is found, in order to correctly disambiguate a match. +// * On top of that, *sometimes* callers want to know which state the automaton +// is in after searching. This is principally useful for overlapping and +// stream searches. However, when callers don't care about this, we really +// do not want to be forced to compute it, since it sometimes requires extra +// work. Thus, there are effectively two copies of leftmost searching: one +// for tracking the state ID and one that doesn't. We should ideally do the +// same for standard searching, but my sanity stopped me. + +// SAFETY RATIONALE: Previously, the code below went to some length to remove +// all bounds checks. This generally produced tighter assembly and lead to +// 20-50% improvements in micro-benchmarks on corpora made up of random +// characters. This somewhat makes sense, since the branch predictor is going +// to be at its worse on random text. +// +// However, using the aho-corasick-debug tool and manually benchmarking +// different inputs, the code *with* bounds checks actually wound up being +// slightly faster: +// +// $ cat input +// Sherlock Holmes +// John Watson +// Professor Moriarty +// Irene Adler +// Mary Watson +// +// $ aho-corasick-debug-safe \ +// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa +// pattern read time: 32.824µs +// automaton build time: 444.687µs +// automaton heap usage: 72392 bytes +// match count: 639 +// count time: 1.809961702s +// +// $ aho-corasick-debug-master \ +// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa +// pattern read time: 31.425µs +// automaton build time: 317.434µs +// automaton heap usage: 72392 bytes +// match count: 639 +// count time: 2.059157705s +// +// I was able to reproduce this result on two different machines (an i5 and +// an i7). Therefore, we go the route of safe code for now. + +/// A trait describing the interface of an Aho-Corasick finite state machine. +/// +/// Every automaton has exactly one fail state, one dead state and exactly one +/// start state. Generally, these correspond to the first, second and third +/// states, respectively. The dead state is always treated as a sentinel. That +/// is, no correct Aho-Corasick automaton will ever transition into the fail +/// state. The dead state, however, can be transitioned into, but only when +/// leftmost-first or leftmost-longest match semantics are enabled and only +/// when at least one match has been observed. +/// +/// Every automaton also has one or more match states, such that +/// `Automaton::is_match_state(id)` returns `true` if and only if `id` +/// corresponds to a match state. +pub trait Automaton { + /// The representation used for state identifiers in this automaton. + /// + /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. + type ID: StateID; + + /// The type of matching that should be done. + fn match_kind(&self) -> &MatchKind; + + /// Returns true if and only if this automaton uses anchored searches. + fn anchored(&self) -> bool; + + /// An optional prefilter for quickly skipping to the next candidate match. + /// A prefilter must report at least every match, although it may report + /// positions that do not correspond to a match. That is, it must not allow + /// false negatives, but can allow false positives. + /// + /// Currently, a prefilter only runs when the automaton is in the start + /// state. That is, the position reported by a prefilter should always + /// correspond to the start of a potential match. + fn prefilter(&self) -> Option<&dyn Prefilter>; + + /// Return the identifier of this automaton's start state. + fn start_state(&self) -> Self::ID; + + /// Returns true if and only if the given state identifier refers to a + /// valid state. + fn is_valid(&self, id: Self::ID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a match + /// state. + /// + /// The state ID given must be valid, or else implementors may panic. + fn is_match_state(&self, id: Self::ID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a state + /// that is either the dead state or a match state. + /// + /// Depending on the implementation of the automaton, this routine can + /// be used to save a branch in the core matching loop. Nevertheless, + /// `is_match_state(id) || id == dead_id()` is always a valid + /// implementation. Indeed, this is the default implementation. + /// + /// The state ID given must be valid, or else implementors may panic. + fn is_match_or_dead_state(&self, id: Self::ID) -> bool { + id == dead_id() || self.is_match_state(id) + } + + /// If the given state is a match state, return the match corresponding + /// to the given match index. `end` must be the ending position of the + /// detected match. If no match exists or if `match_index` exceeds the + /// number of matches in this state, then `None` is returned. + /// + /// The state ID given must be valid, or else implementors may panic. + /// + /// If the given state ID is correct and if the `match_index` is less than + /// the number of matches for that state, then this is guaranteed to return + /// a match. + fn get_match( + &self, + id: Self::ID, + match_index: usize, + end: usize, + ) -> Option; + + /// Returns the number of matches for the given state. If the given state + /// is not a match state, then this returns 0. + /// + /// The state ID given must be valid, or else implementors must panic. + fn match_count(&self, id: Self::ID) -> usize; + + /// Given the current state that this automaton is in and the next input + /// byte, this method returns the identifier of the next state. The + /// identifier returned must always be valid and may never correspond to + /// the fail state. The returned identifier may, however, point to the + /// dead state. + /// + /// This is not safe so that implementors may look up the next state + /// without memory safety checks such as bounds checks. As such, callers + /// must ensure that the given identifier corresponds to a valid automaton + /// state. Implementors must, in turn, ensure that this routine is safe for + /// all valid state identifiers and for all possible `u8` values. + fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; + + /// Like next_state, but debug_asserts that the underlying + /// implementation never returns a `fail_id()` for the next state. + fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { + let next = self.next_state(current, input); + // We should never see a transition to the failure state. + debug_assert!( + next != fail_id(), + "automaton should never return fail_id for next state" + ); + next + } + + /// Execute a search using standard match semantics. + /// + /// This can be used even when the automaton was constructed with leftmost + /// match semantics when you want to find the earliest possible match. This + /// can also be used as part of an overlapping search implementation. + /// + /// N.B. This does not report a match if `state_id` is given as a matching + /// state. As such, this should not be used directly. + #[inline(always)] + fn standard_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + if let Some(pre) = self.prefilter() { + self.standard_find_at_imp( + prestate, + Some(pre), + haystack, + at, + state_id, + ) + } else { + self.standard_find_at_imp(prestate, None, haystack, at, state_id) + } + } + + // It's important for this to always be inlined. Namely, its only caller + // is standard_find_at, and the inlining should remove the case analysis + // for prefilter scanning when there is no prefilter available. + #[inline(always)] + fn standard_find_at_imp( + &self, + prestate: &mut PrefilterState, + prefilter: Option<&dyn Prefilter>, + haystack: &[u8], + mut at: usize, + state_id: &mut Self::ID, + ) -> Option { + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && *state_id == self.start_state() + { + let c = prefilter::next(prestate, pre, haystack, at) + .into_option(); + match c { + None => return None, + Some(i) => { + at = i; + } + } + } + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + *state_id = self.next_state_no_fail(*state_id, haystack[at]); + at += 1; + // This routine always quits immediately after seeing a + // match, and since dead states can only come after seeing + // a match, seeing a dead state here is impossible. (Unless + // we have an anchored automaton, in which case, dead states + // are used to stop a search.) + debug_assert!( + *state_id != dead_id() || self.anchored(), + "standard find should never see a dead state" + ); + + if self.is_match_or_dead_state(*state_id) { + return if *state_id == dead_id() { + None + } else { + self.get_match(*state_id, 0, at) + }; + } + } + None + } + + /// Execute a search using leftmost (either first or longest) match + /// semantics. + /// + /// The principle difference between searching with standard semantics and + /// searching with leftmost semantics is that leftmost searching will + /// continue searching even after a match has been found. Once a match + /// is found, the search does not stop until either the haystack has been + /// exhausted or a dead state is observed in the automaton. (Dead states + /// only exist in automatons constructed with leftmost semantics.) That is, + /// we rely on the construction of the automaton to tell us when to quit. + #[inline(never)] + fn leftmost_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + if let Some(pre) = self.prefilter() { + self.leftmost_find_at_imp( + prestate, + Some(pre), + haystack, + at, + state_id, + ) + } else { + self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) + } + } + + // It's important for this to always be inlined. Namely, its only caller + // is leftmost_find_at, and the inlining should remove the case analysis + // for prefilter scanning when there is no prefilter available. + #[inline(always)] + fn leftmost_find_at_imp( + &self, + prestate: &mut PrefilterState, + prefilter: Option<&dyn Prefilter>, + haystack: &[u8], + mut at: usize, + state_id: &mut Self::ID, + ) -> Option { + debug_assert!(self.match_kind().is_leftmost()); + if self.anchored() && at > 0 && *state_id == self.start_state() { + return None; + } + let mut last_match = self.get_match(*state_id, 0, at); + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && *state_id == self.start_state() + { + let c = prefilter::next(prestate, pre, haystack, at) + .into_option(); + match c { + None => return None, + Some(i) => { + at = i; + } + } + } + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + *state_id = self.next_state_no_fail(*state_id, haystack[at]); + at += 1; + if self.is_match_or_dead_state(*state_id) { + if *state_id == dead_id() { + // The only way to enter into a dead state is if a match + // has been found, so we assert as much. This is different + // from normal automata, where you might enter a dead state + // if you know a subsequent match will never be found + // (regardless of whether a match has already been found). + // For Aho-Corasick, it is built so that we can match at + // any position, so the possibility of a match always + // exists. + // + // (Unless we have an anchored automaton, in which case, + // dead states are used to stop a search.) + debug_assert!( + last_match.is_some() || self.anchored(), + "dead state should only be seen after match" + ); + return last_match; + } + last_match = self.get_match(*state_id, 0, at); + } + } + last_match + } + + /// This is like leftmost_find_at, but does not need to track a caller + /// provided state id. In other words, the only output of this routine is a + /// match, if one exists. + /// + /// It is regrettable that we need to effectively copy a chunk of + /// implementation twice, but when we don't need to track the state ID, we + /// can allow the prefilter to report matches immediately without having + /// to re-confirm them with the automaton. The re-confirmation step is + /// necessary in leftmost_find_at because tracing through the automaton is + /// the only way to correctly set the state ID. (Perhaps an alternative + /// would be to keep a map from pattern ID to matching state ID, but that + /// complicates the code and still doesn't permit us to defer to the + /// prefilter entirely when possible.) + /// + /// I did try a few things to avoid the code duplication here, but nothing + /// optimized as well as this approach. (In microbenchmarks, there was + /// about a 25% difference.) + #[inline(never)] + fn leftmost_find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + if let Some(pre) = self.prefilter() { + self.leftmost_find_at_no_state_imp( + prestate, + Some(pre), + haystack, + at, + ) + } else { + self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) + } + } + + // It's important for this to always be inlined. Namely, its only caller + // is leftmost_find_at_no_state, and the inlining should remove the case + // analysis for prefilter scanning when there is no prefilter available. + #[inline(always)] + fn leftmost_find_at_no_state_imp( + &self, + prestate: &mut PrefilterState, + prefilter: Option<&dyn Prefilter>, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(self.match_kind().is_leftmost()); + if self.anchored() && at > 0 { + return None; + } + // If our prefilter handles confirmation of matches 100% of the + // time, and since we don't need to track state IDs, we can avoid + // Aho-Corasick completely. + if let Some(pre) = prefilter { + // We should never have a prefilter during an anchored search. + debug_assert!(!self.anchored()); + if !pre.reports_false_positives() { + return match pre.next_candidate(prestate, haystack, at) { + Candidate::None => None, + Candidate::Match(m) => Some(m), + Candidate::PossibleStartOfMatch(_) => unreachable!(), + }; + } + } + + let mut state_id = self.start_state(); + let mut last_match = self.get_match(state_id, 0, at); + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && state_id == self.start_state() + { + match prefilter::next(prestate, pre, haystack, at) { + Candidate::None => return None, + // Since we aren't tracking a state ID, we can + // quit early once we know we have a match. + Candidate::Match(m) => return Some(m), + Candidate::PossibleStartOfMatch(i) => { + at = i; + } + } + } + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + state_id = self.next_state_no_fail(state_id, haystack[at]); + at += 1; + if self.is_match_or_dead_state(state_id) { + if state_id == dead_id() { + // The only way to enter into a dead state is if a + // match has been found, so we assert as much. This + // is different from normal automata, where you might + // enter a dead state if you know a subsequent match + // will never be found (regardless of whether a match + // has already been found). For Aho-Corasick, it is + // built so that we can match at any position, so the + // possibility of a match always exists. + // + // (Unless we have an anchored automaton, in which + // case, dead states are used to stop a search.) + debug_assert!( + last_match.is_some() || self.anchored(), + "dead state should only be seen after match" + ); + return last_match; + } + last_match = self.get_match(state_id, 0, at); + } + } + last_match + } + + /// Execute an overlapping search. + /// + /// When executing an overlapping match, the previous state ID in addition + /// to the previous match index should be given. If there are more matches + /// at the given state, then the match is reported and the given index is + /// incremented. + #[inline(always)] + fn overlapping_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + match_index: &mut usize, + ) -> Option { + if self.anchored() && at > 0 && *state_id == self.start_state() { + return None; + } + + let match_count = self.match_count(*state_id); + if *match_index < match_count { + // This is guaranteed to return a match since + // match_index < match_count. + let result = self.get_match(*state_id, *match_index, at); + debug_assert!(result.is_some(), "must be a match"); + *match_index += 1; + return result; + } + + *match_index = 0; + match self.standard_find_at(prestate, haystack, at, state_id) { + None => None, + Some(m) => { + *match_index = 1; + Some(m) + } + } + } + + /// Return the earliest match found. This returns as soon as we know that + /// we have a match. As such, this does not necessarily correspond to the + /// leftmost starting match, but rather, the leftmost position at which a + /// match ends. + #[inline(always)] + fn earliest_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + if *state_id == self.start_state() { + if self.anchored() && at > 0 { + return None; + } + if let Some(m) = self.get_match(*state_id, 0, at) { + return Some(m); + } + } + self.standard_find_at(prestate, haystack, at, state_id) + } + + /// A convenience function for finding the next match according to the + /// match semantics of this automaton. For standard match semantics, this + /// finds the earliest match. Otherwise, the leftmost match is found. + #[inline(always)] + fn find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + match *self.match_kind() { + MatchKind::Standard => { + self.earliest_find_at(prestate, haystack, at, state_id) + } + MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { + self.leftmost_find_at(prestate, haystack, at, state_id) + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } + + /// Like find_at, but does not track state identifiers. This permits some + /// optimizations when a prefilter that confirms its own matches is + /// present. + #[inline(always)] + fn find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + match *self.match_kind() { + MatchKind::Standard => { + let mut state = self.start_state(); + self.earliest_find_at(prestate, haystack, at, &mut state) + } + MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { + self.leftmost_find_at_no_state(prestate, haystack, at) + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } +} diff --git a/vendor/aho-corasick/src/buffer.rs b/vendor/aho-corasick/src/buffer.rs new file mode 100644 index 0000000000..e7339eb202 --- /dev/null +++ b/vendor/aho-corasick/src/buffer.rs @@ -0,0 +1,132 @@ +use std::cmp; +use std::io; +use std::ptr; + +/// The default buffer capacity that we use for the stream buffer. +const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB + +/// A fairly simple roll buffer for supporting stream searches. +/// +/// This buffer acts as a temporary place to store a fixed amount of data when +/// reading from a stream. Its central purpose is to allow "rolling" some +/// suffix of the data to the beginning of the buffer before refilling it with +/// more data from the stream. For example, let's say we are trying to match +/// "foobar" on a stream. When we report the match, we'd like to not only +/// report the correct offsets at which the match occurs, but also the matching +/// bytes themselves. So let's say our stream is a file with the following +/// contents: `test test foobar test test`. Now assume that we happen to read +/// the aforementioned file in two chunks: `test test foo` and `bar test test`. +/// Naively, it would not be possible to report a single contiguous `foobar` +/// match, but this roll buffer allows us to do that. Namely, after the second +/// read, the contents of the buffer should be `st foobar test test`, where the +/// search should ultimately resume immediately after `foo`. (The prefix `st ` +/// is included because the roll buffer saves N bytes at the end of the buffer, +/// where N is the maximum possible length of a match.) +/// +/// A lot of the logic for dealing with this is unfortunately split out between +/// this roll buffer and the `StreamChunkIter`. +#[derive(Debug)] +pub struct Buffer { + /// The raw buffer contents. This has a fixed size and never increases. + buf: Vec, + /// The minimum size of the buffer, which is equivalent to the maximum + /// possible length of a match. This corresponds to the amount that we + /// roll + min: usize, + /// The end of the contents of this buffer. + end: usize, +} + +impl Buffer { + /// Create a new buffer for stream searching. The minimum buffer length + /// given should be the size of the maximum possible match length. + pub fn new(min_buffer_len: usize) -> Buffer { + let min = cmp::max(1, min_buffer_len); + // The minimum buffer amount is also the amount that we roll our + // buffer in order to support incremental searching. To this end, + // our actual capacity needs to be at least 1 byte bigger than our + // minimum amount, otherwise we won't have any overlap. In actuality, + // we want our buffer to be a bit bigger than that for performance + // reasons, so we set a lower bound of `8 * min`. + // + // TODO: It would be good to find a way to test the streaming + // implementation with the minimal buffer size. For now, we just + // uncomment out the next line and comment out the subsequent line. + // let capacity = 1 + min; + let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); + Buffer { buf: vec![0; capacity], min, end: 0 } + } + + /// Return the contents of this buffer. + #[inline] + pub fn buffer(&self) -> &[u8] { + &self.buf[..self.end] + } + + /// Return the minimum size of the buffer. The only way a buffer may be + /// smaller than this is if the stream itself contains less than the + /// minimum buffer amount. + #[inline] + pub fn min_buffer_len(&self) -> usize { + self.min + } + + /// Return the total length of the contents in the buffer. + #[inline] + pub fn len(&self) -> usize { + self.end + } + + /// Return all free capacity in this buffer. + fn free_buffer(&mut self) -> &mut [u8] { + &mut self.buf[self.end..] + } + + /// Refill the contents of this buffer by reading as much as possible into + /// this buffer's free capacity. If no more bytes could be read, then this + /// returns false. Otherwise, this reads until it has filled the buffer + /// past the minimum amount. + pub fn fill(&mut self, mut rdr: R) -> io::Result { + let mut readany = false; + loop { + let readlen = rdr.read(self.free_buffer())?; + if readlen == 0 { + return Ok(readany); + } + readany = true; + self.end += readlen; + if self.len() >= self.min { + return Ok(true); + } + } + } + + /// Roll the contents of the buffer so that the suffix of this buffer is + /// moved to the front and all other contents are dropped. The size of the + /// suffix corresponds precisely to the minimum buffer length. + /// + /// This should only be called when the entire contents of this buffer have + /// been searched. + pub fn roll(&mut self) { + let roll_start = self + .end + .checked_sub(self.min) + .expect("buffer capacity should be bigger than minimum amount"); + let roll_len = self.min; + + assert!(roll_start + roll_len <= self.end); + unsafe { + // SAFETY: A buffer contains Copy data, so there's no problem + // moving it around. Safety also depends on our indices being in + // bounds, which they always should be, given the assert above. + // + // TODO: Switch to [T]::copy_within once our MSRV is high enough. + ptr::copy( + self.buf[roll_start..].as_ptr(), + self.buf.as_mut_ptr(), + roll_len, + ); + } + self.end = roll_len; + } +} diff --git a/vendor/aho-corasick/src/byte_frequencies.rs b/vendor/aho-corasick/src/byte_frequencies.rs new file mode 100644 index 0000000000..c313b629db --- /dev/null +++ b/vendor/aho-corasick/src/byte_frequencies.rs @@ -0,0 +1,258 @@ +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/vendor/aho-corasick/src/classes.rs b/vendor/aho-corasick/src/classes.rs new file mode 100644 index 0000000000..f84ae2104e --- /dev/null +++ b/vendor/aho-corasick/src/classes.rs @@ -0,0 +1,238 @@ +use std::fmt; + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in an FSM to reduce the size of the transition table. This can +/// have a particularly large impact not only on the total size of an FSM, but +/// also on compile times. +#[derive(Clone, Copy)] +pub struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + pub fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + pub fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for i in 0..256 { + classes.set(i as u8, i as u8); + } + classes + } + + /// Set the equivalence class for the given byte. + #[inline] + pub fn set(&mut self, byte: u8, class: u8) { + self.0[byte as usize] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub fn get(&self, byte: u8) -> u8 { + // SAFETY: This is safe because all dense transitions have + // exactly 256 elements, so all u8 values are valid indices. + self.0[byte as usize] + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub fn alphabet_len(&self) -> usize { + self.0[255] as usize + 1 + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 256 equivalence classes + /// and each class contains exactly one byte. + #[inline] + pub fn is_singleton(&self) -> bool { + self.alphabet_len() == 256 + } + + /// Returns an iterator over a sequence of representative bytes from each + /// equivalence class. Namely, this yields exactly N items, where N is + /// equivalent to the number of equivalence classes. Each item is an + /// arbitrary byte drawn from each equivalence class. + /// + /// This is useful when one is determinizing an NFA and the NFA's alphabet + /// hasn't been converted to equivalence classes yet. Picking an arbitrary + /// byte from each equivalence class then permits a full exploration of + /// the NFA instead of using every possible byte value. + pub fn representatives(&self) -> ByteClassRepresentatives<'_> { + ByteClassRepresentatives { classes: self, byte: 0, last_class: None } + } + + /// Returns all of the bytes in the given equivalence class. + /// + /// The second element in the tuple indicates the number of elements in + /// the array. + fn elements(&self, equiv: u8) -> ([u8; 256], usize) { + let (mut array, mut len) = ([0; 256], 0); + for b in 0..256 { + if self.get(b as u8) == equiv { + array[len] = b as u8; + len += 1; + } + } + (array, len) + } +} + +impl fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for equiv in 0..self.alphabet_len() { + let (members, len) = self.elements(equiv as u8); + write!(f, " {} => {:?}", equiv, &members[..len])?; + } + write!(f, ")") + } + } +} + +/// An iterator over representative bytes from each equivalence class. +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + byte: usize, + last_class: Option, +} + +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + while self.byte < 256 { + let byte = self.byte as u8; + let class = self.classes.get(byte); + self.byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(byte); + } + } + None + } +} + +/// A byte class builder keeps track of an *approximation* of equivalence +/// classes of bytes during NFA construction. That is, every byte in an +/// equivalence class cannot discriminate between a match and a non-match. +/// +/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w] +/// and [{-\xFF] never discriminate between a match and a non-match, precisely +/// because they never occur in the literals anywhere. +/// +/// Note though that this does not necessarily compute the minimal set of +/// equivalence classes. For example, in the literals above, the byte ranges +/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence +/// classes even though they could be treated a single class. The reason for +/// this is implementation complexity. In the future, we should endeavor to +/// compute the minimal equivalence classes since they can have a rather large +/// impact on the size of the DFA. +/// +/// The representation here is 256 booleans, all initially set to false. Each +/// boolean maps to its corresponding byte based on position. A `true` value +/// indicates the end of an equivalence class, where its corresponding byte +/// and all of the bytes corresponding to all previous contiguous `false` +/// values are in the same equivalence class. +/// +/// This particular representation only permits contiguous ranges of bytes to +/// be in the same equivalence class, which means that we can never discover +/// the true minimal set of equivalence classes. +#[derive(Debug)] +pub struct ByteClassBuilder(Vec); + +impl ByteClassBuilder { + /// Create a new builder of byte classes where all bytes are part of the + /// same equivalence class. + pub fn new() -> ByteClassBuilder { + ByteClassBuilder(vec![false; 256]) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0[start as usize - 1] = true; + } + self.0[end as usize] = true; + } + + /// Build byte classes that map all byte values to their corresponding + /// equivalence class. The last mapping indicates the largest equivalence + /// class identifier (which is never bigger than 255). + pub fn build(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut i = 0; + loop { + classes.set(i as u8, class as u8); + if i >= 255 { + break; + } + if self.0[i] { + class = class.checked_add(1).unwrap(); + } + i += 1; + } + classes + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassBuilder::new(); + set.set_range(b'a', b'z'); + + let classes = set.build(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassBuilder::new(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.build(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassBuilder::new(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.build().alphabet_len(), 256); + } +} diff --git a/vendor/aho-corasick/src/dfa.rs b/vendor/aho-corasick/src/dfa.rs new file mode 100644 index 0000000000..a03a254c19 --- /dev/null +++ b/vendor/aho-corasick/src/dfa.rs @@ -0,0 +1,713 @@ +use std::mem::size_of; + +use crate::ahocorasick::MatchKind; +use crate::automaton::Automaton; +use crate::classes::ByteClasses; +use crate::error::Result; +use crate::nfa::{PatternID, PatternLength, NFA}; +use crate::prefilter::{Prefilter, PrefilterObj, PrefilterState}; +use crate::state_id::{dead_id, fail_id, premultiply_overflow_error, StateID}; +use crate::Match; + +#[derive(Clone, Debug)] +pub enum DFA { + Standard(Standard), + ByteClass(ByteClass), + Premultiplied(Premultiplied), + PremultipliedByteClass(PremultipliedByteClass), +} + +impl DFA { + fn repr(&self) -> &Repr { + match *self { + DFA::Standard(ref dfa) => dfa.repr(), + DFA::ByteClass(ref dfa) => dfa.repr(), + DFA::Premultiplied(ref dfa) => dfa.repr(), + DFA::PremultipliedByteClass(ref dfa) => dfa.repr(), + } + } + + pub fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + pub fn heap_bytes(&self) -> usize { + self.repr().heap_bytes + } + + pub fn max_pattern_len(&self) -> usize { + self.repr().max_pattern_len + } + + pub fn pattern_count(&self) -> usize { + self.repr().pattern_count + } + + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + pub fn start_state(&self) -> S { + self.repr().start_id + } + + #[inline(always)] + pub fn overlapping_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut S, + match_index: &mut usize, + ) -> Option { + match *self { + DFA::Standard(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + DFA::ByteClass(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + DFA::Premultiplied(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + DFA::PremultipliedByteClass(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + } + } + + #[inline(always)] + pub fn earliest_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut S, + ) -> Option { + match *self { + DFA::Standard(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + DFA::ByteClass(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + DFA::Premultiplied(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + DFA::PremultipliedByteClass(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + } + } + + #[inline(always)] + pub fn find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + match *self { + DFA::Standard(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + DFA::ByteClass(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + DFA::Premultiplied(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + DFA::PremultipliedByteClass(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct Standard(Repr); + +impl Standard { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for Standard { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + id.to_usize() < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + self.repr().get_match(id, match_index, end) + } + + fn match_count(&self, id: S) -> usize { + self.repr().match_count(id) + } + + fn next_state(&self, current: S, input: u8) -> S { + let o = current.to_usize() * 256 + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct ByteClass(Repr); + +impl ByteClass { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for ByteClass { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + id.to_usize() < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + self.repr().get_match(id, match_index, end) + } + + fn match_count(&self, id: S) -> usize { + self.repr().match_count(id) + } + + fn next_state(&self, current: S, input: u8) -> S { + let alphabet_len = self.repr().byte_classes.alphabet_len(); + let input = self.repr().byte_classes.get(input); + let o = current.to_usize() * alphabet_len + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct Premultiplied(Repr); + +impl Premultiplied { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for Premultiplied { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + (id.to_usize() / 256) < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + if id > self.repr().max_match { + return None; + } + self.repr() + .matches + .get(id.to_usize() / 256) + .and_then(|m| m.get(match_index)) + .map(|&(id, len)| Match { pattern: id, len, end }) + } + + fn match_count(&self, id: S) -> usize { + let o = id.to_usize() / 256; + self.repr().matches[o].len() + } + + fn next_state(&self, current: S, input: u8) -> S { + let o = current.to_usize() + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct PremultipliedByteClass(Repr); + +impl PremultipliedByteClass { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for PremultipliedByteClass { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + (id.to_usize() / self.repr().alphabet_len()) < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + if id > self.repr().max_match { + return None; + } + self.repr() + .matches + .get(id.to_usize() / self.repr().alphabet_len()) + .and_then(|m| m.get(match_index)) + .map(|&(id, len)| Match { pattern: id, len, end }) + } + + fn match_count(&self, id: S) -> usize { + let o = id.to_usize() / self.repr().alphabet_len(); + self.repr().matches[o].len() + } + + fn next_state(&self, current: S, input: u8) -> S { + let input = self.repr().byte_classes.get(input); + let o = current.to_usize() + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct Repr { + match_kind: MatchKind, + anchored: bool, + premultiplied: bool, + start_id: S, + /// The length, in bytes, of the longest pattern in this automaton. This + /// information is useful for keeping correct buffer sizes when searching + /// on streams. + max_pattern_len: usize, + /// The total number of patterns added to this automaton. This includes + /// patterns that may never match. + pattern_count: usize, + state_count: usize, + max_match: S, + /// The number of bytes of heap used by this NFA's transition table. + heap_bytes: usize, + /// A prefilter for quickly detecting candidate matchs, if pertinent. + prefilter: Option, + byte_classes: ByteClasses, + trans: Vec, + matches: Vec>, +} + +impl Repr { + /// Returns the total alphabet size for this DFA. + /// + /// If byte classes are enabled, then this corresponds to the number of + /// equivalence classes. If they are disabled, then this is always 256. + fn alphabet_len(&self) -> usize { + self.byte_classes.alphabet_len() + } + + /// Returns true only if the given state is a match state. + fn is_match_state(&self, id: S) -> bool { + id <= self.max_match && id > dead_id() + } + + /// Returns true only if the given state is either a dead state or a match + /// state. + fn is_match_or_dead_state(&self, id: S) -> bool { + id <= self.max_match + } + + /// Get the ith match for the given state, where the end position of a + /// match was found at `end`. + /// + /// # Panics + /// + /// The caller must ensure that the given state identifier is valid, + /// otherwise this may panic. The `match_index` need not be valid. That is, + /// if the given state has no matches then this returns `None`. + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + if id > self.max_match { + return None; + } + self.matches + .get(id.to_usize()) + .and_then(|m| m.get(match_index)) + .map(|&(id, len)| Match { pattern: id, len, end }) + } + + /// Return the total number of matches for the given state. + /// + /// # Panics + /// + /// The caller must ensure that the given identifier is valid, or else + /// this panics. + fn match_count(&self, id: S) -> usize { + self.matches[id.to_usize()].len() + } + + /// Get the next state given `from` as the current state and `byte` as the + /// current input byte. + fn next_state(&self, from: S, byte: u8) -> S { + let alphabet_len = self.alphabet_len(); + let byte = self.byte_classes.get(byte); + self.trans[from.to_usize() * alphabet_len + byte as usize] + } + + /// Set the `byte` transition for the `from` state to point to `to`. + fn set_next_state(&mut self, from: S, byte: u8, to: S) { + let alphabet_len = self.alphabet_len(); + let byte = self.byte_classes.get(byte); + self.trans[from.to_usize() * alphabet_len + byte as usize] = to; + } + + /// Swap the given states in place. + fn swap_states(&mut self, id1: S, id2: S) { + assert!(!self.premultiplied, "can't swap states in premultiplied DFA"); + + let o1 = id1.to_usize() * self.alphabet_len(); + let o2 = id2.to_usize() * self.alphabet_len(); + for b in 0..self.alphabet_len() { + self.trans.swap(o1 + b, o2 + b); + } + self.matches.swap(id1.to_usize(), id2.to_usize()); + } + + /// This routine shuffles all match states in this DFA to the beginning + /// of the DFA such that every non-match state appears after every match + /// state. (With one exception: the special fail and dead states remain as + /// the first two states.) + /// + /// The purpose of doing this shuffling is to avoid an extra conditional + /// in the search loop, and in particular, detecting whether a state is a + /// match or not does not need to access any memory. + /// + /// This updates `self.max_match` to point to the last matching state as + /// well as `self.start` if the starting state was moved. + fn shuffle_match_states(&mut self) { + assert!( + !self.premultiplied, + "cannot shuffle match states of premultiplied DFA" + ); + + if self.state_count <= 1 { + return; + } + + let mut first_non_match = self.start_id.to_usize(); + while first_non_match < self.state_count + && self.matches[first_non_match].len() > 0 + { + first_non_match += 1; + } + + let mut swaps: Vec = vec![fail_id(); self.state_count]; + let mut cur = self.state_count - 1; + while cur > first_non_match { + if self.matches[cur].len() > 0 { + self.swap_states( + S::from_usize(cur), + S::from_usize(first_non_match), + ); + swaps[cur] = S::from_usize(first_non_match); + swaps[first_non_match] = S::from_usize(cur); + + first_non_match += 1; + while first_non_match < cur + && self.matches[first_non_match].len() > 0 + { + first_non_match += 1; + } + } + cur -= 1; + } + for id in (0..self.state_count).map(S::from_usize) { + let alphabet_len = self.alphabet_len(); + let offset = id.to_usize() * alphabet_len; + for next in &mut self.trans[offset..offset + alphabet_len] { + if swaps[next.to_usize()] != fail_id() { + *next = swaps[next.to_usize()]; + } + } + } + if swaps[self.start_id.to_usize()] != fail_id() { + self.start_id = swaps[self.start_id.to_usize()]; + } + self.max_match = S::from_usize(first_non_match - 1); + } + + fn premultiply(&mut self) -> Result<()> { + if self.premultiplied || self.state_count <= 1 { + return Ok(()); + } + + let alpha_len = self.alphabet_len(); + premultiply_overflow_error( + S::from_usize(self.state_count - 1), + alpha_len, + )?; + + for id in (2..self.state_count).map(S::from_usize) { + let offset = id.to_usize() * alpha_len; + for next in &mut self.trans[offset..offset + alpha_len] { + if *next == dead_id() { + continue; + } + *next = S::from_usize(next.to_usize() * alpha_len); + } + } + self.premultiplied = true; + self.start_id = S::from_usize(self.start_id.to_usize() * alpha_len); + self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len); + Ok(()) + } + + /// Computes the total amount of heap used by this NFA in bytes. + fn calculate_size(&mut self) { + let mut size = (self.trans.len() * size_of::()) + + (self.matches.len() + * size_of::>()); + for state_matches in &self.matches { + size += + state_matches.len() * size_of::<(PatternID, PatternLength)>(); + } + size += self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()); + self.heap_bytes = size; + } +} + +/// A builder for configuring the determinization of an NFA into a DFA. +#[derive(Clone, Debug)] +pub struct Builder { + premultiply: bool, + byte_classes: bool, +} + +impl Builder { + /// Create a new builder for a DFA. + pub fn new() -> Builder { + Builder { premultiply: true, byte_classes: true } + } + + /// Build a DFA from the given NFA. + /// + /// This returns an error if the state identifiers exceed their + /// representation size. This can only happen when state ids are + /// premultiplied (which is enabled by default). + pub fn build(&self, nfa: &NFA) -> Result> { + let byte_classes = if self.byte_classes { + nfa.byte_classes().clone() + } else { + ByteClasses::singletons() + }; + let alphabet_len = byte_classes.alphabet_len(); + let trans = vec![fail_id(); alphabet_len * nfa.state_len()]; + let matches = vec![vec![]; nfa.state_len()]; + let mut repr = Repr { + match_kind: nfa.match_kind().clone(), + anchored: nfa.anchored(), + premultiplied: false, + start_id: nfa.start_state(), + max_pattern_len: nfa.max_pattern_len(), + pattern_count: nfa.pattern_count(), + state_count: nfa.state_len(), + max_match: fail_id(), + heap_bytes: 0, + prefilter: nfa.prefilter_obj().map(|p| p.clone()), + byte_classes: byte_classes.clone(), + trans, + matches, + }; + for id in (0..nfa.state_len()).map(S::from_usize) { + repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id)); + + let fail = nfa.failure_transition(id); + nfa.iter_all_transitions(&byte_classes, id, |b, mut next| { + if next == fail_id() { + next = nfa_next_state_memoized(nfa, &repr, id, fail, b); + } + repr.set_next_state(id, b, next); + }); + } + repr.shuffle_match_states(); + repr.calculate_size(); + if self.premultiply { + repr.premultiply()?; + if byte_classes.is_singleton() { + Ok(DFA::Premultiplied(Premultiplied(repr))) + } else { + Ok(DFA::PremultipliedByteClass(PremultipliedByteClass(repr))) + } + } else { + if byte_classes.is_singleton() { + Ok(DFA::Standard(Standard(repr))) + } else { + Ok(DFA::ByteClass(ByteClass(repr))) + } + } + } + + /// Whether to use byte classes or in the DFA. + pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { + self.byte_classes = yes; + self + } + + /// Whether to premultiply state identifier in the DFA. + pub fn premultiply(&mut self, yes: bool) -> &mut Builder { + self.premultiply = yes; + self + } +} + +/// This returns the next NFA transition (including resolving failure +/// transitions), except once it sees a state id less than the id of the DFA +/// state that is currently being populated, then we no longer need to follow +/// failure transitions and can instead query the pre-computed state id from +/// the DFA itself. +/// +/// In general, this should only be called when a failure transition is seen. +fn nfa_next_state_memoized( + nfa: &NFA, + dfa: &Repr, + populating: S, + mut current: S, + input: u8, +) -> S { + loop { + if current < populating { + return dfa.next_state(current, input); + } + let next = nfa.next_state(current, input); + if next != fail_id() { + return next; + } + current = nfa.failure_transition(current); + } +} diff --git a/vendor/aho-corasick/src/error.rs b/vendor/aho-corasick/src/error.rs new file mode 100644 index 0000000000..a57a777945 --- /dev/null +++ b/vendor/aho-corasick/src/error.rs @@ -0,0 +1,101 @@ +use std::error; +use std::fmt; +use std::result; + +pub type Result = result::Result; + +/// An error that occurred during the construction of an Aho-Corasick +/// automaton. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +/// The kind of error that occurred. +#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurs when constructing an automaton would require the + /// use of a state ID that overflows the chosen state ID representation. + /// For example, if one is using `u8` for state IDs and builds a DFA with + /// 257 states, then the last state's ID will be `256` which cannot be + /// represented with `u8`. + StateIDOverflow { + /// The maximum possible state ID. + max: usize, + }, + /// An error that occurs when premultiplication of state IDs is requested + /// when constructing an Aho-Corasick DFA, but doing so would overflow the + /// chosen state ID representation. + /// + /// When `max == requested_max`, then the state ID would overflow `usize`. + PremultiplyOverflow { + /// The maximum possible state id. + max: usize, + /// The maximum ID required by premultiplication. + requested_max: usize, + }, +} + +impl Error { + /// Return the kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub(crate) fn state_id_overflow(max: usize) -> Error { + Error { kind: ErrorKind::StateIDOverflow { max } } + } + + pub(crate) fn premultiply_overflow( + max: usize, + requested_max: usize, + ) -> Error { + Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } } + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::StateIDOverflow { .. } => { + "state id representation too small" + } + ErrorKind::PremultiplyOverflow { .. } => { + "state id representation too small for premultiplication" + } + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.kind { + ErrorKind::StateIDOverflow { max } => write!( + f, + "building the automaton failed because it required \ + building more states that can be identified, where the \ + maximum ID for the chosen representation is {}", + max, + ), + ErrorKind::PremultiplyOverflow { max, requested_max } => { + if max == requested_max { + write!( + f, + "premultiplication of states requires the ability to \ + represent a state ID greater than what can fit on \ + this platform's usize, which is {}", + ::std::usize::MAX, + ) + } else { + write!( + f, + "premultiplication of states requires the ability to \ + represent at least a state ID of {}, but the chosen \ + representation only permits a maximum state ID of {}", + requested_max, max, + ) + } + } + } + } +} diff --git a/vendor/aho-corasick/src/lib.rs b/vendor/aho-corasick/src/lib.rs new file mode 100644 index 0000000000..4465a565f6 --- /dev/null +++ b/vendor/aho-corasick/src/lib.rs @@ -0,0 +1,303 @@ +/*! +A library for finding occurrences of many patterns at once. This library +provides multiple pattern search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a fast finite state machine for executing searches in linear time. + +Additionally, this library provides a number of configuration options for +building the automaton that permit controlling the space versus time trade +off. Other features include simple ASCII case insensitive matching, finding +overlapping matches, replacements, searching streams and even searching and +replacing text in streams. + +Finally, unlike all other (known) Aho-Corasick implementations, this one +supports enabling +[leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) +or +[leftmost-longest](enum.MatchKind.html#variant.LeftmostFirst) +match semantics, using a (seemingly) novel alternative construction algorithm. +For more details on what match semantics means, see the +[`MatchKind`](enum.MatchKind.html) +type. + +# Overview + +This section gives a brief overview of the primary types in this crate: + +* [`AhoCorasick`](struct.AhoCorasick.html) is the primary type and represents + an Aho-Corasick automaton. This is the type you use to execute searches. +* [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) can be used to build + an Aho-Corasick automaton, and supports configuring a number of options. +* [`Match`](struct.Match.html) represents a single match reported by an + Aho-Corasick automaton. Each match has two pieces of information: the pattern + that matched and the start and end byte offsets corresponding to the position + in the haystack at which it matched. + +Additionally, the [`packed`](packed/index.html) sub-module contains a lower +level API for using fast vectorized routines for finding a small number of +patterns in a haystack. + +# Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. + +``` +use aho_corasick::AhoCorasick; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + +# Example: case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +``` +use aho_corasick::AhoCorasickBuilder; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + +# Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +``` +use aho_corasick::AhoCorasick; + +# fn example() -> Result<(), ::std::io::Error> { +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. +let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns); +ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +# Ok(()) }; example().unwrap() +``` + +# Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. + +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. For example, here's the standard approach: + +``` +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +``` +use aho_corasick::{AhoCorasickBuilder, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See +[`MatchKind`](enum.MatchKind.html) +for more details. + +# Prefilters + +While an Aho-Corasick automaton can perform admirably when compared to more +naive solutions, it is generally slower than more specialized algorithms that +are accelerated using vector instructions such as SIMD. + +For that reason, this library will internally use a "prefilter" to attempt +to accelerate searches when possible. Currently, this library has several +different algorithms it might use depending on the patterns provided. Once the +number of patterns gets too big, prefilters are no longer used. + +While a prefilter is generally good to have on by default since it works +well in the common case, it can lead to less predictable or even sub-optimal +performance in some cases. For that reason, prefilters can be explicitly +disabled via +[`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter). +*/ + +#![deny(missing_docs)] + +// We can never be truly no_std, but we could be alloc-only some day, so +// require the std feature for now. +#[cfg(not(feature = "std"))] +compile_error!("`std` feature is currently required to build this crate"); + +// #[cfg(doctest)] +// #[macro_use] +// extern crate doc_comment; + +// #[cfg(doctest)] +// doctest!("../README.md"); + +pub use crate::ahocorasick::{ + AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind, + StreamFindIter, +}; +pub use crate::error::{Error, ErrorKind}; +pub use crate::state_id::StateID; + +mod ahocorasick; +mod automaton; +mod buffer; +mod byte_frequencies; +mod classes; +mod dfa; +mod error; +mod nfa; +pub mod packed; +mod prefilter; +mod state_id; +#[cfg(test)] +mod tests; + +/// A representation of a match reported by an Aho-Corasick automaton. +/// +/// A match has two essential pieces of information: the identifier of the +/// pattern that matched, along with the start and end offsets of the match +/// in the haystack. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use aho_corasick::AhoCorasick; +/// +/// let ac = AhoCorasick::new(&[ +/// "foo", "bar", "baz", +/// ]); +/// let mat = ac.find("xxx bar xxx").expect("should have a match"); +/// assert_eq!(1, mat.pattern()); +/// assert_eq!(4, mat.start()); +/// assert_eq!(7, mat.end()); +/// ``` +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern id. + pattern: usize, + /// The length of this match, such that the starting position of the match + /// is `end - len`. + /// + /// We use length here because, other than the pattern id, the only + /// information about each pattern that the automaton stores is its length. + /// So using the length here is just a bit more natural. But it isn't + /// technically required. + len: usize, + /// The end offset of the match, exclusive. + end: usize, +} + +impl Match { + /// Returns the identifier of the pattern that matched. + /// + /// The identifier of a pattern is derived from the position in which it + /// was originally inserted into the corresponding automaton. The first + /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` + /// and so on. + #[inline] + pub fn pattern(&self) -> usize { + self.pattern + } + + /// The starting position of the match. + #[inline] + pub fn start(&self) -> usize { + self.end - self.len + } + + /// The ending position of the match. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// The length, in bytes, of the match. + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this match is empty. That is, when + /// `start() == end()`. + /// + /// An empty match can only be returned when the empty string was among + /// the patterns used to build the Aho-Corasick automaton. + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + #[inline] + fn increment(&self, by: usize) -> Match { + Match { pattern: self.pattern, len: self.len, end: self.end + by } + } + + #[inline] + fn from_span(id: usize, start: usize, end: usize) -> Match { + Match { pattern: id, len: end - start, end } + } +} diff --git a/vendor/aho-corasick/src/nfa.rs b/vendor/aho-corasick/src/nfa.rs new file mode 100644 index 0000000000..05c5cfb8e6 --- /dev/null +++ b/vendor/aho-corasick/src/nfa.rs @@ -0,0 +1,1214 @@ +use std::cmp; +use std::collections::{BTreeSet, VecDeque}; +use std::fmt; +use std::mem::size_of; +use std::ops::{Index, IndexMut}; + +use crate::ahocorasick::MatchKind; +use crate::automaton::Automaton; +use crate::classes::{ByteClassBuilder, ByteClasses}; +use crate::error::Result; +use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj}; +use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID}; +use crate::Match; + +/// The identifier for a pattern, which is simply the position of the pattern +/// in the sequence of patterns given by the caller. +pub type PatternID = usize; + +/// The length of a pattern, in bytes. +pub type PatternLength = usize; + +/// An Aho-Corasick automaton, represented as an NFA. +/// +/// This is the classical formulation of Aho-Corasick, which involves building +/// up a prefix trie of a given set of patterns, and then wiring up failure +/// transitions between states in order to guarantee linear time matching. The +/// standard formulation is, technically, an NFA because of these failure +/// transitions. That is, one can see them as enabling the automaton to be in +/// multiple states at once. Indeed, during search, it is possible to check +/// the transitions on multiple states for a single input byte. +/// +/// This particular implementation not only supports the standard style of +/// matching, but also provides a mode for choosing leftmost-first or +/// leftmost-longest match semantics. When a leftmost mode is chosen, some +/// failure transitions that would otherwise be added are elided. See +/// the documentation of `MatchKind` for more details and examples on how the +/// match semantics may differ. +/// +/// If one wants a DFA, then it is necessary to first build an NFA and convert +/// it into a DFA. Note, however, that because we've constrained ourselves to +/// matching literal patterns, this does not need to use subset construction +/// for determinization. Instead, the DFA has at most a number of states +/// equivalent to the number of NFA states. The only real difference between +/// them is that all failure transitions are followed and pre-computed. This +/// uses much more memory, but also executes searches more quickly. +#[derive(Clone)] +pub struct NFA { + /// The match semantics built into this NFA. + match_kind: MatchKind, + /// The start state id as an index into `states`. + start_id: S, + /// The length, in bytes, of the longest pattern in this automaton. This + /// information is useful for keeping correct buffer sizes when searching + /// on streams. + max_pattern_len: usize, + /// The total number of patterns added to this automaton, including + /// patterns that may never be matched. + pattern_count: usize, + /// The number of bytes of heap used by this NFA's transition table. + heap_bytes: usize, + /// A prefilter for quickly skipping to candidate matches, if pertinent. + prefilter: Option, + /// Whether this automaton anchors all matches to the start of input. + anchored: bool, + /// A set of equivalence classes in terms of bytes. We compute this while + /// building the NFA, but don't use it in the NFA's states. Instead, we + /// use this for building the DFA. We store it on the NFA since it's easy + /// to compute while visiting the patterns. + byte_classes: ByteClasses, + /// A set of states. Each state defines its own transitions, a fail + /// transition and a set of indices corresponding to matches. + /// + /// The first state is always the fail state, which is used only as a + /// sentinel. Namely, in the final NFA, no transition into the fail state + /// exists. (Well, they do, but they aren't followed. Instead, the state's + /// failure transition is followed.) + /// + /// The second state (index 1) is always the dead state. Dead states are + /// in every automaton, but only used when leftmost-{first,longest} match + /// semantics are enabled. Specifically, they instruct search to stop + /// at specific points in order to report the correct match location. In + /// the standard Aho-Corasick construction, there are no transitions to + /// the dead state. + /// + /// The third state (index 2) is generally intended to be the starting or + /// "root" state. + states: Vec>, +} + +impl NFA { + /// Returns the equivalence classes of bytes found while constructing + /// this NFA. + /// + /// Note that the NFA doesn't actually make use of these equivalence + /// classes. Instead, these are useful for building the DFA when desired. + pub fn byte_classes(&self) -> &ByteClasses { + &self.byte_classes + } + + /// Returns a prefilter, if one exists. + pub fn prefilter_obj(&self) -> Option<&PrefilterObj> { + self.prefilter.as_ref() + } + + /// Returns the total number of heap bytes used by this NFA's transition + /// table. + pub fn heap_bytes(&self) -> usize { + self.heap_bytes + + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()) + } + + /// Return the length of the longest pattern in this automaton. + pub fn max_pattern_len(&self) -> usize { + self.max_pattern_len + } + + /// Return the total number of patterns added to this automaton. + pub fn pattern_count(&self) -> usize { + self.pattern_count + } + + /// Returns the total number of states in this NFA. + pub fn state_len(&self) -> usize { + self.states.len() + } + + /// Returns the matches for the given state. + pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] { + &self.states[id.to_usize()].matches + } + + /// Returns an iterator over all transitions in the given state according + /// to the given equivalence classes, including transitions to `fail_id()`. + /// The number of transitions returned is always equivalent to the number + /// of equivalence classes. + pub fn iter_all_transitions( + &self, + byte_classes: &ByteClasses, + id: S, + f: F, + ) { + self.states[id.to_usize()].trans.iter_all(byte_classes, f); + } + + /// Returns the failure transition for the given state. + pub fn failure_transition(&self, id: S) -> S { + self.states[id.to_usize()].fail + } + + /// Returns the next state for the given state and input byte. + /// + /// Note that this does not follow failure transitions. As such, the id + /// returned may be `fail_id`. + pub fn next_state(&self, current: S, input: u8) -> S { + self.states[current.to_usize()].next_state(input) + } + + fn state(&self, id: S) -> &State { + &self.states[id.to_usize()] + } + + fn state_mut(&mut self, id: S) -> &mut State { + &mut self.states[id.to_usize()] + } + + fn start(&self) -> &State { + self.state(self.start_id) + } + + fn start_mut(&mut self) -> &mut State { + let id = self.start_id; + self.state_mut(id) + } + + fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> { + IterTransitionsMut::new(self, id) + } + + fn copy_matches(&mut self, src: S, dst: S) { + let (src, dst) = + get_two_mut(&mut self.states, src.to_usize(), dst.to_usize()); + dst.matches.extend_from_slice(&src.matches); + } + + fn copy_empty_matches(&mut self, dst: S) { + let start_id = self.start_id; + self.copy_matches(start_id, dst); + } + + fn add_dense_state(&mut self, depth: usize) -> Result { + let trans = Transitions::Dense(Dense::new()); + let id = usize_to_state_id(self.states.len())?; + self.states.push(State { + trans, + // Anchored automatons do not have any failure transitions. + fail: if self.anchored { dead_id() } else { self.start_id }, + depth, + matches: vec![], + }); + Ok(id) + } + + fn add_sparse_state(&mut self, depth: usize) -> Result { + let trans = Transitions::Sparse(vec![]); + let id = usize_to_state_id(self.states.len())?; + self.states.push(State { + trans, + // Anchored automatons do not have any failure transitions. + fail: if self.anchored { dead_id() } else { self.start_id }, + depth, + matches: vec![], + }); + Ok(id) + } +} + +impl Automaton for NFA { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.match_kind + } + + fn anchored(&self) -> bool { + self.anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.start_id + } + + fn is_valid(&self, id: S) -> bool { + id.to_usize() < self.states.len() + } + + fn is_match_state(&self, id: S) -> bool { + self.states[id.to_usize()].is_match() + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + let state = match self.states.get(id.to_usize()) { + None => return None, + Some(state) => state, + }; + state.matches.get(match_index).map(|&(id, len)| Match { + pattern: id, + len, + end, + }) + } + + fn match_count(&self, id: S) -> usize { + self.states[id.to_usize()].matches.len() + } + + fn next_state(&self, mut current: S, input: u8) -> S { + // This terminates since: + // + // 1. `State.fail` never points to fail_id(). + // 2. All `State.fail` values point to a state closer to `start`. + // 3. The start state has no transitions to fail_id(). + loop { + let state = &self.states[current.to_usize()]; + let next = state.next_state(input); + if next != fail_id() { + return next; + } + current = state.fail; + } + } +} + +/// A representation of an NFA state for an Aho-Corasick automaton. +/// +/// It contains the transitions to the next state, a failure transition for +/// cases where there exists no other transition for the current input byte, +/// the matches implied by visiting this state (if any) and the depth of this +/// state. The depth of a state is simply the distance from it to the start +/// state in the automaton, where the depth of the start state is 0. +#[derive(Clone, Debug)] +pub struct State { + trans: Transitions, + fail: S, + matches: Vec<(PatternID, PatternLength)>, + // TODO: Strictly speaking, this isn't needed for searching. It's only + // used when building an NFA that supports leftmost match semantics. We + // could drop this from the state and dynamically build a map only when + // computing failure transitions, but it's not clear which is better. + // Benchmark this. + depth: usize, +} + +impl State { + fn heap_bytes(&self) -> usize { + self.trans.heap_bytes() + + (self.matches.len() * size_of::<(PatternID, PatternLength)>()) + } + + fn add_match(&mut self, i: PatternID, len: PatternLength) { + self.matches.push((i, len)); + } + + fn is_match(&self) -> bool { + !self.matches.is_empty() + } + + fn next_state(&self, input: u8) -> S { + self.trans.next_state(input) + } + + fn set_next_state(&mut self, input: u8, next: S) { + self.trans.set_next_state(input, next); + } +} + +/// Represents the transitions for a single dense state. +/// +/// The primary purpose here is to encapsulate index access. Namely, since a +/// dense representation always contains 256 elements, all values of `u8` are +/// valid indices. +#[derive(Clone, Debug)] +struct Dense(Vec); + +impl Dense +where + S: StateID, +{ + fn new() -> Self { + Dense(vec![fail_id(); 256]) + } + + #[inline] + fn len(&self) -> usize { + self.0.len() + } +} + +impl Index for Dense { + type Output = S; + + #[inline] + fn index(&self, i: u8) -> &S { + // SAFETY: This is safe because all dense transitions have + // exactly 256 elements, so all u8 values are valid indices. + &self.0[i as usize] + } +} + +impl IndexMut for Dense { + #[inline] + fn index_mut(&mut self, i: u8) -> &mut S { + // SAFETY: This is safe because all dense transitions have + // exactly 256 elements, so all u8 values are valid indices. + &mut self.0[i as usize] + } +} + +/// A representation of transitions in an NFA. +/// +/// Transitions have either a sparse representation, which is slower for +/// lookups but uses less memory, or a dense representation, which is faster +/// for lookups but uses more memory. In the sparse representation, the absence +/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()` +/// are still explicitly represented. +/// +/// For the NFA, by default, we use a dense representation for transitions for +/// states close to the start state because it's likely these are the states +/// that will be most frequently visited. +#[derive(Clone, Debug)] +enum Transitions { + Sparse(Vec<(u8, S)>), + Dense(Dense), +} + +impl Transitions { + fn heap_bytes(&self) -> usize { + match *self { + Transitions::Sparse(ref sparse) => { + sparse.len() * size_of::<(u8, S)>() + } + Transitions::Dense(ref dense) => dense.len() * size_of::(), + } + } + + fn next_state(&self, input: u8) -> S { + match *self { + Transitions::Sparse(ref sparse) => { + for &(b, id) in sparse { + if b == input { + return id; + } + } + fail_id() + } + Transitions::Dense(ref dense) => dense[input], + } + } + + fn set_next_state(&mut self, input: u8, next: S) { + match *self { + Transitions::Sparse(ref mut sparse) => { + match sparse.binary_search_by_key(&input, |&(b, _)| b) { + Ok(i) => sparse[i] = (input, next), + Err(i) => sparse.insert(i, (input, next)), + } + } + Transitions::Dense(ref mut dense) => { + dense[input] = next; + } + } + } + + /// Iterate over transitions in this state while skipping over transitions + /// to `fail_id()`. + fn iter(&self, mut f: F) { + match *self { + Transitions::Sparse(ref sparse) => { + for &(b, id) in sparse { + f(b, id); + } + } + Transitions::Dense(ref dense) => { + for b in AllBytesIter::new() { + let id = dense[b]; + if id != fail_id() { + f(b, id); + } + } + } + } + } + + /// Iterate over all transitions in this state according to the given + /// equivalence classes, including transitions to `fail_id()`. + fn iter_all(&self, classes: &ByteClasses, mut f: F) { + if classes.is_singleton() { + match *self { + Transitions::Sparse(ref sparse) => { + sparse_iter(sparse, f); + } + Transitions::Dense(ref dense) => { + for b in AllBytesIter::new() { + f(b, dense[b]); + } + } + } + } else { + // In this case, we only want to yield a single byte for each + // equivalence class. + match *self { + Transitions::Sparse(ref sparse) => { + let mut last_class = None; + sparse_iter(sparse, |b, next| { + let class = classes.get(b); + if last_class != Some(class) { + last_class = Some(class); + f(b, next); + } + }) + } + Transitions::Dense(ref dense) => { + for b in classes.representatives() { + f(b, dense[b]); + } + } + } + } + } +} + +/// Iterator over transitions in a state, skipping transitions to `fail_id()`. +/// +/// This abstracts over the representation of NFA transitions, which may be +/// either in a sparse or dense representation. +/// +/// This somewhat idiosyncratically borrows the NFA mutably, so that when one +/// is iterating over transitions, the caller can still mutate the NFA. This +/// is useful when creating failure transitions. +#[derive(Debug)] +struct IterTransitionsMut<'a, S: StateID> { + nfa: &'a mut NFA, + state_id: S, + cur: usize, +} + +impl<'a, S: StateID> IterTransitionsMut<'a, S> { + fn new(nfa: &'a mut NFA, state_id: S) -> IterTransitionsMut<'a, S> { + IterTransitionsMut { nfa, state_id, cur: 0 } + } + + fn nfa(&mut self) -> &mut NFA { + self.nfa + } +} + +impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> { + type Item = (u8, S); + + fn next(&mut self) -> Option<(u8, S)> { + match self.nfa.states[self.state_id.to_usize()].trans { + Transitions::Sparse(ref sparse) => { + if self.cur >= sparse.len() { + return None; + } + let i = self.cur; + self.cur += 1; + Some(sparse[i]) + } + Transitions::Dense(ref dense) => { + while self.cur < dense.len() { + // There are always exactly 255 transitions in dense repr. + debug_assert!(self.cur < 256); + + let b = self.cur as u8; + let id = dense[b]; + self.cur += 1; + if id != fail_id() { + return Some((b, id)); + } + } + None + } + } + } +} + +/// A simple builder for configuring the NFA construction of Aho-Corasick. +#[derive(Clone, Debug)] +pub struct Builder { + dense_depth: usize, + match_kind: MatchKind, + prefilter: bool, + anchored: bool, + ascii_case_insensitive: bool, +} + +impl Default for Builder { + fn default() -> Builder { + Builder { + dense_depth: 2, + match_kind: MatchKind::default(), + prefilter: true, + anchored: false, + ascii_case_insensitive: false, + } + } +} + +impl Builder { + pub fn new() -> Builder { + Builder::default() + } + + pub fn build(&self, patterns: I) -> Result> + where + I: IntoIterator, + P: AsRef<[u8]>, + { + Compiler::new(self)?.compile(patterns) + } + + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { + self.match_kind = kind; + self + } + + pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { + self.dense_depth = depth; + self + } + + pub fn prefilter(&mut self, yes: bool) -> &mut Builder { + self.prefilter = yes; + self + } + + pub fn anchored(&mut self, yes: bool) -> &mut Builder { + self.anchored = yes; + self + } + + pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { + self.ascii_case_insensitive = yes; + self + } +} + +/// A compiler uses a builder configuration and builds up the NFA formulation +/// of an Aho-Corasick automaton. This roughly corresponds to the standard +/// formulation described in textbooks. +#[derive(Debug)] +struct Compiler<'a, S: StateID> { + builder: &'a Builder, + prefilter: prefilter::Builder, + nfa: NFA, + byte_classes: ByteClassBuilder, +} + +impl<'a, S: StateID> Compiler<'a, S> { + fn new(builder: &'a Builder) -> Result> { + Ok(Compiler { + builder, + prefilter: prefilter::Builder::new(builder.match_kind) + .ascii_case_insensitive(builder.ascii_case_insensitive), + nfa: NFA { + match_kind: builder.match_kind, + start_id: usize_to_state_id(2)?, + max_pattern_len: 0, + pattern_count: 0, + heap_bytes: 0, + prefilter: None, + anchored: builder.anchored, + byte_classes: ByteClasses::singletons(), + states: vec![], + }, + byte_classes: ByteClassBuilder::new(), + }) + } + + fn compile(mut self, patterns: I) -> Result> + where + I: IntoIterator, + P: AsRef<[u8]>, + { + self.add_state(0)?; // the fail state, which is never entered + self.add_state(0)?; // the dead state, only used for leftmost + self.add_state(0)?; // the start state + self.build_trie(patterns)?; + self.add_start_state_loop(); + self.add_dead_state_loop(); + if !self.builder.anchored { + self.fill_failure_transitions(); + } + self.close_start_state_loop(); + self.nfa.byte_classes = self.byte_classes.build(); + if !self.builder.anchored { + self.nfa.prefilter = self.prefilter.build(); + } + self.calculate_size(); + Ok(self.nfa) + } + + /// This sets up the initial prefix trie that makes up the Aho-Corasick + /// automaton. Effectively, it creates the basic structure of the + /// automaton, where every pattern given has a path from the start state to + /// the end of the pattern. + fn build_trie(&mut self, patterns: I) -> Result<()> + where + I: IntoIterator, + P: AsRef<[u8]>, + { + 'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() { + let pat = pat.as_ref(); + self.nfa.max_pattern_len = + cmp::max(self.nfa.max_pattern_len, pat.len()); + self.nfa.pattern_count += 1; + + let mut prev = self.nfa.start_id; + let mut saw_match = false; + for (depth, &b) in pat.iter().enumerate() { + // When leftmost-first match semantics are requested, we + // specifically stop adding patterns when a previously added + // pattern is a prefix of it. We avoid adding it because + // leftmost-first semantics imply that the pattern can never + // match. This is not just an optimization to save space! It + // is necessary for correctness. In fact, this is the only + // difference in the automaton between the implementations for + // leftmost-first and leftmost-longest. + saw_match = saw_match || self.nfa.state(prev).is_match(); + if self.builder.match_kind.is_leftmost_first() && saw_match { + // Skip to the next pattern immediately. This avoids + // incorrectly adding a match after this loop terminates. + continue 'PATTERNS; + } + + // Add this byte to our equivalence classes. We don't use these + // for NFA construction. These are instead used only if we're + // building a DFA. They would technically be useful for the + // NFA, but it would require a second pass over the patterns. + self.byte_classes.set_range(b, b); + if self.builder.ascii_case_insensitive { + let b = opposite_ascii_case(b); + self.byte_classes.set_range(b, b); + } + + // If the transition from prev using the current byte already + // exists, then just move through it. Otherwise, add a new + // state. We track the depth here so that we can determine + // how to represent transitions. States near the start state + // use a dense representation that uses more memory but is + // faster. Other states use a sparse representation that uses + // less memory but is slower. + let next = self.nfa.state(prev).next_state(b); + if next != fail_id() { + prev = next; + } else { + let next = self.add_state(depth + 1)?; + self.nfa.state_mut(prev).set_next_state(b, next); + if self.builder.ascii_case_insensitive { + let b = opposite_ascii_case(b); + self.nfa.state_mut(prev).set_next_state(b, next); + } + prev = next; + } + } + // Once the pattern has been added, log the match in the final + // state that it reached. + self.nfa.state_mut(prev).add_match(pati, pat.len()); + // ... and hand it to the prefilter builder, if applicable. + if self.builder.prefilter { + self.prefilter.add(pat); + } + } + Ok(()) + } + + /// This routine creates failure transitions according to the standard + /// textbook formulation of the Aho-Corasick algorithm, with a couple small + /// tweaks to support "leftmost" semantics. + /// + /// Building failure transitions is the most interesting part of building + /// the Aho-Corasick automaton, because they are what allow searches to + /// be performed in linear time. Specifically, a failure transition is + /// a single transition associated with each state that points back to + /// the longest proper suffix of the pattern being searched. The failure + /// transition is followed whenever there exists no transition on the + /// current state for the current input byte. If there is no other proper + /// suffix, then the failure transition points back to the starting state. + /// + /// For example, let's say we built an Aho-Corasick automaton with the + /// following patterns: 'abcd' and 'cef'. The trie looks like this: + /// + /// ```ignore + /// a - S1 - b - S2 - c - S3 - d - S4* + /// / + /// S0 - c - S5 - e - S6 - f - S7* + /// ``` + /// + /// At this point, it should be fairly straight-forward to see how this + /// trie can be used in a simplistic way. At any given position in the + /// text we're searching (called the "subject" string), all we need to do + /// is follow the transitions in the trie by consuming one transition for + /// each byte in the subject string. If we reach a match state, then we can + /// report that location as a match. + /// + /// The trick comes when searching a subject string like 'abcef'. We'll + /// initially follow the transition from S0 to S1 and wind up in S3 after + /// observng the 'c' byte. At this point, the next byte is 'e' but state + /// S3 has no transition for 'e', so the search fails. We then would need + /// to restart the search at the next position in 'abcef', which + /// corresponds to 'b'. The match would fail, but the next search starting + /// at 'c' would finally succeed. The problem with this approach is that + /// we wind up searching the subject string potentially many times. In + /// effect, this makes the algorithm have worst case `O(n * m)` complexity, + /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead + /// like to achieve a `O(n + m)` worst case complexity. + /// + /// This is where failure transitions come in. Instead of dying at S3 in + /// the first search, the automaton can instruct the search to move to + /// another part of the automaton that corresponds to a suffix of what + /// we've seen so far. Recall that we've seen 'abc' in the subject string, + /// and the automaton does indeed have a non-empty suffix, 'c', that could + /// potentially lead to another match. Thus, the actual Aho-Corasick + /// automaton for our patterns in this case looks like this: + /// + /// ```ignore + /// a - S1 - b - S2 - c - S3 - d - S4* + /// / / + /// / ---------------- + /// / / + /// S0 - c - S5 - e - S6 - f - S7* + /// ``` + /// + /// That is, we have a failure transition from S3 to S5, which is followed + /// exactly in cases when we are in state S3 but see any byte other than + /// 'd' (that is, we've "failed" to find a match in this portion of our + /// trie). We know we can transition back to S5 because we've already seen + /// a 'c' byte, so we don't need to re-scan it. We can then pick back up + /// with the search starting at S5 and complete our match. + /// + /// Adding failure transitions to a trie is fairly simple, but subtle. The + /// key issue is that you might have multiple failure transition that you + /// need to follow. For example, look at the trie for the patterns + /// 'abcd', 'b', 'bcd' and 'cd': + /// + /// ```ignore + /// - a - S1 - b - S2* - c - S3 - d - S4* + /// / / / + /// / ------- ------- + /// / / / + /// S0 --- b - S5* - c - S6 - d - S7* + /// \ / + /// \ -------- + /// \ / + /// - c - S8 - d - S9* + /// ``` + /// + /// The failure transitions for this trie are defined from S2 to S5, + /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it + /// corresponds to a match, since its failure transition to S5 is itself + /// a match state. + /// + /// Perhaps simplest way to think about adding these failure transitions + /// is recursively. That is, if you know the failure transitions for every + /// possible previous state that could be visited (e.g., when computing the + /// failure transition for S3, you already know the failure transitions + /// for S0, S1 and S2), then you can simply follow the failure transition + /// of the previous state and check whether the incoming transition is + /// defined after following the failure transition. + /// + /// For example, when determining the failure state for S3, by our + /// assumptions, we already know that there is a failure transition from + /// S2 (the previous state) to S5. So we follow that transition and check + /// whether the transition connecting S2 to S3 is defined. Indeed, it is, + /// as there is a transition from S5 to S6 for the byte 'c'. If no such + /// transition existed, we could keep following the failure transitions + /// until we reach the start state, which is the failure transition for + /// every state that has no corresponding proper suffix. + /// + /// We don't actually use recursion to implement this, but instead, use a + /// breadth first search of the automaton. Our base case is the start + /// state, whose failure transition is just a transition to itself. + /// + /// When building a leftmost automaton, we proceed as above, but only + /// include a subset of failure transitions. Namely, we omit any failure + /// transitions that appear after a match state in the trie. This is + /// because failure transitions always point back to a proper suffix of + /// what has been seen so far. Thus, following a failure transition after + /// a match implies looking for a match that starts after the one that has + /// already been seen, which is of course therefore not the leftmost match. + /// + /// N.B. I came up with this algorithm on my own, and after scouring all of + /// the other AC implementations I know of (Perl, Snort, many on GitHub). + /// I couldn't find any that implement leftmost semantics like this. + /// Perl of course needs leftmost-first semantics, but they implement it + /// with a seeming hack at *search* time instead of encoding it into the + /// automaton. There are also a couple Java libraries that support leftmost + /// longest semantics, but they do it by building a queue of matches at + /// search time, which is even worse than what Perl is doing. ---AG + fn fill_failure_transitions(&mut self) { + let kind = self.match_kind(); + // Initialize the queue for breadth first search with all transitions + // out of the start state. We handle the start state specially because + // we only want to follow non-self transitions. If we followed self + // transitions, then this would never terminate. + let mut queue = VecDeque::new(); + let mut seen = self.queued_set(); + let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id); + while let Some((_, next)) = it.next() { + // Skip anything we've seen before and any self-transitions on the + // start state. + if next == it.nfa().start_id || seen.contains(next) { + continue; + } + queue.push_back(next); + seen.insert(next); + // Under leftmost semantics, if a state immediately following + // the start state is a match state, then we never want to + // follow its failure transition since the failure transition + // necessarily leads back to the start state, which we never + // want to do for leftmost matching after a match has been + // found. + // + // We apply the same logic to non-start states below as well. + if kind.is_leftmost() && it.nfa().state(next).is_match() { + it.nfa().state_mut(next).fail = dead_id(); + } + } + while let Some(id) = queue.pop_front() { + let mut it = self.nfa.iter_transitions_mut(id); + while let Some((b, next)) = it.next() { + if seen.contains(next) { + // The only way to visit a duplicate state in a transition + // list is when ASCII case insensitivity is enabled. In + // this case, we want to skip it since it's redundant work. + // But it would also end up duplicating matches, which + // results in reporting duplicate matches in some cases. + // See the 'acasei010' regression test. + continue; + } + queue.push_back(next); + seen.insert(next); + + // As above for start states, under leftmost semantics, once + // we see a match all subsequent states should have no failure + // transitions because failure transitions always imply looking + // for a match that is a suffix of what has been seen so far + // (where "seen so far" corresponds to the string formed by + // following the transitions from the start state to the + // current state). Under leftmost semantics, we specifically do + // not want to allow this to happen because we always want to + // report the match found at the leftmost position. + // + // The difference between leftmost-first and leftmost-longest + // occurs previously while we build the trie. For + // leftmost-first, we simply omit any entries that would + // otherwise require passing through a match state. + // + // Note that for correctness, the failure transition has to be + // set to the dead state for ALL states following a match, not + // just the match state itself. However, by setting the failure + // transition to the dead state on all match states, the dead + // state will automatically propagate to all subsequent states + // via the failure state computation below. + if kind.is_leftmost() && it.nfa().state(next).is_match() { + it.nfa().state_mut(next).fail = dead_id(); + continue; + } + let mut fail = it.nfa().state(id).fail; + while it.nfa().state(fail).next_state(b) == fail_id() { + fail = it.nfa().state(fail).fail; + } + fail = it.nfa().state(fail).next_state(b); + it.nfa().state_mut(next).fail = fail; + it.nfa().copy_matches(fail, next); + } + // If the start state is a match state, then this automaton can + // match the empty string. This implies all states are match states + // since every position matches the empty string, so copy the + // matches from the start state to every state. Strictly speaking, + // this is only necessary for overlapping matches since each + // non-empty non-start match state needs to report empty matches + // in addition to its own. For the non-overlapping case, such + // states only report the first match, which is never empty since + // it isn't a start state. + if !kind.is_leftmost() { + it.nfa().copy_empty_matches(id); + } + } + } + + /// Returns a set that tracked queued states. + /// + /// This is only necessary when ASCII case insensitivity is enabled, since + /// it is the only way to visit the same state twice. Otherwise, this + /// returns an inert set that nevers adds anything and always reports + /// `false` for every member test. + fn queued_set(&self) -> QueuedSet { + if self.builder.ascii_case_insensitive { + QueuedSet::active() + } else { + QueuedSet::inert() + } + } + + /// Set the failure transitions on the start state to loop back to the + /// start state. This effectively permits the Aho-Corasick automaton to + /// match at any position. This is also required for finding the next + /// state to terminate, namely, finding the next state should never return + /// a fail_id. + /// + /// This must be done after building the initial trie, since trie + /// construction depends on transitions to `fail_id` to determine whether a + /// state already exists or not. + fn add_start_state_loop(&mut self) { + let start_id = self.nfa.start_id; + let start = self.nfa.start_mut(); + for b in AllBytesIter::new() { + if start.next_state(b) == fail_id() { + start.set_next_state(b, start_id); + } + } + } + + /// Remove the start state loop by rewriting any transitions on the start + /// state back to the start state with transitions to the dead state. + /// + /// The loop is only closed when two conditions are met: the start state + /// is a match state and the match kind is leftmost-first or + /// leftmost-longest. (Alternatively, if this is an anchored automaton, + /// then the start state is always closed, regardless of aforementioned + /// conditions.) + /// + /// The reason for this is that under leftmost semantics, a start state + /// that is also a match implies that we should never restart the search + /// process. We allow normal transitions out of the start state, but if + /// none exist, we transition to the dead state, which signals that + /// searching should stop. + fn close_start_state_loop(&mut self) { + if self.builder.anchored + || (self.match_kind().is_leftmost() && self.nfa.start().is_match()) + { + let start_id = self.nfa.start_id; + let start = self.nfa.start_mut(); + for b in AllBytesIter::new() { + if start.next_state(b) == start_id { + start.set_next_state(b, dead_id()); + } + } + } + } + + /// Sets all transitions on the dead state to point back to the dead state. + /// Normally, missing transitions map back to the failure state, but the + /// point of the dead state is to act as a sink that can never be escaped. + fn add_dead_state_loop(&mut self) { + let dead = self.nfa.state_mut(dead_id()); + for b in AllBytesIter::new() { + dead.set_next_state(b, dead_id()); + } + } + + /// Computes the total amount of heap used by this NFA in bytes. + fn calculate_size(&mut self) { + let mut size = 0; + for state in &self.nfa.states { + size += size_of::>() + state.heap_bytes(); + } + self.nfa.heap_bytes = size; + } + + /// Add a new state to the underlying NFA with the given depth. The depth + /// is used to determine how to represent the transitions. + /// + /// If adding the new state would overflow the chosen state ID + /// representation, then this returns an error. + fn add_state(&mut self, depth: usize) -> Result { + if depth < self.builder.dense_depth { + self.nfa.add_dense_state(depth) + } else { + self.nfa.add_sparse_state(depth) + } + } + + /// Returns the match kind configured on the underlying builder. + fn match_kind(&self) -> MatchKind { + self.builder.match_kind + } +} + +/// A set of state identifiers used to avoid revisiting the same state multiple +/// times when filling in failure transitions. +/// +/// This set has an "inert" and an "active" mode. When inert, the set never +/// stores anything and always returns `false` for every member test. This is +/// useful to avoid the performance and memory overhead of maintaining this +/// set when it is not needed. +#[derive(Debug)] +struct QueuedSet { + set: Option>, +} + +impl QueuedSet { + /// Return an inert set that returns `false` for every state ID membership + /// test. + fn inert() -> QueuedSet { + QueuedSet { set: None } + } + + /// Return an active set that tracks state ID membership. + fn active() -> QueuedSet { + QueuedSet { set: Some(BTreeSet::new()) } + } + + /// Inserts the given state ID into this set. (If the set is inert, then + /// this is a no-op.) + fn insert(&mut self, state_id: S) { + if let Some(ref mut set) = self.set { + set.insert(state_id); + } + } + + /// Returns true if and only if the given state ID is in this set. If the + /// set is inert, this always returns false. + fn contains(&self, state_id: S) -> bool { + match self.set { + None => false, + Some(ref set) => set.contains(&state_id), + } + } +} + +/// An iterator over every byte value. +/// +/// We use this instead of (0..256).map(|b| b as u8) because this optimizes +/// better in debug builds. +/// +/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and +/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this +/// once our MSRV is Rust 1.26 or newer. +#[derive(Debug)] +struct AllBytesIter(u16); + +impl AllBytesIter { + fn new() -> AllBytesIter { + AllBytesIter(0) + } +} + +impl Iterator for AllBytesIter { + type Item = u8; + + fn next(&mut self) -> Option { + if self.0 >= 256 { + None + } else { + let b = self.0 as u8; + self.0 += 1; + Some(b) + } + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "NFA(")?; + writeln!(f, "match_kind: {:?}", self.match_kind)?; + writeln!(f, "prefilter: {:?}", self.prefilter)?; + writeln!(f, "{}", "-".repeat(79))?; + for (id, s) in self.states.iter().enumerate() { + let mut trans = vec![]; + s.trans.iter(|byte, next| { + // The start state has a bunch of uninteresting transitions + // back into itself. It's questionable to hide them since they + // are critical to understanding the automaton, but they are + // very noisy without better formatting for contiugous ranges + // to the same state. + if id == self.start_id.to_usize() && next == self.start_id { + return; + } + // Similarly, the dead state has a bunch of uninteresting + // transitions too. + if id == dead_id() { + return; + } + trans.push(format!("{} => {}", escape(byte), next.to_usize())); + }); + writeln!(f, "{:04}: {}", id, trans.join(", "))?; + + let matches: Vec = s + .matches + .iter() + .map(|&(pattern_id, _)| pattern_id.to_string()) + .collect(); + writeln!(f, " matches: {}", matches.join(", "))?; + writeln!(f, " fail: {}", s.fail.to_usize())?; + writeln!(f, " depth: {}", s.depth)?; + } + writeln!(f, "{}", "-".repeat(79))?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// Iterate over all possible byte transitions given a sparse set. +fn sparse_iter(trans: &[(u8, S)], mut f: F) { + let mut byte = 0u16; + for &(b, id) in trans { + while byte < (b as u16) { + f(byte as u8, fail_id()); + byte += 1; + } + f(b, id); + byte += 1; + } + for b in byte..256 { + f(b as u8, fail_id()); + } +} + +/// Safely return two mutable borrows to two different locations in the given +/// slice. +/// +/// This panics if i == j. +fn get_two_mut(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) { + assert!(i != j, "{} must not be equal to {}", i, j); + if i < j { + let (before, after) = xs.split_at_mut(j); + (&mut before[i], &mut after[0]) + } else { + let (before, after) = xs.split_at_mut(i); + (&mut after[0], &mut before[j]) + } +} + +/// Return the given byte as its escaped string form. +fn escape(b: u8) -> String { + use std::ascii; + + String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scratch() { + let nfa: NFA = Builder::new() + .dense_depth(0) + // .match_kind(MatchKind::LeftmostShortest) + // .match_kind(MatchKind::LeftmostLongest) + .match_kind(MatchKind::LeftmostFirst) + // .build(&["abcd", "ce", "b"]) + // .build(&["ab", "bc"]) + // .build(&["b", "bcd", "ce"]) + // .build(&["abc", "bx"]) + // .build(&["abc", "bd", "ab"]) + // .build(&["abcdefghi", "hz", "abcdefgh"]) + // .build(&["abcd", "bce", "b"]) + .build(&["abcdefg", "bcde", "bcdef"]) + .unwrap(); + println!("{:?}", nfa); + } +} diff --git a/vendor/aho-corasick/src/packed/api.rs b/vendor/aho-corasick/src/packed/api.rs new file mode 100644 index 0000000000..51703d0e2b --- /dev/null +++ b/vendor/aho-corasick/src/packed/api.rs @@ -0,0 +1,625 @@ +use std::u16; + +use crate::packed::pattern::Patterns; +use crate::packed::rabinkarp::RabinKarp; +use crate::packed::teddy::{self, Teddy}; +use crate::Match; + +/// This is a limit placed on the total number of patterns we're willing to try +/// and match at once. As more sophisticated algorithms are added, this number +/// may be increased. +const PATTERN_LIMIT: usize = 128; + +/// A knob for controlling the match semantics of a packed multiple string +/// searcher. +/// +/// This differs from the +/// [`MatchKind`](../enum.MatchKind.html) +/// type in the top-level crate module in that it doesn't support +/// "standard" match semantics, and instead only supports leftmost-first or +/// leftmost-longest. Namely, "standard" semantics cannot be easily supported +/// by packed searchers. +/// +/// For more information on the distinction between leftmost-first and +/// leftmost-longest, see the docs on the top-level `MatchKind` type. +/// +/// Unlike the top-level `MatchKind` type, the default match semantics for this +/// type are leftmost-first. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This is the default. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + LeftmostLongest, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::LeftmostFirst + } +} + +/// The configuration for a packed multiple pattern searcher. +/// +/// The configuration is currently limited only to being able to select the +/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the +/// future, more knobs may be made available. +/// +/// A configuration produces a [`packed::Builder`](struct.Builder.html), which +/// in turn can be used to construct a +/// [`packed::Searcher`](struct.Searcher.html) for searching. +/// +/// # Example +/// +/// This example shows how to use leftmost-longest semantics instead of the +/// default (leftmost-first). +/// +/// ``` +/// use aho_corasick::packed::{Config, MatchKind}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Config::new() +/// .match_kind(MatchKind::LeftmostLongest) +/// .builder() +/// .add("foo") +/// .add("foobar") +/// .build()?; +/// let matches: Vec = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![1], matches); +/// # Some(()) } +/// # if cfg!(target_arch = "x86_64") { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + kind: MatchKind, + force: Option, + force_teddy_fat: Option, + force_avx: Option, +} + +/// An internal option for forcing the use of a particular packed algorithm. +/// +/// When an algorithm is forced, if a searcher could not be constructed for it, +/// then no searcher will be returned even if an alternative algorithm would +/// work. +#[derive(Clone, Debug)] +enum ForceAlgorithm { + Teddy, + RabinKarp, +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} + +impl Config { + /// Create a new default configuration. A default configuration uses + /// leftmost-first match semantics. + pub fn new() -> Config { + Config { + kind: MatchKind::LeftmostFirst, + force: None, + force_teddy_fat: None, + force_avx: None, + } + } + + /// Create a packed builder from this configuration. The builder can be + /// used to accumulate patterns and create a + /// [`Searcher`](struct.Searcher.html) + /// from them. + pub fn builder(&self) -> Builder { + Builder::from_config(self.clone()) + } + + /// Set the match semantics for this configuration. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.kind = kind; + self + } + + /// An undocumented method for forcing the use of the Teddy algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_teddy(&mut self, yes: bool) -> &mut Config { + if yes { + self.force = Some(ForceAlgorithm::Teddy); + } else { + self.force = None; + } + self + } + + /// An undocumented method for forcing the use of the Fat Teddy algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_teddy_fat(&mut self, yes: Option) -> &mut Config { + self.force_teddy_fat = yes; + self + } + + /// An undocumented method for forcing the use of SSE (`Some(false)`) or + /// AVX (`Some(true)`) algorithms. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_avx(&mut self, yes: Option) -> &mut Config { + self.force_avx = yes; + self + } + + /// An undocumented method for forcing the use of the Rabin-Karp algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config { + if yes { + self.force = Some(ForceAlgorithm::RabinKarp); + } else { + self.force = None; + } + self + } +} + +/// A builder for constructing a packed searcher from a collection of patterns. +/// +/// # Example +/// +/// This example shows how to use a builder to construct a searcher. By +/// default, leftmost-first match semantics are used. +/// +/// ``` +/// use aho_corasick::packed::{Builder, MatchKind}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Builder::new() +/// .add("foobar") +/// .add("foo") +/// .build()?; +/// let matches: Vec = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![0], matches); +/// # Some(()) } +/// # if cfg!(target_arch = "x86_64") { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + /// The configuration of this builder and subsequent matcher. + config: Config, + /// Set to true if the builder detects that a matcher cannot be built. + inert: bool, + /// The patterns provided by the caller. + patterns: Patterns, +} + +impl Builder { + /// Create a new builder for constructing a multi-pattern searcher. This + /// constructor uses the default configuration. + pub fn new() -> Builder { + Builder::from_config(Config::new()) + } + + fn from_config(config: Config) -> Builder { + Builder { config, inert: false, patterns: Patterns::new() } + } + + /// Build a searcher from the patterns added to this builder so far. + pub fn build(&self) -> Option { + if self.inert || self.patterns.is_empty() { + return None; + } + let mut patterns = self.patterns.clone(); + patterns.set_match_kind(self.config.kind); + let rabinkarp = RabinKarp::new(&patterns); + // Effectively, we only want to return a searcher if we can use Teddy, + // since Teddy is our only fast packed searcher at the moment. + // Rabin-Karp is only used when searching haystacks smaller than what + // Teddy can support. Thus, the only way to get a Rabin-Karp searcher + // is to force it using undocumented APIs (for tests/benchmarks). + let (search_kind, minimum_len) = match self.config.force { + None | Some(ForceAlgorithm::Teddy) => { + let teddy = match self.build_teddy(&patterns) { + None => return None, + Some(teddy) => teddy, + }; + let minimum_len = teddy.minimum_len(); + (SearchKind::Teddy(teddy), minimum_len) + } + Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0), + }; + Some(Searcher { patterns, rabinkarp, search_kind, minimum_len }) + } + + fn build_teddy(&self, patterns: &Patterns) -> Option { + teddy::Builder::new() + .avx(self.config.force_avx) + .fat(self.config.force_teddy_fat) + .build(&patterns) + } + + /// Add the given pattern to this set to match. + /// + /// The order in which patterns are added is significant. Namely, when + /// using leftmost-first match semantics, then when multiple patterns can + /// match at a particular location, the pattern that was added first is + /// used as the match. + /// + /// If the number of patterns added exceeds the amount supported by packed + /// searchers, then the builder will stop accumulating patterns and render + /// itself inert. At this point, constructing a searcher will always return + /// `None`. + pub fn add>(&mut self, pattern: P) -> &mut Builder { + if self.inert { + return self; + } else if self.patterns.len() >= PATTERN_LIMIT { + self.inert = true; + self.patterns.reset(); + return self; + } + // Just in case PATTERN_LIMIT increases beyond u16::MAX. + assert!(self.patterns.len() <= u16::MAX as usize); + + let pattern = pattern.as_ref(); + if pattern.is_empty() { + self.inert = true; + self.patterns.reset(); + return self; + } + self.patterns.add(pattern); + self + } + + /// Add the given iterator of patterns to this set to match. + /// + /// The iterator must yield elements that can be converted into a `&[u8]`. + /// + /// The order in which patterns are added is significant. Namely, when + /// using leftmost-first match semantics, then when multiple patterns can + /// match at a particular location, the pattern that was added first is + /// used as the match. + /// + /// If the number of patterns added exceeds the amount supported by packed + /// searchers, then the builder will stop accumulating patterns and render + /// itself inert. At this point, constructing a searcher will always return + /// `None`. + pub fn extend(&mut self, patterns: I) -> &mut Builder + where + I: IntoIterator, + P: AsRef<[u8]>, + { + for p in patterns { + self.add(p); + } + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +/// A packed searcher for quickly finding occurrences of multiple patterns. +/// +/// If callers need more flexible construction, or if one wants to change the +/// match semantics (either leftmost-first or leftmost-longest), then one can +/// use the [`Config`](struct.Config.html) and/or +/// [`Builder`](struct.Builder.html) types for more fine grained control. +/// +/// # Example +/// +/// This example shows how to create a searcher from an iterator of patterns. +/// By default, leftmost-first match semantics are used. +/// +/// ``` +/// use aho_corasick::packed::{MatchKind, Searcher}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; +/// let matches: Vec = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![0], matches); +/// # Some(()) } +/// # if cfg!(target_arch = "x86_64") { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Searcher { + patterns: Patterns, + rabinkarp: RabinKarp, + search_kind: SearchKind, + minimum_len: usize, +} + +#[derive(Clone, Debug)] +enum SearchKind { + Teddy(Teddy), + RabinKarp, +} + +impl Searcher { + /// A convenience function for constructing a searcher from an iterator + /// of things that can be converted to a `&[u8]`. + /// + /// If a searcher could not be constructed (either because of an + /// unsupported CPU or because there are too many patterns), then `None` + /// is returned. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let matches: Vec = searcher + /// .find_iter("foobar") + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0], matches); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn new(patterns: I) -> Option + where + I: IntoIterator, + P: AsRef<[u8]>, + { + Builder::new().extend(patterns).build() + } + + /// Return the first occurrence of any of the patterns in this searcher, + /// according to its match semantics, in the given haystack. The `Match` + /// returned will include the identifier of the pattern that matched, which + /// corresponds to the index of the pattern (starting from `0`) in which it + /// was added. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let mat = searcher.find("foobar")?; + /// assert_eq!(0, mat.pattern()); + /// assert_eq!(0, mat.start()); + /// assert_eq!(6, mat.end()); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn find>(&self, haystack: B) -> Option { + self.find_at(haystack, 0) + } + + /// Return the first occurrence of any of the patterns in this searcher, + /// according to its match semantics, in the given haystack starting from + /// the given position. + /// + /// The `Match` returned will include the identifier of the pattern that + /// matched, which corresponds to the index of the pattern (starting from + /// `0`) in which it was added. The offsets in the `Match` will be relative + /// to the start of `haystack` (and not `at`). + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let mat = searcher.find_at("foofoobar", 3)?; + /// assert_eq!(0, mat.pattern()); + /// assert_eq!(3, mat.start()); + /// assert_eq!(9, mat.end()); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn find_at>( + &self, + haystack: B, + at: usize, + ) -> Option { + let haystack = haystack.as_ref(); + match self.search_kind { + SearchKind::Teddy(ref teddy) => { + if haystack[at..].len() < teddy.minimum_len() { + return self.slow_at(haystack, at); + } + teddy.find_at(&self.patterns, haystack, at) + } + SearchKind::RabinKarp => { + self.rabinkarp.find_at(&self.patterns, haystack, at) + } + } + } + + /// Return an iterator of non-overlapping occurrences of the patterns in + /// this searcher, according to its match semantics, in the given haystack. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let matches: Vec = searcher + /// .find_iter("foobar fooba foofoo") + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0, 1, 1, 1], matches); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindIter<'a, 'b> { + FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 } + } + + /// Returns the match kind used by this packed searcher. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// // leftmost-first is the default. + /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn match_kind(&self) -> &MatchKind { + self.patterns.match_kind() + } + + /// Returns the minimum length of a haystack that is required in order for + /// packed searching to be effective. + /// + /// In some cases, the underlying packed searcher may not be able to search + /// very short haystacks. When that occurs, the implementation will defer + /// to a slower non-packed searcher (which is still generally faster than + /// Aho-Corasick for a small number of patterns). However, callers may + /// want to avoid ever using the slower variant, which one can do by + /// never passing a haystack shorter than the minimum length returned by + /// this method. + pub fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the approximate total amount of heap used by this searcher, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + self.patterns.heap_bytes() + + self.rabinkarp.heap_bytes() + + self.search_kind.heap_bytes() + } + + /// Use a slow (non-packed) searcher. + /// + /// This is useful when a packed searcher could be constructed, but could + /// not be used to search a specific haystack. For example, if Teddy was + /// built but the haystack is smaller than ~34 bytes, then Teddy might not + /// be able to run. + fn slow_at(&self, haystack: &[u8], at: usize) -> Option { + self.rabinkarp.find_at(&self.patterns, haystack, at) + } +} + +impl SearchKind { + fn heap_bytes(&self) -> usize { + match *self { + SearchKind::Teddy(ref ted) => ted.heap_bytes(), + SearchKind::RabinKarp => 0, + } + } +} + +/// An iterator over non-overlapping matches from a packed searcher. +/// +/// The lifetime `'s` refers to the lifetime of the underlying +/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the +/// lifetime of the haystack being searched. +#[derive(Debug)] +pub struct FindIter<'s, 'h> { + searcher: &'s Searcher, + haystack: &'h [u8], + at: usize, +} + +impl<'s, 'h> Iterator for FindIter<'s, 'h> { + type Item = Match; + + fn next(&mut self) -> Option { + if self.at > self.haystack.len() { + return None; + } + match self.searcher.find_at(&self.haystack, self.at) { + None => None, + Some(c) => { + self.at = c.end; + Some(c) + } + } + } +} diff --git a/vendor/aho-corasick/src/packed/mod.rs b/vendor/aho-corasick/src/packed/mod.rs new file mode 100644 index 0000000000..97c40ff503 --- /dev/null +++ b/vendor/aho-corasick/src/packed/mod.rs @@ -0,0 +1,117 @@ +/*! +A lower level API for packed multiple substring search, principally for a small +number of patterns. + +This sub-module provides vectorized routines for quickly finding matches of a +small number of patterns. In general, users of this crate shouldn't need to +interface with this module directly, as the primary +[`AhoCorasick`](../struct.AhoCorasick.html) +searcher will use these routines automatically as a prefilter when applicable. +However, in some cases, callers may want to bypass the Aho-Corasick machinery +entirely and use this vectorized searcher directly. + +# Overview + +The primary types in this sub-module are: + +* [`Searcher`](struct.Searcher.html) executes the actual search algorithm to + report matches in a haystack. +* [`Builder`](struct.Builder.html) accumulates patterns incrementally and can + construct a `Searcher`. +* [`Config`](struct.Config.html) permits tuning the searcher, and itself will + produce a `Builder` (which can then be used to build a `Searcher`). + Currently, the only tuneable knob are the match semantics, but this may be + expanded in the future. + +# Examples + +This example shows how to create a searcher from an iterator of patterns. +By default, leftmost-first match semantics are used. (See the top-level +[`MatchKind`](../enum.MatchKind.html) type for more details about match +semantics, which apply similarly to packed substring search.) + +``` +use aho_corasick::packed::{MatchKind, Searcher}; + +# fn example() -> Option<()> { +let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; +let matches: Vec = searcher + .find_iter("foobar") + .map(|mat| mat.pattern()) + .collect(); +assert_eq!(vec![0], matches); +# Some(()) } +# if cfg!(target_arch = "x86_64") { +# example().unwrap() +# } else { +# assert!(example().is_none()); +# } +``` + +This example shows how to use [`Config`](struct.Config.html) to change the +match semantics to leftmost-longest: + +``` +use aho_corasick::packed::{Config, MatchKind}; + +# fn example() -> Option<()> { +let searcher = Config::new() + .match_kind(MatchKind::LeftmostLongest) + .builder() + .add("foo") + .add("foobar") + .build()?; +let matches: Vec = searcher + .find_iter("foobar") + .map(|mat| mat.pattern()) + .collect(); +assert_eq!(vec![1], matches); +# Some(()) } +# if cfg!(target_arch = "x86_64") { +# example().unwrap() +# } else { +# assert!(example().is_none()); +# } +``` + +# Packed substring searching + +Packed substring searching refers to the use of SIMD (Single Instruction, +Multiple Data) to accelerate the detection of matches in a haystack. Unlike +conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring +search tend to do better with a small number of patterns, where as Aho-Corasick +generally maintains reasonably consistent performance regardless of the number +of patterns you give it. Because of this, the vectorized searcher in this +sub-module cannot be used as a general purpose searcher, since building the +searcher may fail. However, in exchange, when searching for a small number of +patterns, searching can be quite a bit faster than Aho-Corasick (sometimes by +an order of magnitude). + +The key take away here is that constructing a searcher from a list of patterns +is a fallible operation. While the precise conditions under which building a +searcher can fail is specifically an implementation detail, here are some +common reasons: + +* Too many patterns were given. Typically, the limit is on the order of 100 or + so, but this limit may fluctuate based on available CPU features. +* The available packed algorithms require CPU features that aren't available. + For example, currently, this crate only provides packed algorithms for + `x86_64`. Therefore, constructing a packed searcher on any other target + (e.g., ARM) will always fail. +* Zero patterns were given, or one of the patterns given was empty. Packed + searchers require at least one pattern and that all patterns are non-empty. +* Something else about the nature of the patterns (typically based on + heuristics) suggests that a packed searcher would perform very poorly, so + no searcher is built. +*/ + +pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; + +mod api; +mod pattern; +mod rabinkarp; +mod teddy; +#[cfg(test)] +mod tests; +#[cfg(target_arch = "x86_64")] +mod vector; diff --git a/vendor/aho-corasick/src/packed/pattern.rs b/vendor/aho-corasick/src/packed/pattern.rs new file mode 100644 index 0000000000..f4c6756fcb --- /dev/null +++ b/vendor/aho-corasick/src/packed/pattern.rs @@ -0,0 +1,318 @@ +use std::cmp; +use std::fmt; +use std::mem; +use std::u16; +use std::usize; + +use crate::packed::api::MatchKind; + +/// The type used for representing a pattern identifier. +/// +/// We don't use `usize` here because our packed searchers don't scale to +/// huge numbers of patterns, so we keep things a bit smaller. +pub type PatternID = u16; + +/// A non-empty collection of non-empty patterns to search for. +/// +/// This collection of patterns is what is passed around to both execute +/// searches and to construct the searchers themselves. Namely, this permits +/// searches to avoid copying all of the patterns, and allows us to keep only +/// one copy throughout all packed searchers. +/// +/// Note that this collection is not a set. The same pattern can appear more +/// than once. +#[derive(Clone, Debug)] +pub struct Patterns { + /// The match semantics supported by this collection of patterns. + /// + /// The match semantics determines the order of the iterator over patterns. + /// For leftmost-first, patterns are provided in the same order as were + /// provided by the caller. For leftmost-longest, patterns are provided in + /// descending order of length, with ties broken by the order in which they + /// were provided by the caller. + kind: MatchKind, + /// The collection of patterns, indexed by their identifier. + by_id: Vec>, + /// The order of patterns defined for iteration, given by pattern + /// identifiers. The order of `by_id` and `order` is always the same for + /// leftmost-first semantics, but may be different for leftmost-longest + /// semantics. + order: Vec, + /// The length of the smallest pattern, in bytes. + minimum_len: usize, + /// The largest pattern identifier. This should always be equivalent to + /// the number of patterns minus one in this collection. + max_pattern_id: PatternID, + /// The total number of pattern bytes across the entire collection. This + /// is used for reporting total heap usage in constant time. + total_pattern_bytes: usize, +} + +impl Patterns { + /// Create a new collection of patterns for the given match semantics. The + /// ID of each pattern is the index of the pattern at which it occurs in + /// the `by_id` slice. + /// + /// If any of the patterns in the slice given are empty, then this panics. + /// Similarly, if the number of patterns given is zero, then this also + /// panics. + pub fn new() -> Patterns { + Patterns { + kind: MatchKind::default(), + by_id: vec![], + order: vec![], + minimum_len: usize::MAX, + max_pattern_id: 0, + total_pattern_bytes: 0, + } + } + + /// Add a pattern to this collection. + /// + /// This panics if the pattern given is empty. + pub fn add(&mut self, bytes: &[u8]) { + assert!(!bytes.is_empty()); + assert!(self.by_id.len() <= u16::MAX as usize); + + let id = self.by_id.len() as u16; + self.max_pattern_id = id; + self.order.push(id); + self.by_id.push(bytes.to_vec()); + self.minimum_len = cmp::min(self.minimum_len, bytes.len()); + self.total_pattern_bytes += bytes.len(); + } + + /// Set the match kind semantics for this collection of patterns. + /// + /// If the kind is not set, then the default is leftmost-first. + pub fn set_match_kind(&mut self, kind: MatchKind) { + match kind { + MatchKind::LeftmostFirst => { + self.order.sort(); + } + MatchKind::LeftmostLongest => { + let (order, by_id) = (&mut self.order, &mut self.by_id); + order.sort_by(|&id1, &id2| { + by_id[id1 as usize] + .len() + .cmp(&by_id[id2 as usize].len()) + .reverse() + }); + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } + + /// Return the number of patterns in this collection. + /// + /// This is guaranteed to be greater than zero. + pub fn len(&self) -> usize { + self.by_id.len() + } + + /// Returns true if and only if this collection of patterns is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the approximate total amount of heap used by these patterns, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + self.order.len() * mem::size_of::() + + self.by_id.len() * mem::size_of::>() + + self.total_pattern_bytes + } + + /// Clears all heap memory associated with this collection of patterns and + /// resets all state such that it is a valid empty collection. + pub fn reset(&mut self) { + self.kind = MatchKind::default(); + self.by_id.clear(); + self.order.clear(); + self.minimum_len = usize::MAX; + self.max_pattern_id = 0; + } + + /// Return the maximum pattern identifier in this collection. This can be + /// useful in searchers for ensuring that the collection of patterns they + /// are provided at search time and at build time have the same size. + pub fn max_pattern_id(&self) -> PatternID { + assert_eq!((self.max_pattern_id + 1) as usize, self.len()); + self.max_pattern_id + } + + /// Returns the length, in bytes, of the smallest pattern. + /// + /// This is guaranteed to be at least one. + pub fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the match semantics used by these patterns. + pub fn match_kind(&self) -> &MatchKind { + &self.kind + } + + /// Return the pattern with the given identifier. If such a pattern does + /// not exist, then this panics. + pub fn get(&self, id: PatternID) -> Pattern<'_> { + Pattern(&self.by_id[id as usize]) + } + + /// Return the pattern with the given identifier without performing bounds + /// checks. + /// + /// # Safety + /// + /// Callers must ensure that a pattern with the given identifier exists + /// before using this method. + #[cfg(target_arch = "x86_64")] + pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> { + Pattern(self.by_id.get_unchecked(id as usize)) + } + + /// Return an iterator over all the patterns in this collection, in the + /// order in which they should be matched. + /// + /// Specifically, in a naive multi-pattern matcher, the following is + /// guaranteed to satisfy the match semantics of this collection of + /// patterns: + /// + /// ```ignore + /// for i in 0..haystack.len(): + /// for p in patterns.iter(): + /// if haystack[i..].starts_with(p.bytes()): + /// return Match(p.id(), i, i + p.bytes().len()) + /// ``` + /// + /// Namely, among the patterns in a collection, if they are matched in + /// the order provided by this iterator, then the result is guaranteed + /// to satisfy the correct match semantics. (Either leftmost-first or + /// leftmost-longest.) + pub fn iter(&self) -> PatternIter<'_> { + PatternIter { patterns: self, i: 0 } + } +} + +/// An iterator over the patterns in the `Patterns` collection. +/// +/// The order of the patterns provided by this iterator is consistent with the +/// match semantics of the originating collection of patterns. +/// +/// The lifetime `'p` corresponds to the lifetime of the collection of patterns +/// this is iterating over. +#[derive(Debug)] +pub struct PatternIter<'p> { + patterns: &'p Patterns, + i: usize, +} + +impl<'p> Iterator for PatternIter<'p> { + type Item = (PatternID, Pattern<'p>); + + fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> { + if self.i >= self.patterns.len() { + return None; + } + let id = self.patterns.order[self.i]; + let p = self.patterns.get(id); + self.i += 1; + Some((id, p)) + } +} + +/// A pattern that is used in packed searching. +#[derive(Clone)] +pub struct Pattern<'a>(&'a [u8]); + +impl<'a> fmt::Debug for Pattern<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Pattern") + .field("lit", &String::from_utf8_lossy(&self.0)) + .finish() + } +} + +impl<'p> Pattern<'p> { + /// Returns the length of this pattern, in bytes. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Returns the bytes of this pattern. + pub fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Returns the first `len` low nybbles from this pattern. If this pattern + /// is shorter than `len`, then this panics. + #[cfg(target_arch = "x86_64")] + pub fn low_nybbles(&self, len: usize) -> Vec { + let mut nybs = vec![]; + for &b in self.bytes().iter().take(len) { + nybs.push(b & 0xF); + } + nybs + } + + /// Returns true if this pattern is a prefix of the given bytes. + #[inline(always)] + pub fn is_prefix(&self, bytes: &[u8]) -> bool { + self.len() <= bytes.len() && self.equals(&bytes[..self.len()]) + } + + /// Returns true if and only if this pattern equals the given bytes. + #[inline(always)] + pub fn equals(&self, bytes: &[u8]) -> bool { + // Why not just use memcmp for this? Well, memcmp requires calling out + // to libc, and this routine is called in fairly hot code paths. Other + // than just calling out to libc, it also seems to result in worse + // codegen. By rolling our own memcpy in pure Rust, it seems to appear + // more friendly to the optimizer. + // + // This results in an improvement in just about every benchmark. Some + // smaller than others, but in some cases, up to 30% faster. + + if self.len() != bytes.len() { + return false; + } + if self.len() < 8 { + for (&b1, &b2) in self.bytes().iter().zip(bytes) { + if b1 != b2 { + return false; + } + } + return true; + } + // When we have 8 or more bytes to compare, then proceed in chunks of + // 8 at a time using unaligned loads. + let mut p1 = self.bytes().as_ptr(); + let mut p2 = bytes.as_ptr(); + let p1end = self.bytes()[self.len() - 8..].as_ptr(); + let p2end = bytes[bytes.len() - 8..].as_ptr(); + // SAFETY: Via the conditional above, we know that both `p1` and `p2` + // have the same length, so `p1 < p1end` implies that `p2 < p2end`. + // Thus, derefencing both `p1` and `p2` in the loop below is safe. + // + // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual + // end of of `p1` and `p2`. Thus, the final dereference outside of the + // loop is guaranteed to be valid. + // + // Finally, we needn't worry about 64-bit alignment here, since we + // do unaligned loads. + unsafe { + while p1 < p1end { + let v1 = (p1 as *const u64).read_unaligned(); + let v2 = (p2 as *const u64).read_unaligned(); + if v1 != v2 { + return false; + } + p1 = p1.add(8); + p2 = p2.add(8); + } + let v1 = (p1end as *const u64).read_unaligned(); + let v2 = (p2end as *const u64).read_unaligned(); + v1 == v2 + } + } +} diff --git a/vendor/aho-corasick/src/packed/rabinkarp.rs b/vendor/aho-corasick/src/packed/rabinkarp.rs new file mode 100644 index 0000000000..c081f70437 --- /dev/null +++ b/vendor/aho-corasick/src/packed/rabinkarp.rs @@ -0,0 +1,185 @@ +use std::mem; + +use crate::packed::pattern::{PatternID, Patterns}; +use crate::Match; + +/// The type of the rolling hash used in the Rabin-Karp algorithm. +type Hash = usize; + +/// The number of buckets to store our patterns in. We don't want this to be +/// too big in order to avoid wasting memory, but we don't want it to be too +/// small either to avoid spending too much time confirming literals. +/// +/// The number of buckets MUST be a power of two. Otherwise, determining the +/// bucket from a hash will slow down the code considerably. Using a power +/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` +/// instruction. +const NUM_BUCKETS: usize = 64; + +/// An implementation of the Rabin-Karp algorithm. The main idea of this +/// algorithm is to maintain a rolling hash as it moves through the input, and +/// then check whether that hash corresponds to the same hash for any of the +/// patterns we're looking for. +/// +/// A draw back of naively scaling Rabin-Karp to multiple patterns is that +/// it requires all of the patterns to be the same length, which in turn +/// corresponds to the number of bytes to hash. We adapt this to work for +/// multiple patterns of varying size by fixing the number of bytes to hash +/// to be the length of the smallest pattern. We also split the patterns into +/// several buckets to hopefully make the confirmation step faster. +/// +/// Wikipedia has a decent explanation, if a bit heavy on the theory: +/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm +/// +/// But ESMAJ provides something a bit more concrete: +/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html +#[derive(Clone, Debug)] +pub struct RabinKarp { + /// The order of patterns in each bucket is significant. Namely, they are + /// arranged such that the first one to match is the correct match. This + /// may not necessarily correspond to the order provided by the caller. + /// For example, if leftmost-longest semantics are used, then the patterns + /// are sorted by their length in descending order. If leftmost-first + /// semantics are used, then the patterns are sorted by their pattern ID + /// in ascending order (which corresponds to the caller's order). + buckets: Vec>, + /// The length of the hashing window. Generally, this corresponds to the + /// length of the smallest pattern. + hash_len: usize, + /// The factor to subtract out of a hash before updating it with a new + /// byte. + hash_2pow: usize, + /// The maximum identifier of a pattern. This is used as a sanity check + /// to ensure that the patterns provided by the caller are the same as + /// the patterns that were used to compile the matcher. This sanity check + /// possibly permits safely eliminating bounds checks regardless of what + /// patterns are provided by the caller. + /// + /// (Currently, we don't use this to elide bounds checks since it doesn't + /// result in a measurable performance improvement, but we do use it for + /// better failure modes.) + max_pattern_id: PatternID, +} + +impl RabinKarp { + /// Compile a new Rabin-Karp matcher from the patterns given. + /// + /// This panics if any of the patterns in the collection are empty, or if + /// the collection is itself empty. + pub fn new(patterns: &Patterns) -> RabinKarp { + assert!(patterns.len() >= 1); + let hash_len = patterns.minimum_len(); + assert!(hash_len >= 1); + + let mut hash_2pow = 1usize; + for _ in 1..hash_len { + hash_2pow = hash_2pow.wrapping_shl(1); + } + + let mut rk = RabinKarp { + buckets: vec![vec![]; NUM_BUCKETS], + hash_len, + hash_2pow, + max_pattern_id: patterns.max_pattern_id(), + }; + for (id, pat) in patterns.iter() { + let hash = rk.hash(&pat.bytes()[..rk.hash_len]); + let bucket = hash % NUM_BUCKETS; + rk.buckets[bucket].push((hash, id)); + } + rk + } + + /// Return the first matching pattern in the given haystack, begining the + /// search at `at`. + pub fn find_at( + &self, + patterns: &Patterns, + haystack: &[u8], + mut at: usize, + ) -> Option { + assert_eq!(NUM_BUCKETS, self.buckets.len()); + assert_eq!( + self.max_pattern_id, + patterns.max_pattern_id(), + "Rabin-Karp must be called with same patterns it was built with", + ); + + if at + self.hash_len > haystack.len() { + return None; + } + let mut hash = self.hash(&haystack[at..at + self.hash_len]); + loop { + let bucket = &self.buckets[hash % NUM_BUCKETS]; + for &(phash, pid) in bucket { + if phash == hash { + if let Some(c) = self.verify(patterns, pid, haystack, at) { + return Some(c); + } + } + } + if at + self.hash_len >= haystack.len() { + return None; + } + hash = self.update_hash( + hash, + haystack[at], + haystack[at + self.hash_len], + ); + at += 1; + } + } + + /// Returns the approximate total amount of heap used by this searcher, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + let num_patterns = self.max_pattern_id as usize + 1; + self.buckets.len() * mem::size_of::>() + + num_patterns * mem::size_of::<(Hash, PatternID)>() + } + + /// Verify whether the pattern with the given id matches at + /// `haystack[at..]`. + /// + /// We tag this function as `cold` because it helps improve codegen. + /// Intuitively, it would seem like inlining it would be better. However, + /// the only time this is called and a match is not found is when there + /// there is a hash collision, or when a prefix of a pattern matches but + /// the entire pattern doesn't match. This is hopefully fairly rare, and + /// if it does occur a lot, it's going to be slow no matter what we do. + #[cold] + fn verify( + &self, + patterns: &Patterns, + id: PatternID, + haystack: &[u8], + at: usize, + ) -> Option { + let pat = patterns.get(id); + if pat.is_prefix(&haystack[at..]) { + Some(Match::from_span(id as usize, at, at + pat.len())) + } else { + None + } + } + + /// Hash the given bytes. + fn hash(&self, bytes: &[u8]) -> Hash { + assert_eq!(self.hash_len, bytes.len()); + + let mut hash = 0usize; + for &b in bytes { + hash = hash.wrapping_shl(1).wrapping_add(b as usize); + } + hash + } + + /// Update the hash given based on removing `old_byte` at the beginning + /// of some byte string, and appending `new_byte` to the end of that same + /// byte string. + fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { + prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) + .wrapping_shl(1) + .wrapping_add(new_byte as usize) + } +} diff --git a/vendor/aho-corasick/src/packed/teddy/README.md b/vendor/aho-corasick/src/packed/teddy/README.md new file mode 100644 index 0000000000..51b999b869 --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/README.md @@ -0,0 +1,386 @@ +Teddy is a SIMD accelerated multiple substring matching algorithm. The name +and the core ideas in the algorithm were learned from the [Hyperscan][1_u] +project. The implementation in this repository was mostly motivated for use in +accelerating regex searches by searching for small sets of required literals +extracted from the regex. + + +# Background + +The key idea of Teddy is to do *packed* substring matching. In the literature, +packed substring matching is the idea of examining multiple bytes in a haystack +at a time to detect matches. Implementations of, for example, memchr (which +detects matches of a single byte) have been doing this for years. Only +recently, with the introduction of various SIMD instructions, has this been +extended to substring matching. The PCMPESTRI instruction (and its relatives), +for example, implements substring matching in hardware. It is, however, limited +to substrings of length 16 bytes or fewer, but this restriction is fine in a +regex engine, since we rarely care about the performance difference between +searching for a 16 byte literal and a 16 + N literal; 16 is already long +enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs +at least, is its latency and throughput. As a result, it is often faster to +do substring search with a Boyer-Moore (or Two-Way) variant and a well placed +memchr to quickly skip through the haystack. + +There are fewer results from the literature on packed substring matching, +and even fewer for packed multiple substring matching. Ben-Kiki et al. [2] +describes use of PCMPESTRI for substring matching, but is mostly theoretical +and hand-waves performance. There is other theoretical work done by Bille [3] +as well. + +The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci +and is generally focused on multiple pattern search. Their first paper [4a] +introduces the concept of a fingerprint, which is computed for every block of +N bytes in every pattern. The haystack is then scanned N bytes at a time and +a fingerprint is computed in the same way it was computed for blocks in the +patterns. If the fingerprint corresponds to one that was found in a pattern, +then a verification step follows to confirm that one of the substrings with the +corresponding fingerprint actually matches at the current location. Various +implementation tricks are employed to make sure the fingerprint lookup is fast; +typically by truncating the fingerprint. (This may, of course, provoke more +steps in the verification process, so a balance must be struck.) + +The main downside of [4a] is that the minimum substring length is 32 bytes, +presumably because of how the algorithm uses certain SIMD instructions. This +essentially makes it useless for general purpose regex matching, where a small +number of short patterns is far more likely. + +Faro and Kulekci published another paper [4b] that is conceptually very similar +to [4a]. The key difference is that it uses the CRC32 instruction (introduced +as part of SSE 4.2) to compute fingerprint values. This also enables the +algorithm to work effectively on substrings as short as 7 bytes with 4 byte +windows. 7 bytes is unfortunately still too long. The window could be +technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the +small window size ends up negating most performance benefits—and it's likely +the common case in a general purpose regex engine. + +Faro and Kulekci also published [4c] that appears to be intended as a +replacement to using PCMPESTRI. In particular, it is specifically motivated by +the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD +instructions that are faster. While this approach works for short substrings, +I personally couldn't see a way to generalize it to multiple substring search. + +Faro and Kulekci have another paper [4d] that I haven't been able to read +because it is behind a paywall. + + +# Teddy + +Finally, we get to Teddy. If the above literature review is complete, then it +appears that Teddy is a novel algorithm. More than that, in my experience, it +completely blows away the competition for short substrings, which is exactly +what we want in a general purpose regex engine. Again, the algorithm appears +to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced +late 2015, and no earlier history could be found. Therefore, tracking the exact +provenance of the algorithm with respect to the published literature seems +difficult. + +At a high level, Teddy works somewhat similarly to the fingerprint algorithms +published by Faro and Kulekci, but Teddy does it in a way that scales a bit +better. Namely: + +1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX) + byte chunks. 16 (or 32) is significant because it corresponds to the number + of bytes in a SIMD vector. +2. Bitwise operations are performed on each chunk to discover if any region of + it matches a set of precomputed fingerprints from the patterns. If there are + matches, then a verification step is performed. In this implementation, our + verification step is naive. This can be improved upon. + +The details to make this work are quite clever. First, we must choose how to +pick our fingerprints. In Hyperscan's implementation, I *believe* they use the +last N bytes of each substring, where N must be at least the minimum length of +any substring in the set being searched. In this implementation, we use the +first N bytes of each substring. (The tradeoffs between these choices aren't +yet clear to me.) We then must figure out how to quickly test whether an +occurrence of any fingerprint from the set of patterns appears in a 16 byte +block from the haystack. To keep things simple, let's assume N = 1 and examine +some examples to motivate the approach. Here are our patterns: + +```ignore +foo +bar +baz +``` + +The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set +our 16 byte block to: + +```ignore +bat cat foo bump +xxxxxxxxxxxxxxxx +``` + +To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates +a mask that allows us to quickly compute membership of a fingerprint in a 16 +byte block that also tells which pattern the fingerprint corresponds to. In +this case, our fingerprint is a single byte, so an appropriate abstraction is +a map from a single byte to a list of patterns that contain that fingerprint: + +```ignore +f |--> foo +b |--> bar, baz +``` + +Now, all we need to do is figure out how to represent this map in vector space +and use normal SIMD operations to perform a lookup. The first simplification +we can make is to represent our patterns as bit fields occupying a single +byte. This is important, because a single SIMD vector can store 16 bytes. + +```ignore +f |--> 00000001 +b |--> 00000010, 00000100 +``` + +How do we perform lookup though? It turns out that SSSE3 introduced a very cool +instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, +and returns a third vector `C`. All vectors are treated as 16 8-bit integers. +`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true +for the purposes of this algorithm. For full details, see [Intel's Intrinsics +Guide][5_u].) This essentially lets us use the values in `B` to lookup values +in `A`. + +If we could somehow cause `B` to contain our 16 byte block from the haystack, +and if `A` could contain our bitmasks, then we'd end up with something like +this for `A`: + +```ignore + 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF +A = 0 0 00000110 00000001 0 +``` + +And if `B` contains our window from our haystack, we could use shuffle to take +the values from `B` and use them to look up our bitsets in `A`. But of course, +we can't do this because `A` in the above example contains 256 bytes, which +is much larger than the size of a SIMD vector. + +Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of +our bitsets, we can use two masks, where one mask corresponds to the lower four +bits of our fingerprint and the other mask corresponds to the upper four bits. +So our map now looks like: + +```ignore +'f' & 0xF = 0x6 |--> 00000001 +'f' >> 4 = 0x6 |--> 00000111 +'b' & 0xF = 0x2 |--> 00000110 +'b' >> 4 = 0x6 |--> 00000111 +``` + +Notice that the bitsets for each nybble correspond to the union of all +fingerprints that contain that nybble. For example, both `f` and `b` have the +same upper 4 bits but differ on the lower 4 bits. Putting this together, we +have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is +our mask for the upper nybble and `B` is our 16 byte block from the haystack: + +```ignore + 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF +A0 = 0 0 00000110 0 00000001 0 +A1 = 0 0 0 0 00000111 0 +B = b a t _ t p +B = 0x62 0x61 0x74 0x20 0x74 0x70 +``` + +But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, +and we need indexes that are at most 4 bits (corresponding to one of 16 +values). We can apply the same transformation to split `B` into lower and upper +nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and +`B1` corresponds to the upper nybbles: + +```ignore + b a t _ c a t _ f o o _ b u m p +B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 +B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 +``` + +And now we have a nice correspondence. `B0` can index `A0` and `B1` can index +`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: + +```ignore + b a ... f o ... p + A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] +C0 = 00000110 0 00000001 0 0 +``` + +And `C1 = PSHUFB(A1, B1)`: + +```ignore + b a ... f o ... p + A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] +C1 = 00000111 00000111 00000111 00000111 0 +``` + +Notice how neither one of `C0` or `C1` is guaranteed to report fully correct +results all on its own. For example, `C1` claims that `b` is a fingerprint for +the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint +for all of our patterns. But if we combined `C0` and `C1` with an `AND` +operation: + +```ignore + b a ... f o ... p +C = 00000110 0 00000001 0 0 +``` + +Then we now have that `C[i]` contains a bitset corresponding to the matching +fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that +block. + +Once we have that, we can look for the position of the least significant bit +in `C`. (Least significant because we only target `x86_64` here, which is +always little endian. Thus, the least significant bytes correspond to bytes +in our haystack at a lower address.) That position, modulo `8`, gives us +the pattern that the fingerprint matches. That position, integer divided by +`8`, also gives us the byte offset that the fingerprint occurs in inside the +16 byte haystack block. Using those two pieces of information, we can run a +verification procedure that tries to match all substrings containing that +fingerprint at that position in the haystack. + + +# Implementation notes + +The problem with the algorithm as described above is that it uses a single byte +for a fingerprint. This will work well if the fingerprints are rare in the +haystack (e.g., capital letters or special characters in normal English text), +but if the fingerprints are common, you'll wind up spending too much time in +the verification step, which effectively negates the performance benefits of +scanning 16 bytes at a time. Remember, the key to the performance of this +algorithm is to do as little work as possible per 16 (or 32) bytes. + +This algorithm can be extrapolated in a relatively straight-forward way to use +larger fingerprints. That is, instead of a single byte prefix, we might use a +two or three byte prefix. The implementation here implements N = {1, 2, 3} +and always picks the largest N possible. The rationale is that the bigger the +fingerprint, the fewer verification steps we'll do. Of course, if N is too +large, then we'll end up doing too much on each step. + +The way to extend it is: + +1. Add a mask for each byte in the fingerprint. (Remember that each mask is + composed of two SIMD vectors.) This results in a value of `C` for each byte + in the fingerprint while searching. +2. When testing each 16 (or 32) byte block, each value of `C` must be shifted + so that they are aligned. Once aligned, they should all be `AND`'d together. + This will give you only the bitsets corresponding to the full match of the + fingerprint. To do this, one needs to save the last byte (for N=2) or last + two bytes (for N=3) from the previous iteration, and then line them up with + the first one or two bytes of the next iteration. + +## Verification + +Verification generally follows the procedure outlined above. The tricky parts +are in the right formulation of operations to get our bits out of our vectors. +We have a limited set of operations available to us on SIMD vectors as 128-bit +or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers +from our vectors, and then run our verification step on each of those. The +verification step looks at the least significant bit set, and from its +position, we can derive the byte offset and bucket. (Again, as described +above.) Once we know the bucket, we do a fairly naive exhaustive search for +every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash +table, but I haven't had time to thoroughly explore that. A few initial +half-hearted attempts resulted in worse performance.) + +## AVX + +The AVX version of Teddy extrapolates almost perfectly from the SSE version. +The only hickup is that PALIGNR is used to align chunks in the 16-bit version, +and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it +only works within 128-bit lanes. So there's a bit of tomfoolery to get around +this by shuffling the vectors before calling VPALIGNR. + +The only other aspect to AVX is that since our masks are still fundamentally +16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to +32-byte chunks. + +## Fat Teddy + +In the version of Teddy described above, 8 buckets are used to group patterns +that we want to search for. However, when AVX is available, we can extend the +number of buckets to 16 by permitting each byte in our masks to use 16-bits +instead of 8-bits to represent the buckets it belongs to. (This variant is also +in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a +time, even though we're using AVX. Instead, we have to scan 16 bytes at a time. +What we gain, though, is (hopefully) less work in our verification routine. +It patterns are more spread out across more buckets, then there should overall +be fewer false positives. In general, Fat Teddy permits us to grow our capacity +a bit and search for more literals before Teddy gets overwhelmed. + +The tricky part of Fat Teddy is in how we adjust our masks and our verification +procedure. For the masks, we simply represent the first 8 buckets in each of +the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes. +Then, in the search loop, instead of loading 32 bytes from the haystack, we +load the same 16 bytes from the haystack into both the low and high 16 byte +portions of our 256-bit vector. So for example, a mask might look like this: + + bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000 + byte: 31 30 16 15 14 0 + offset: 15 14 0 15 14 0 + buckets: 8-15 8-15 8-15 0-7 0-7 0-7 + +Where `byte` is the position in the vector (higher numbers corresponding to +more significant bits), `offset` is the corresponding position in the haystack +chunk, and `buckets` corresponds to the bucket assignments for that particular +byte. + +In particular, notice that the bucket assignments for offset `0` are spread +out between bytes `0` and `16`. This works well for the chunk-by-chunk search +procedure, but verification really wants to process all bucket assignments for +each offset at once. Otherwise, we might wind up finding a match at offset +`1` in one the first 8 buckets, when we really should have reported a match +at offset `0` in one of the second 8 buckets. (Because we want the leftmost +match.) + +Thus, for verification, we rearrange the above vector such that it is a +sequence of 16-bit integers, where the least significant 16-bit integer +corresponds to all of the bucket assignments for offset `0`. So with the +above vector, the least significant 16-bit integer would be + + 11000000 000000 + +which was taken from bytes `16` and `0`. Then the verification step pretty much +runs as described, except with 16 buckets instead of 8. + + +# References + +- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan), + [webpage](https://www.hyperscan.io/) +- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R., + & Weimann, O. (2011). + _Optimal packed string matching_. + In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13). + Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik. + DOI: 10.4230/LIPIcs.FSTTCS.2011.423. + [PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf). +- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R., + & Weimann, O. (2014). + _Towards optimal packed string matching_. + Theoretical Computer Science, 525, 111-129. + DOI: 10.1016/j.tcs.2013.06.013. + [PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf). +- **[3]** Bille, P. (2011). + _Fast searching in packed strings_. + Journal of Discrete Algorithms, 9(1), 49-56. + DOI: 10.1016/j.jda.2010.09.003. + [PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353). +- **[4a]** Faro, S., & Külekci, M. O. (2012, October). + _Fast multiple string matching using streaming SIMD extensions technology_. + In String Processing and Information Retrieval (pp. 217-228). + Springer Berlin Heidelberg. + DOI: 10.1007/978-3-642-34109-0_23. + [PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf). +- **[4b]** Faro, S., & Külekci, M. O. (2013, September). + _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_. + In Stringology (pp. 78-91). + [PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf). +- **[4c]** Faro, S., & Külekci, M. O. (2013, January). + _Fast packed string matching for short patterns_. + In Proceedings of the Meeting on Algorithm Engineering & Expermiments + (pp. 113-121). + Society for Industrial and Applied Mathematics. + [PDF](https://arxiv.org/pdf/1209.6449.pdf). +- **[4d]** Faro, S., & Külekci, M. O. (2014). + _Fast and flexible packed string matching_. + Journal of Discrete Algorithms, 28, 61-72. + DOI: 10.1016/j.jda.2014.07.003. + +[1_u]: https://github.com/intel/hyperscan +[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide diff --git a/vendor/aho-corasick/src/packed/teddy/compile.rs b/vendor/aho-corasick/src/packed/teddy/compile.rs new file mode 100644 index 0000000000..741cb6923c --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/compile.rs @@ -0,0 +1,414 @@ +// See the README in this directory for an explanation of the Teddy algorithm. + +use std::cmp; +use std::collections::BTreeMap; +use std::fmt; + +use crate::packed::pattern::{PatternID, Patterns}; +use crate::packed::teddy::Teddy; + +/// A builder for constructing a Teddy matcher. +/// +/// The builder primarily permits fine grained configuration of the Teddy +/// matcher. Most options are made only available for testing/benchmarking +/// purposes. In reality, options are automatically determined by the nature +/// and number of patterns given to the builder. +#[derive(Clone, Debug)] +pub struct Builder { + /// When none, this is automatically determined. Otherwise, `false` means + /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used + /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't + /// available and Fat Teddy was requested, no matcher will be built. + fat: Option, + /// When none, this is automatically determined. Otherwise, `false` means + /// that 128-bit vectors will be used (up to SSSE3 instructions) where as + /// `true` means that 256-bit vectors will be used. As with `fat`, if + /// 256-bit vectors are requested and they aren't available, then a + /// searcher will not be built. + avx: Option, +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +impl Builder { + /// Create a new builder for configuring a Teddy matcher. + pub fn new() -> Builder { + Builder { fat: None, avx: None } + } + + /// Build a matcher for the set of patterns given. If a matcher could not + /// be built, then `None` is returned. + /// + /// Generally, a matcher isn't built if the necessary CPU features aren't + /// available, an unsupported target or if the searcher is believed to be + /// slower than standard techniques (i.e., if there are too many literals). + pub fn build(&self, patterns: &Patterns) -> Option { + self.build_imp(patterns) + } + + /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses + /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful + /// for a larger set of literals. + /// + /// `None` is the default, which results in an automatic selection based + /// on the number of literals and available CPU features. + pub fn fat(&mut self, yes: Option) -> &mut Builder { + self.fat = yes; + self + } + + /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). + /// Generally, a larger vector size is better since it either permits + /// matching more patterns or matching more bytes in the haystack at once. + /// + /// `None` is the default, which results in an automatic selection based on + /// the number of literals and available CPU features. + pub fn avx(&mut self, yes: Option) -> &mut Builder { + self.avx = yes; + self + } + + fn build_imp(&self, patterns: &Patterns) -> Option { + use crate::packed::teddy::runtime; + + // Most of the logic here is just about selecting the optimal settings, + // or perhaps even rejecting construction altogether. The choices + // we have are: fat (avx only) or not, ssse3 or avx2, and how many + // patterns we allow ourselves to search. Additionally, for testing + // and benchmarking, we permit callers to try to "force" a setting, + // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't + // available), then we bail and return nothing. + + if patterns.len() > 64 { + return None; + } + let has_ssse3 = is_x86_feature_detected!("ssse3"); + let has_avx = is_x86_feature_detected!("avx2"); + let avx = if self.avx == Some(true) { + if !has_avx { + return None; + } + true + } else if self.avx == Some(false) { + if !has_ssse3 { + return None; + } + false + } else if !has_ssse3 && !has_avx { + return None; + } else { + has_avx + }; + let fat = match self.fat { + None => avx && patterns.len() > 32, + Some(false) => false, + Some(true) if !avx => return None, + Some(true) => true, + }; + + let mut compiler = Compiler::new(patterns, fat); + compiler.compile(); + let Compiler { buckets, masks, .. } = compiler; + // SAFETY: It is required that the builder only produce Teddy matchers + // that are allowed to run on the current CPU, since we later assume + // that the presence of (for example) TeddySlim1Mask256 means it is + // safe to call functions marked with the `avx2` target feature. + match (masks.len(), avx, fat) { + (1, false, _) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim1Mask128( + runtime::TeddySlim1Mask128 { + mask1: runtime::Mask128::new(masks[0]), + }, + ), + }), + (1, true, false) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim1Mask256( + runtime::TeddySlim1Mask256 { + mask1: runtime::Mask256::new(masks[0]), + }, + ), + }), + (1, true, true) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddyFat1Mask256( + runtime::TeddyFat1Mask256 { + mask1: runtime::Mask256::new(masks[0]), + }, + ), + }), + (2, false, _) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim2Mask128( + runtime::TeddySlim2Mask128 { + mask1: runtime::Mask128::new(masks[0]), + mask2: runtime::Mask128::new(masks[1]), + }, + ), + }), + (2, true, false) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim2Mask256( + runtime::TeddySlim2Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + }, + ), + }), + (2, true, true) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddyFat2Mask256( + runtime::TeddyFat2Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + }, + ), + }), + (3, false, _) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim3Mask128( + runtime::TeddySlim3Mask128 { + mask1: runtime::Mask128::new(masks[0]), + mask2: runtime::Mask128::new(masks[1]), + mask3: runtime::Mask128::new(masks[2]), + }, + ), + }), + (3, true, false) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim3Mask256( + runtime::TeddySlim3Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + mask3: runtime::Mask256::new(masks[2]), + }, + ), + }), + (3, true, true) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddyFat3Mask256( + runtime::TeddyFat3Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + mask3: runtime::Mask256::new(masks[2]), + }, + ), + }), + _ => unreachable!(), + } + } +} + +/// A compiler is in charge of allocating patterns into buckets and generating +/// the masks necessary for searching. +#[derive(Clone)] +struct Compiler<'p> { + patterns: &'p Patterns, + buckets: Vec>, + masks: Vec, +} + +impl<'p> Compiler<'p> { + /// Create a new Teddy compiler for the given patterns. If `fat` is true, + /// then 16 buckets will be used instead of 8. + /// + /// This panics if any of the patterns given are empty. + fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { + let mask_len = cmp::min(3, patterns.minimum_len()); + assert!(1 <= mask_len && mask_len <= 3); + + Compiler { + patterns, + buckets: vec![vec![]; if fat { 16 } else { 8 }], + masks: vec![Mask::default(); mask_len], + } + } + + /// Compile the patterns in this compiler into buckets and masks. + fn compile(&mut self) { + let mut lonibble_to_bucket: BTreeMap, usize> = BTreeMap::new(); + for (id, pattern) in self.patterns.iter() { + // We try to be slightly clever in how we assign patterns into + // buckets. Generally speaking, we want patterns with the same + // prefix to be in the same bucket, since it minimizes the amount + // of time we spend churning through buckets in the verification + // step. + // + // So we could assign patterns with the same N-prefix (where N + // is the size of the mask, which is one of {1, 2, 3}) to the + // same bucket. However, case insensitive searches are fairly + // common, so we'd for example, ideally want to treat `abc` and + // `ABC` as if they shared the same prefix. ASCII has the nice + // property that the lower 4 bits of A and a are the same, so we + // therefore group patterns with the same low-nybbe-N-prefix into + // the same bucket. + // + // MOREOVER, this is actually necessary for correctness! In + // particular, by grouping patterns with the same prefix into the + // same bucket, we ensure that we preserve correct leftmost-first + // and leftmost-longest match semantics. In addition to the fact + // that `patterns.iter()` iterates in the correct order, this + // guarantees that all possible ambiguous matches will occur in + // the same bucket. The verification routine could be adjusted to + // support correct leftmost match semantics regardless of bucket + // allocation, but that results in a performance hit. It's much + // nicer to be able to just stop as soon as a match is found. + let lonybs = pattern.low_nybbles(self.masks.len()); + if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) { + self.buckets[bucket].push(id); + } else { + // N.B. We assign buckets in reverse because it shouldn't have + // any influence on performance, but it does make it harder to + // get leftmost match semantics accidentally correct. + let bucket = (self.buckets.len() - 1) + - (id as usize % self.buckets.len()); + self.buckets[bucket].push(id); + lonibble_to_bucket.insert(lonybs, bucket); + } + } + for (bucket_index, bucket) in self.buckets.iter().enumerate() { + for &pat_id in bucket { + let pat = self.patterns.get(pat_id); + for (i, mask) in self.masks.iter_mut().enumerate() { + if self.buckets.len() == 8 { + mask.add_slim(bucket_index as u8, pat.bytes()[i]); + } else { + mask.add_fat(bucket_index as u8, pat.bytes()[i]); + } + } + } + } + } +} + +impl<'p> fmt::Debug for Compiler<'p> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut buckets = vec![vec![]; self.buckets.len()]; + for (i, bucket) in self.buckets.iter().enumerate() { + for &patid in bucket { + buckets[i].push(self.patterns.get(patid)); + } + } + f.debug_struct("Compiler") + .field("buckets", &buckets) + .field("masks", &self.masks) + .finish() + } +} + +/// Mask represents the low and high nybble masks that will be used during +/// search. Each mask is 32 bytes wide, although only the first 16 bytes are +/// used for the SSSE3 runtime. +/// +/// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set +/// if and only if the corresponding nybble is in the ith bucket. The index of +/// the byte (0-15, inclusive) corresponds to the nybble. +/// +/// Each mask is used as the target of a shuffle, where the indices for the +/// shuffle are taken from the haystack. AND'ing the shuffles for both the +/// low and high masks together also results in 8-bit bitsets, but where bit +/// `i` is set if and only if the correspond *byte* is in the ith bucket. +/// +/// During compilation, masks are just arrays. But during search, these masks +/// are represented as 128-bit or 256-bit vectors. +/// +/// (See the README is this directory for more details.) +#[derive(Clone, Copy, Default)] +pub struct Mask { + lo: [u8; 32], + hi: [u8; 32], +} + +impl Mask { + /// Update this mask by adding the given byte to the given bucket. The + /// given bucket must be in the range 0-7. + /// + /// This is for "slim" Teddy, where there are only 8 buckets. + fn add_slim(&mut self, bucket: u8, byte: u8) { + assert!(bucket < 8); + + let byte_lo = (byte & 0xF) as usize; + let byte_hi = ((byte >> 4) & 0xF) as usize; + // When using 256-bit vectors, we need to set this bucket assignment in + // the low and high 128-bit portions of the mask. This allows us to + // process 32 bytes at a time. Namely, AVX2 shuffles operate on each + // of the 128-bit lanes, rather than the full 256-bit vector at once. + self.lo[byte_lo] |= 1 << bucket; + self.lo[byte_lo + 16] |= 1 << bucket; + self.hi[byte_hi] |= 1 << bucket; + self.hi[byte_hi + 16] |= 1 << bucket; + } + + /// Update this mask by adding the given byte to the given bucket. The + /// given bucket must be in the range 0-15. + /// + /// This is for "fat" Teddy, where there are 16 buckets. + fn add_fat(&mut self, bucket: u8, byte: u8) { + assert!(bucket < 16); + + let byte_lo = (byte & 0xF) as usize; + let byte_hi = ((byte >> 4) & 0xF) as usize; + // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, + // the high 128 bits of our mask correspond to buckets 8-15, while the + // low 128 bits correspond to buckets 0-7. + if bucket < 8 { + self.lo[byte_lo] |= 1 << bucket; + self.hi[byte_hi] |= 1 << bucket; + } else { + self.lo[byte_lo + 16] |= 1 << (bucket % 8); + self.hi[byte_hi + 16] |= 1 << (bucket % 8); + } + } + + /// Return the low 128 bits of the low-nybble mask. + pub fn lo128(&self) -> [u8; 16] { + let mut tmp = [0; 16]; + tmp.copy_from_slice(&self.lo[..16]); + tmp + } + + /// Return the full low-nybble mask. + pub fn lo256(&self) -> [u8; 32] { + self.lo + } + + /// Return the low 128 bits of the high-nybble mask. + pub fn hi128(&self) -> [u8; 16] { + let mut tmp = [0; 16]; + tmp.copy_from_slice(&self.hi[..16]); + tmp + } + + /// Return the full high-nybble mask. + pub fn hi256(&self) -> [u8; 32] { + self.hi + } +} + +impl fmt::Debug for Mask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let (mut parts_lo, mut parts_hi) = (vec![], vec![]); + for i in 0..32 { + parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); + parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); + } + f.debug_struct("Mask") + .field("lo", &parts_lo) + .field("hi", &parts_hi) + .finish() + } +} diff --git a/vendor/aho-corasick/src/packed/teddy/mod.rs b/vendor/aho-corasick/src/packed/teddy/mod.rs new file mode 100644 index 0000000000..3268cdfad2 --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/mod.rs @@ -0,0 +1,62 @@ +#[cfg(target_arch = "x86_64")] +pub use crate::packed::teddy::compile::Builder; +#[cfg(not(target_arch = "x86_64"))] +pub use crate::packed::teddy::fallback::Builder; +#[cfg(not(target_arch = "x86_64"))] +pub use crate::packed::teddy::fallback::Teddy; +#[cfg(target_arch = "x86_64")] +pub use crate::packed::teddy::runtime::Teddy; + +#[cfg(target_arch = "x86_64")] +mod compile; +#[cfg(target_arch = "x86_64")] +mod runtime; + +#[cfg(not(target_arch = "x86_64"))] +mod fallback { + use crate::packed::pattern::Patterns; + use crate::Match; + + #[derive(Clone, Debug, Default)] + pub struct Builder(()); + + impl Builder { + pub fn new() -> Builder { + Builder(()) + } + + pub fn build(&self, _: &Patterns) -> Option { + None + } + + pub fn fat(&mut self, _: Option) -> &mut Builder { + self + } + + pub fn avx(&mut self, _: Option) -> &mut Builder { + self + } + } + + #[derive(Clone, Debug)] + pub struct Teddy(()); + + impl Teddy { + pub fn find_at( + &self, + _: &Patterns, + _: &[u8], + _: usize, + ) -> Option { + None + } + + pub fn minimum_len(&self) -> usize { + 0 + } + + pub fn heap_bytes(&self) -> usize { + 0 + } + } +} diff --git a/vendor/aho-corasick/src/packed/teddy/runtime.rs b/vendor/aho-corasick/src/packed/teddy/runtime.rs new file mode 100644 index 0000000000..0d96913701 --- /dev/null +++ b/vendor/aho-corasick/src/packed/teddy/runtime.rs @@ -0,0 +1,1204 @@ +// See the README in this directory for an explanation of the Teddy algorithm. +// It is strongly recommended to peruse the README before trying to grok this +// code, as its use of SIMD is pretty opaque, although I tried to add comments +// where appropriate. +// +// Moreover, while there is a lot of code in this file, most of it is +// repeated variants of the same thing. Specifically, there are three Teddy +// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) +// and Fat 256-bit Teddy (16 buckets). For each variant, there are three +// implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to +// a total of nine variants. Each one is structured roughly the same: +// +// while at <= len(haystack) - CHUNK_SIZE: +// let candidate = find_candidate_in_chunk(haystack, at) +// if not all zeroes(candidate): +// if match = verify(haystack, at, candidate): +// return match +// +// For the most part, this remains unchanged. The parts that vary are the +// verification routine (for slim vs fat Teddy) and the candidate extraction +// (based on the number of masks). +// +// In the code below, a "candidate" corresponds to a single vector with 8-bit +// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the +// jth lane if and only if the byte occurring at position `j` is in the +// bucket `i` (where the `j`th position is the position in the current window +// of the haystack, which is always 16 or 32 bytes). Note to be careful here: +// the ith bit and the jth lane correspond to the least significant bits of the +// vector. So when visualizing how the current window of bytes is stored in a +// vector, you often need to flip it around. For example, the text `abcd` in a +// 4-byte vector would look like this: +// +// 01100100 01100011 01100010 01100001 +// d c b a +// +// When the mask length is 1, then finding the candidate is pretty straight +// forward: you just apply the shuffle indices (from the haystack window) to +// the masks, and then AND them together, as described in the README. But for +// masks of length 2 and 3, you need to keep a little state. Specifically, +// you need to store the final 1 (for mask length 2) or 2 (for mask length 3) +// bytes of the candidate for use when searching the next window. This is for +// handling matches that span two windows. +// +// With respect to the repeated code, it would likely be possible to reduce +// the number of copies of code below using polymorphism, but I find this +// formulation clearer instead of needing to reason through generics. However, +// I admit, there may be a simpler generic construction that I'm missing. +// +// All variants are fairly heavily tested in src/packed/tests.rs. + +use std::arch::x86_64::*; +use std::mem; + +use crate::packed::pattern::{PatternID, Patterns}; +use crate::packed::teddy::compile; +use crate::packed::vector::*; +use crate::Match; + +/// The Teddy runtime. +/// +/// A Teddy runtime can be used to quickly search for occurrences of one or +/// more patterns. While it does not scale to an arbitrary number of patterns +/// like Aho-Corasick, it does find occurrences for a small set of patterns +/// much more quickly than Aho-Corasick. +/// +/// Teddy cannot run on small haystacks below a certain size, which is +/// dependent on the type of matcher used. This size can be queried via the +/// `minimum_len` method. Violating this will result in a panic. +/// +/// Finally, when callers use a Teddy runtime, they must provide precisely the +/// patterns used to construct the Teddy matcher. Violating this will result +/// in either a panic or incorrect results, but will never sacrifice memory +/// safety. +#[derive(Clone, Debug)] +pub struct Teddy { + /// The allocation of patterns in buckets. This only contains the IDs of + /// patterns. In order to do full verification, callers must provide the + /// actual patterns when using Teddy. + pub buckets: Vec>, + /// The maximum identifier of a pattern. This is used as a sanity check to + /// ensure that the patterns provided by the caller are the same as the + /// patterns that were used to compile the matcher. This sanity check + /// permits safely eliminating bounds checks regardless of what patterns + /// are provided by the caller. + /// + /// Note that users of the aho-corasick crate cannot get this wrong. Only + /// code internal to this crate can get it wrong, since neither `Patterns` + /// type nor the Teddy runtime are public API items. + pub max_pattern_id: PatternID, + /// The actual runtime to use. + pub exec: Exec, +} + +impl Teddy { + /// Return the first occurrence of a match in the given haystack after or + /// starting at `at`. + /// + /// The patterns provided must be precisely the same patterns given to the + /// Teddy builder, otherwise this may panic or produce incorrect results. + /// + /// All matches are consistent with the match semantics (leftmost-first or + /// leftmost-longest) set on `pats`. + pub fn find_at( + &self, + pats: &Patterns, + haystack: &[u8], + at: usize, + ) -> Option { + // This assert is a bit subtle, but it's an important guarantee. + // Namely, if the maximum pattern ID seen by Teddy is the same as the + // one in the patterns given, then we are guaranteed that every pattern + // ID in all Teddy buckets are valid indices into `pats`. While this + // is nominally true, there is no guarantee that callers provide the + // same `pats` to both the Teddy builder and the searcher, which would + // otherwise make `find_at` unsafe to call. But this assert lets us + // keep this routine safe and eliminate an important bounds check in + // verification. + assert_eq!( + self.max_pattern_id, + pats.max_pattern_id(), + "teddy must be called with same patterns it was built with", + ); + // SAFETY: The haystack must have at least a minimum number of bytes + // for Teddy to be able to work. The minimum number varies depending on + // which matcher is used below. If this is violated, then it's possible + // for searching to do out-of-bounds writes. + assert!(haystack[at..].len() >= self.minimum_len()); + // SAFETY: The various Teddy matchers are always safe to call because + // the Teddy builder guarantees that a particular Exec variant is + // built only when it can be run the current CPU. That is, the Teddy + // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is + // enabled. That is, our dynamic CPU feature detection is performed + // once in the builder, and we rely on the type system to avoid needing + // to do it again. + unsafe { + match self.exec { + Exec::TeddySlim1Mask128(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddySlim1Mask256(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddyFat1Mask256(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddySlim2Mask128(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddySlim2Mask256(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddyFat2Mask256(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddySlim3Mask128(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddySlim3Mask256(ref e) => { + e.find_at(pats, self, haystack, at) + } + Exec::TeddyFat3Mask256(ref e) => { + e.find_at(pats, self, haystack, at) + } + } + } + } + + /// Returns the minimum length of a haystack that must be provided by + /// callers to this Teddy searcher. Providing a haystack shorter than this + /// will result in a panic, but will never violate memory safety. + pub fn minimum_len(&self) -> usize { + // SAFETY: These values must be correct in order to ensure safety. + // The Teddy runtime assumes their haystacks have at least these + // lengths. Violating this will sacrifice memory safety. + match self.exec { + Exec::TeddySlim1Mask128(_) => 16, + Exec::TeddySlim1Mask256(_) => 32, + Exec::TeddyFat1Mask256(_) => 16, + Exec::TeddySlim2Mask128(_) => 17, + Exec::TeddySlim2Mask256(_) => 33, + Exec::TeddyFat2Mask256(_) => 17, + Exec::TeddySlim3Mask128(_) => 18, + Exec::TeddySlim3Mask256(_) => 34, + Exec::TeddyFat3Mask256(_) => 34, + } + } + + /// Returns the approximate total amount of heap used by this searcher, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + let num_patterns = self.max_pattern_id as usize + 1; + self.buckets.len() * mem::size_of::>() + + num_patterns * mem::size_of::() + } + + /// Runs the verification routine for Slim 128-bit Teddy. + /// + /// The candidate given should be a collection of 8-bit bitsets (one bitset + /// per lane), where the ith bit is set in the jth lane if and only if the + /// byte occurring at `at + j` in `haystack` is in the bucket `i`. + /// + /// This is not safe to call unless the SSSE3 target feature is enabled. + /// The `target_feature` attribute is not applied since this function is + /// always forcefully inlined. + #[inline(always)] + unsafe fn verify128( + &self, + pats: &Patterns, + haystack: &[u8], + at: usize, + cand: __m128i, + ) -> Option { + debug_assert!(!is_all_zeroes128(cand)); + debug_assert_eq!(8, self.buckets.len()); + + // Convert the candidate into 64-bit chunks, and then verify each of + // those chunks. + let parts = unpack64x128(cand); + for (i, &part) in parts.iter().enumerate() { + let pos = at + i * 8; + if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { + return Some(m); + } + } + None + } + + /// Runs the verification routine for Slim 256-bit Teddy. + /// + /// The candidate given should be a collection of 8-bit bitsets (one bitset + /// per lane), where the ith bit is set in the jth lane if and only if the + /// byte occurring at `at + j` in `haystack` is in the bucket `i`. + /// + /// This is not safe to call unless the AVX2 target feature is enabled. + /// The `target_feature` attribute is not applied since this function is + /// always forcefully inlined. + #[inline(always)] + unsafe fn verify256( + &self, + pats: &Patterns, + haystack: &[u8], + at: usize, + cand: __m256i, + ) -> Option { + debug_assert!(!is_all_zeroes256(cand)); + debug_assert_eq!(8, self.buckets.len()); + + // Convert the candidate into 64-bit chunks, and then verify each of + // those chunks. + let parts = unpack64x256(cand); + for (i, &part) in parts.iter().enumerate() { + let pos = at + i * 8; + if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { + return Some(m); + } + } + None + } + + /// Runs the verification routine for Fat 256-bit Teddy. + /// + /// The candidate given should be a collection of 8-bit bitsets (one bitset + /// per lane), where the ith bit is set in the jth lane if and only if the + /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the + /// bucket `j < 16 ? i : i + 8`. + /// + /// This is not safe to call unless the AVX2 target feature is enabled. + /// The `target_feature` attribute is not applied since this function is + /// always forcefully inlined. + #[inline(always)] + unsafe fn verify_fat256( + &self, + pats: &Patterns, + haystack: &[u8], + at: usize, + cand: __m256i, + ) -> Option { + debug_assert!(!is_all_zeroes256(cand)); + debug_assert_eq!(16, self.buckets.len()); + + // This is a bit tricky, but we basically want to convert our + // candidate, which looks like this + // + // a31 a30 ... a17 a16 a15 a14 ... a01 a00 + // + // where each a(i) is an 8-bit bitset corresponding to the activated + // buckets, to this + // + // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 + // + // Namely, for Fat Teddy, the high 128-bits of the candidate correspond + // to the same bytes in the haystack in the low 128-bits (so we only + // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. + // + // The verification routine wants to look at all potentially matching + // buckets before moving on to the next lane. So for example, both + // a16 and a00 both correspond to the first byte in our window; a00 + // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, + // a16 should be checked before a01. So the transformation shown above + // allows us to use our normal verification procedure with one small + // change: we treat each bitset as 16 bits instead of 8 bits. + + // Swap the 128-bit lanes in the candidate vector. + let swap = _mm256_permute4x64_epi64(cand, 0x4E); + // Interleave the bytes from the low 128-bit lanes, starting with + // cand first. + let r1 = _mm256_unpacklo_epi8(cand, swap); + // Interleave the bytes from the high 128-bit lanes, starting with + // cand first. + let r2 = _mm256_unpackhi_epi8(cand, swap); + // Now just take the 2 low 64-bit integers from both r1 and r2. We + // can drop the high 64-bit integers because they are a mirror image + // of the low 64-bit integers. All we care about are the low 128-bit + // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets + // laid out in the desired order, as described above. + let parts = unpacklo64x256(r1, r2); + for (i, &part) in parts.iter().enumerate() { + let pos = at + i * 4; + if let Some(m) = self.verify64(pats, 16, haystack, pos, part) { + return Some(m); + } + } + None + } + + /// Verify whether there are any matches starting at or after `at` in the + /// given `haystack`. The candidate given should correspond to either 8-bit + /// (for 8 buckets) or 16-bit (16 buckets) bitsets. + #[inline(always)] + fn verify64( + &self, + pats: &Patterns, + bucket_count: usize, + haystack: &[u8], + at: usize, + mut cand: u64, + ) -> Option { + // N.B. While the bucket count is known from self.buckets.len(), + // requiring it as a parameter makes it easier for the optimizer to + // know its value, and thus produce more efficient codegen. + debug_assert!(bucket_count == 8 || bucket_count == 16); + while cand != 0 { + let bit = cand.trailing_zeros() as usize; + cand &= !(1 << bit); + + let at = at + (bit / bucket_count); + let bucket = bit % bucket_count; + if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) { + return Some(m); + } + } + None + } + + /// Verify whether there are any matches starting at `at` in the given + /// `haystack` corresponding only to patterns in the given bucket. + #[inline(always)] + fn verify_bucket( + &self, + pats: &Patterns, + haystack: &[u8], + bucket: usize, + at: usize, + ) -> Option { + // Forcing this function to not inline and be "cold" seems to help + // the codegen for Teddy overall. Interestingly, this is good for a + // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among + // others). Overall, this seems like a problem with codegen, since + // creating the Match itself is a very small amount of code. + #[cold] + #[inline(never)] + fn match_from_span( + pati: PatternID, + start: usize, + end: usize, + ) -> Match { + Match::from_span(pati as usize, start, end) + } + + // N.B. The bounds check for this bucket lookup *should* be elided + // since we assert the number of buckets in each `find_at` routine, + // and the compiler can prove that the `% 8` (or `% 16`) in callers + // of this routine will always be in bounds. + for &pati in &self.buckets[bucket] { + // SAFETY: This is safe because we are guaranteed that every + // index in a Teddy bucket is a valid index into `pats`. This + // guarantee is upheld by the assert checking `max_pattern_id` in + // the beginning of `find_at` above. + // + // This explicit bounds check elision is (amazingly) good for a + // 25-50% boost in some benchmarks, particularly ones with a lot + // of short literals. + let pat = unsafe { pats.get_unchecked(pati) }; + if pat.is_prefix(&haystack[at..]) { + return Some(match_from_span(pati, at, at + pat.len())); + } + } + None + } +} + +/// Exec represents the different search strategies supported by the Teddy +/// runtime. +/// +/// This enum is an important safety abstraction. Namely, callers should only +/// construct a variant in this enum if it is safe to execute its corresponding +/// target features on the current CPU. The 128-bit searchers require SSSE3, +/// while the 256-bit searchers require AVX2. +#[derive(Clone, Debug)] +pub enum Exec { + TeddySlim1Mask128(TeddySlim1Mask128), + TeddySlim1Mask256(TeddySlim1Mask256), + TeddyFat1Mask256(TeddyFat1Mask256), + TeddySlim2Mask128(TeddySlim2Mask128), + TeddySlim2Mask256(TeddySlim2Mask256), + TeddyFat2Mask256(TeddyFat2Mask256), + TeddySlim3Mask128(TeddySlim3Mask128), + TeddySlim3Mask256(TeddySlim3Mask256), + TeddyFat3Mask256(TeddyFat3Mask256), +} + +// Most of the code below remains undocumented because they are effectively +// repeated versions of themselves. The general structure is described in the +// README and in the comments above. + +#[derive(Clone, Debug)] +pub struct TeddySlim1Mask128 { + pub mask1: Mask128, +} + +impl TeddySlim1Mask128 { + #[target_feature(enable = "ssse3")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(8, teddy.buckets.len()); + + let len = haystack.len(); + while at <= len - 16 { + let c = self.candidate(haystack, at); + if !is_all_zeroes128(c) { + if let Some(m) = teddy.verify128(pats, haystack, at, c) { + return Some(m); + } + } + at += 16; + } + if at < len { + at = len - 16; + let c = self.candidate(haystack, at); + if !is_all_zeroes128(c) { + if let Some(m) = teddy.verify128(pats, haystack, at, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i { + debug_assert!(haystack[at..].len() >= 16); + + let chunk = loadu128(haystack, at); + members1m128(chunk, self.mask1) + } +} + +#[derive(Clone, Debug)] +pub struct TeddySlim1Mask256 { + pub mask1: Mask256, +} + +impl TeddySlim1Mask256 { + #[target_feature(enable = "avx2")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(8, teddy.buckets.len()); + + let len = haystack.len(); + while at <= len - 32 { + let c = self.candidate(haystack, at); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify256(pats, haystack, at, c) { + return Some(m); + } + } + at += 32; + } + if at < len { + at = len - 32; + let c = self.candidate(haystack, at); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify256(pats, haystack, at, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { + debug_assert!(haystack[at..].len() >= 32); + + let chunk = loadu256(haystack, at); + members1m256(chunk, self.mask1) + } +} + +#[derive(Clone, Debug)] +pub struct TeddyFat1Mask256 { + pub mask1: Mask256, +} + +impl TeddyFat1Mask256 { + #[target_feature(enable = "avx2")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(16, teddy.buckets.len()); + + let len = haystack.len(); + while at <= len - 16 { + let c = self.candidate(haystack, at); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { + return Some(m); + } + } + at += 16; + } + if at < len { + at = len - 16; + let c = self.candidate(haystack, at); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { + debug_assert!(haystack[at..].len() >= 16); + + let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); + members1m256(chunk, self.mask1) + } +} + +#[derive(Clone, Debug)] +pub struct TeddySlim2Mask128 { + pub mask1: Mask128, + pub mask2: Mask128, +} + +impl TeddySlim2Mask128 { + #[target_feature(enable = "ssse3")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(8, teddy.buckets.len()); + + at += 1; + let len = haystack.len(); + let mut prev0 = ones128(); + while at <= len - 16 { + let c = self.candidate(haystack, at, &mut prev0); + if !is_all_zeroes128(c) { + if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { + return Some(m); + } + } + at += 16; + } + if at < len { + at = len - 16; + prev0 = ones128(); + + let c = self.candidate(haystack, at, &mut prev0); + if !is_all_zeroes128(c) { + if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate( + &self, + haystack: &[u8], + at: usize, + prev0: &mut __m128i, + ) -> __m128i { + debug_assert!(haystack[at..].len() >= 16); + + let chunk = loadu128(haystack, at); + let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); + let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); + _mm_and_si128(res0prev0, res1) + } +} + +#[derive(Clone, Debug)] +pub struct TeddySlim2Mask256 { + pub mask1: Mask256, + pub mask2: Mask256, +} + +impl TeddySlim2Mask256 { + #[target_feature(enable = "avx2")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(8, teddy.buckets.len()); + + at += 1; + let len = haystack.len(); + let mut prev0 = ones256(); + while at <= len - 32 { + let c = self.candidate(haystack, at, &mut prev0); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { + return Some(m); + } + } + at += 32; + } + if at < len { + at = len - 32; + prev0 = ones256(); + + let c = self.candidate(haystack, at, &mut prev0); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate( + &self, + haystack: &[u8], + at: usize, + prev0: &mut __m256i, + ) -> __m256i { + debug_assert!(haystack[at..].len() >= 32); + + let chunk = loadu256(haystack, at); + let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); + let res0prev0 = alignr256_15(res0, *prev0); + let res = _mm256_and_si256(res0prev0, res1); + *prev0 = res0; + res + } +} + +#[derive(Clone, Debug)] +pub struct TeddyFat2Mask256 { + pub mask1: Mask256, + pub mask2: Mask256, +} + +impl TeddyFat2Mask256 { + #[target_feature(enable = "avx2")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(16, teddy.buckets.len()); + + at += 1; + let len = haystack.len(); + let mut prev0 = ones256(); + while at <= len - 16 { + let c = self.candidate(haystack, at, &mut prev0); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c) + { + return Some(m); + } + } + at += 16; + } + if at < len { + at = len - 16; + prev0 = ones256(); + + let c = self.candidate(haystack, at, &mut prev0); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c) + { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate( + &self, + haystack: &[u8], + at: usize, + prev0: &mut __m256i, + ) -> __m256i { + debug_assert!(haystack[at..].len() >= 16); + + let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); + let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); + let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15); + let res = _mm256_and_si256(res0prev0, res1); + *prev0 = res0; + res + } +} + +#[derive(Clone, Debug)] +pub struct TeddySlim3Mask128 { + pub mask1: Mask128, + pub mask2: Mask128, + pub mask3: Mask128, +} + +impl TeddySlim3Mask128 { + #[target_feature(enable = "ssse3")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(8, teddy.buckets.len()); + + at += 2; + let len = haystack.len(); + let (mut prev0, mut prev1) = (ones128(), ones128()); + while at <= len - 16 { + let c = self.candidate(haystack, at, &mut prev0, &mut prev1); + if !is_all_zeroes128(c) { + if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { + return Some(m); + } + } + at += 16; + } + if at < len { + at = len - 16; + prev0 = ones128(); + prev1 = ones128(); + + let c = self.candidate(haystack, at, &mut prev0, &mut prev1); + if !is_all_zeroes128(c) { + if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate( + &self, + haystack: &[u8], + at: usize, + prev0: &mut __m128i, + prev1: &mut __m128i, + ) -> __m128i { + debug_assert!(haystack[at..].len() >= 16); + + let chunk = loadu128(haystack, at); + let (res0, res1, res2) = + members3m128(chunk, self.mask1, self.mask2, self.mask3); + let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14); + let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15); + let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2); + *prev0 = res0; + *prev1 = res1; + res + } +} + +#[derive(Clone, Debug)] +pub struct TeddySlim3Mask256 { + pub mask1: Mask256, + pub mask2: Mask256, + pub mask3: Mask256, +} + +impl TeddySlim3Mask256 { + #[target_feature(enable = "avx2")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(8, teddy.buckets.len()); + + at += 2; + let len = haystack.len(); + let (mut prev0, mut prev1) = (ones256(), ones256()); + while at <= len - 32 { + let c = self.candidate(haystack, at, &mut prev0, &mut prev1); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) { + return Some(m); + } + } + at += 32; + } + if at < len { + at = len - 32; + prev0 = ones256(); + prev1 = ones256(); + + let c = self.candidate(haystack, at, &mut prev0, &mut prev1); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate( + &self, + haystack: &[u8], + at: usize, + prev0: &mut __m256i, + prev1: &mut __m256i, + ) -> __m256i { + debug_assert!(haystack[at..].len() >= 32); + + let chunk = loadu256(haystack, at); + let (res0, res1, res2) = + members3m256(chunk, self.mask1, self.mask2, self.mask3); + let res0prev0 = alignr256_14(res0, *prev0); + let res1prev1 = alignr256_15(res1, *prev1); + let res = + _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2); + *prev0 = res0; + *prev1 = res1; + res + } +} + +#[derive(Clone, Debug)] +pub struct TeddyFat3Mask256 { + pub mask1: Mask256, + pub mask2: Mask256, + pub mask3: Mask256, +} + +impl TeddyFat3Mask256 { + #[target_feature(enable = "avx2")] + unsafe fn find_at( + &self, + pats: &Patterns, + teddy: &Teddy, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(haystack[at..].len() >= teddy.minimum_len()); + // This assert helps eliminate bounds checks for bucket lookups in + // Teddy::verify_bucket, which has a small (3-4%) performance boost. + assert_eq!(16, teddy.buckets.len()); + + at += 2; + let len = haystack.len(); + let (mut prev0, mut prev1) = (ones256(), ones256()); + while at <= len - 16 { + let c = self.candidate(haystack, at, &mut prev0, &mut prev1); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c) + { + return Some(m); + } + } + at += 16; + } + if at < len { + at = len - 16; + prev0 = ones256(); + prev1 = ones256(); + + let c = self.candidate(haystack, at, &mut prev0, &mut prev1); + if !is_all_zeroes256(c) { + if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c) + { + return Some(m); + } + } + } + None + } + + #[inline(always)] + unsafe fn candidate( + &self, + haystack: &[u8], + at: usize, + prev0: &mut __m256i, + prev1: &mut __m256i, + ) -> __m256i { + debug_assert!(haystack[at..].len() >= 16); + + let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); + let (res0, res1, res2) = + members3m256(chunk, self.mask1, self.mask2, self.mask3); + let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14); + let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15); + let res = + _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2); + *prev0 = res0; + *prev1 = res1; + res + } +} + +/// A 128-bit mask for the low and high nybbles in a set of patterns. Each +/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if +/// the nybble `j` is in the bucket `i` at a particular position. +#[derive(Clone, Copy, Debug)] +pub struct Mask128 { + lo: __m128i, + hi: __m128i, +} + +impl Mask128 { + /// Create a new SIMD mask from the mask produced by the Teddy builder. + pub fn new(mask: compile::Mask) -> Mask128 { + // SAFETY: This is safe since [u8; 16] has the same representation + // as __m128i. + unsafe { + Mask128 { + lo: mem::transmute(mask.lo128()), + hi: mem::transmute(mask.hi128()), + } + } + } +} + +/// A 256-bit mask for the low and high nybbles in a set of patterns. Each +/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if +/// the nybble `j` is in the bucket `i` at a particular position. +/// +/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being +/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as +/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a +/// time. (Remember, the nybbles in the haystack are used as indices into these +/// masks, and 256-bit shuffles only operate on 128-bit lanes.) +/// +/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128 +/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets +/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set +/// if it's in the higher 128 bits. +#[derive(Clone, Copy, Debug)] +pub struct Mask256 { + lo: __m256i, + hi: __m256i, +} + +impl Mask256 { + /// Create a new SIMD mask from the mask produced by the Teddy builder. + pub fn new(mask: compile::Mask) -> Mask256 { + // SAFETY: This is safe since [u8; 32] has the same representation + // as __m256i. + unsafe { + Mask256 { + lo: mem::transmute(mask.lo256()), + hi: mem::transmute(mask.hi256()), + } + } + } +} + +// The "members" routines below are responsible for taking a chunk of bytes, +// a number of nybble masks and returning the result of using the masks to +// lookup bytes in the chunk. The results of the high and low nybble masks are +// AND'ed together, such that each candidate returned is a vector, with byte +// sized lanes, and where each lane is an 8-bit bitset corresponding to the +// buckets that contain the corresponding byte. +// +// In the case of masks of length greater than 1, callers will need to keep +// the results from the previous haystack's window, and then shift the vectors +// so that they all line up. Then they can be AND'ed together. + +/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a +/// 16-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and `mask1` corresponds to a +/// low/high mask for the first byte of all patterns that are being searched. +#[target_feature(enable = "ssse3")] +unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { + let lomask = _mm_set1_epi8(0xF); + let hlo = _mm_and_si128(chunk, lomask); + let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); + _mm_and_si128( + _mm_shuffle_epi8(mask1.lo, hlo), + _mm_shuffle_epi8(mask1.hi, hhi), + ) +} + +/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a +/// 32-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and `mask1` corresponds to a +/// low/high mask for the first byte of all patterns that are being searched. +/// +/// Note that this can also be used for Fat Teddy, where the high 128 bits in +/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte +/// window in the haystack. +#[target_feature(enable = "avx2")] +unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { + let lomask = _mm256_set1_epi8(0xF); + let hlo = _mm256_and_si256(chunk, lomask); + let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); + _mm256_and_si256( + _mm256_shuffle_epi8(mask1.lo, hlo), + _mm256_shuffle_epi8(mask1.hi, hhi), + ) +} + +/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds +/// to a 16-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first and second bytes of all patterns that are being +/// searched. The vectors returned correspond to candidates for the first and +/// second bytes in the patterns represented by the masks. +#[target_feature(enable = "ssse3")] +unsafe fn members2m128( + chunk: __m128i, + mask1: Mask128, + mask2: Mask128, +) -> (__m128i, __m128i) { + let lomask = _mm_set1_epi8(0xF); + let hlo = _mm_and_si128(chunk, lomask); + let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); + let res0 = _mm_and_si128( + _mm_shuffle_epi8(mask1.lo, hlo), + _mm_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm_and_si128( + _mm_shuffle_epi8(mask2.lo, hlo), + _mm_shuffle_epi8(mask2.hi, hhi), + ); + (res0, res1) +} + +/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds +/// to a 32-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first and second bytes of all patterns that are being +/// searched. The vectors returned correspond to candidates for the first and +/// second bytes in the patterns represented by the masks. +/// +/// Note that this can also be used for Fat Teddy, where the high 128 bits in +/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte +/// window in the haystack. +#[target_feature(enable = "avx2")] +unsafe fn members2m256( + chunk: __m256i, + mask1: Mask256, + mask2: Mask256, +) -> (__m256i, __m256i) { + let lomask = _mm256_set1_epi8(0xF); + let hlo = _mm256_and_si256(chunk, lomask); + let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); + let res0 = _mm256_and_si256( + _mm256_shuffle_epi8(mask1.lo, hlo), + _mm256_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm256_and_si256( + _mm256_shuffle_epi8(mask2.lo, hlo), + _mm256_shuffle_epi8(mask2.hi, hhi), + ); + (res0, res1) +} + +/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds +/// to a 16-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first, second and third bytes of all patterns that +/// are being searched. The vectors returned correspond to candidates for the +/// first, second and third bytes in the patterns represented by the masks. +#[target_feature(enable = "ssse3")] +unsafe fn members3m128( + chunk: __m128i, + mask1: Mask128, + mask2: Mask128, + mask3: Mask128, +) -> (__m128i, __m128i, __m128i) { + let lomask = _mm_set1_epi8(0xF); + let hlo = _mm_and_si128(chunk, lomask); + let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); + let res0 = _mm_and_si128( + _mm_shuffle_epi8(mask1.lo, hlo), + _mm_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm_and_si128( + _mm_shuffle_epi8(mask2.lo, hlo), + _mm_shuffle_epi8(mask2.hi, hhi), + ); + let res2 = _mm_and_si128( + _mm_shuffle_epi8(mask3.lo, hlo), + _mm_shuffle_epi8(mask3.hi, hhi), + ); + (res0, res1, res2) +} + +/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds +/// to a 32-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first, second and third bytes of all patterns that +/// are being searched. The vectors returned correspond to candidates for the +/// first, second and third bytes in the patterns represented by the masks. +/// +/// Note that this can also be used for Fat Teddy, where the high 128 bits in +/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte +/// window in the haystack. +#[target_feature(enable = "avx2")] +unsafe fn members3m256( + chunk: __m256i, + mask1: Mask256, + mask2: Mask256, + mask3: Mask256, +) -> (__m256i, __m256i, __m256i) { + let lomask = _mm256_set1_epi8(0xF); + let hlo = _mm256_and_si256(chunk, lomask); + let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); + let res0 = _mm256_and_si256( + _mm256_shuffle_epi8(mask1.lo, hlo), + _mm256_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm256_and_si256( + _mm256_shuffle_epi8(mask2.lo, hlo), + _mm256_shuffle_epi8(mask2.hi, hhi), + ); + let res2 = _mm256_and_si256( + _mm256_shuffle_epi8(mask3.lo, hlo), + _mm256_shuffle_epi8(mask3.hi, hhi), + ); + (res0, res1, res2) +} diff --git a/vendor/aho-corasick/src/packed/tests.rs b/vendor/aho-corasick/src/packed/tests.rs new file mode 100644 index 0000000000..91410cb02c --- /dev/null +++ b/vendor/aho-corasick/src/packed/tests.rs @@ -0,0 +1,568 @@ +use std::collections::HashMap; +use std::usize; + +use crate::packed::{Config, MatchKind}; +use crate::Match; + +/// A description of a single test against a multi-pattern searcher. +/// +/// A single test may not necessarily pass on every configuration of a +/// searcher. The tests are categorized and grouped appropriately below. +#[derive(Clone, Debug, Eq, PartialEq)] +struct SearchTest { + /// The name of this test, for debugging. + name: &'static str, + /// The patterns to search for. + patterns: &'static [&'static str], + /// The text to search. + haystack: &'static str, + /// Each match is a triple of (pattern_index, start, end), where + /// pattern_index is an index into `patterns` and `start`/`end` are indices + /// into `haystack`. + matches: &'static [(usize, usize, usize)], +} + +struct SearchTestOwned { + offset: usize, + name: String, + patterns: Vec, + haystack: String, + matches: Vec<(usize, usize, usize)>, +} + +impl SearchTest { + fn variations(&self) -> Vec { + let mut tests = vec![]; + for i in 0..=260 { + tests.push(self.offset_prefix(i)); + tests.push(self.offset_suffix(i)); + tests.push(self.offset_both(i)); + } + tests + } + + fn offset_both(&self, off: usize) -> SearchTestOwned { + SearchTestOwned { + offset: off, + name: self.name.to_string(), + patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + haystack: format!( + "{}{}{}", + "Z".repeat(off), + self.haystack, + "Z".repeat(off) + ), + matches: self + .matches + .iter() + .map(|&(id, s, e)| (id, s + off, e + off)) + .collect(), + } + } + + fn offset_prefix(&self, off: usize) -> SearchTestOwned { + SearchTestOwned { + offset: off, + name: self.name.to_string(), + patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + haystack: format!("{}{}", "Z".repeat(off), self.haystack), + matches: self + .matches + .iter() + .map(|&(id, s, e)| (id, s + off, e + off)) + .collect(), + } + } + + fn offset_suffix(&self, off: usize) -> SearchTestOwned { + SearchTestOwned { + offset: off, + name: self.name.to_string(), + patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + haystack: format!("{}{}", self.haystack, "Z".repeat(off)), + matches: self.matches.to_vec(), + } + } + + // fn to_owned(&self) -> SearchTestOwned { + // SearchTestOwned { + // name: self.name.to_string(), + // patterns: self.patterns.iter().map(|s| s.to_string()).collect(), + // haystack: self.haystack.to_string(), + // matches: self.matches.iter().cloned().collect(), + // } + // } +} + +/// Short-hand constructor for SearchTest. We use it a lot below. +macro_rules! t { + ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { + SearchTest { + name: stringify!($name), + patterns: $patterns, + haystack: $haystack, + matches: $matches, + } + }; +} + +/// A collection of test groups. +type TestCollection = &'static [&'static [SearchTest]]; + +// Define several collections corresponding to the different type of match +// semantics supported. These collections have some overlap, but each +// collection should have some tests that no other collection has. + +/// Tests for leftmost-first match semantics. +const PACKED_LEFTMOST_FIRST: TestCollection = + &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY]; + +/// Tests for leftmost-longest match semantics. +const PACKED_LEFTMOST_LONGEST: TestCollection = + &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY]; + +// Now define the individual tests that make up the collections above. + +/// A collection of tests for the that should always be true regardless of +/// match semantics. That is, all combinations of leftmost-{first, longest} +/// should produce the same answer. +const BASICS: &'static [SearchTest] = &[ + t!(basic001, &["a"], "", &[]), + t!(basic010, &["a"], "a", &[(0, 0, 1)]), + t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), + t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), + t!(basic050, &["a"], "bba", &[(0, 2, 3)]), + t!(basic060, &["a"], "bbb", &[]), + t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), + t!(basic100, &["aa"], "", &[]), + t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), + t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), + t!(basic130, &["aa"], "abbab", &[]), + t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), + t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]), + t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), + t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), + t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), + t!(basic300, &["a", "b"], "", &[]), + t!(basic310, &["a", "b"], "z", &[]), + t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), + t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), + t!( + basic340, + &["a", "b"], + "abba", + &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] + ), + t!( + basic350, + &["b", "a"], + "abba", + &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] + ), + t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), + t!(basic400, &["foo", "bar"], "", &[]), + t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), + t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), + t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), + t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), + t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), + t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), + t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), + t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), + t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), + t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), + t!( + basic720, + &["yabcdef", "bcdeyabc", "abcdezghi"], + "yabcdezghi", + &[(2, 1, 10),] + ), + t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), + t!( + basic840, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] + ), + t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), +]; + +/// Tests for leftmost match semantics. These should pass for both +/// leftmost-first and leftmost-longest match kinds. Stated differently, among +/// ambiguous matches, the longest match and the match that appeared first when +/// constructing the automaton should always be the same. +const LEFTMOST: &'static [SearchTest] = &[ + t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), + t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), + t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), + t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + leftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8), (0, 8, 9),] + ), +]; + +/// Tests for non-overlapping leftmost-first match semantics. These tests +/// should generally be specific to leftmost-first, which means they should +/// generally fail under leftmost-longest semantics. +const LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), + t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), + t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), + t!( + leftfirst310, + &["abcd", "b", "bce", "ce"], + "abce", + &[(1, 1, 2), (3, 2, 4),] + ), + t!( + leftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1), (2, 7, 9),] + ), + t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), + t!( + leftfirst340, + &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"], + "abcdef", + &[(0, 0, 6)] + ), +]; + +/// Tests for non-overlapping leftmost-longest match semantics. These tests +/// should generally be specific to leftmost-longest, which means they should +/// generally fail under leftmost-first semantics. +const LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), + t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), + t!( + leftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), + t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), +]; + +/// Regression tests that are applied to all combinations. +/// +/// If regression tests are needed for specific match semantics, then add them +/// to the appropriate group above. +const REGRESSION: &'static [SearchTest] = &[ + t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), + t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), + t!( + regression030, + &["libcore/", "libstd/"], + "libcore/char/methods.rs", + &[(0, 0, 8),] + ), + t!( + regression040, + &["libstd/", "libcore/"], + "libcore/char/methods.rs", + &[(1, 0, 8),] + ), + t!( + regression050, + &["\x00\x00\x01", "\x00\x00\x00"], + "\x00\x00\x00", + &[(1, 0, 3),] + ), + t!( + regression060, + &["\x00\x00\x00", "\x00\x00\x01"], + "\x00\x00\x00", + &[(0, 0, 3),] + ), +]; + +const TEDDY: &'static [SearchTest] = &[ + t!( + teddy010, + &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + "abcdefghijk", + &[ + (0, 0, 1), + (1, 1, 2), + (2, 2, 3), + (3, 3, 4), + (4, 4, 5), + (5, 5, 6), + (6, 6, 7), + (7, 7, 8), + (8, 8, 9), + (9, 9, 10), + (10, 10, 11) + ] + ), + t!( + teddy020, + &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"], + "abcdefghijk", + &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),] + ), + t!( + teddy030, + &["abc"], + "abcdefghijklmnopqrstuvwxyzabcdefghijk", + &[(0, 0, 3), (0, 26, 29)] + ), +]; + +// Now define a test for each combination of things above that we want to run. +// Since there are a few different combinations for each collection of tests, +// we define a couple of macros to avoid repetition drudgery. The testconfig +// macro constructs the automaton from a given match kind, and runs the search +// tests one-by-one over the given collection. The `with` parameter allows one +// to configure the config with additional parameters. The testcombo macro +// invokes testconfig in precisely this way: it sets up several tests where +// each one turns a different knob on Config. + +macro_rules! testconfig { + ($name:ident, $collection:expr, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut config = Config::new(); + $with(&mut config); + config + .builder() + .extend(test.patterns.iter().map(|p| p.as_bytes())) + .build() + .unwrap() + .find_iter(&test.haystack) + .collect() + }); + } + }; +} + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_default_leftmost_first, + PACKED_LEFTMOST_FIRST, + |_: &mut Config| {} +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_default_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.match_kind(MatchKind::LeftmostLongest); + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_ssse3_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + if is_x86_feature_detected!("ssse3") { + c.force_avx(Some(false)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_ssse3_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + if is_x86_feature_detected!("ssse3") { + c.force_avx(Some(false)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_avx2_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + if is_x86_feature_detected!("avx2") { + c.force_avx(Some(true)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_avx2_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + if is_x86_feature_detected!("avx2") { + c.force_avx(Some(true)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_fat_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + if is_x86_feature_detected!("avx2") { + c.force_teddy_fat(Some(true)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_fat_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + if is_x86_feature_detected!("avx2") { + c.force_teddy_fat(Some(true)); + } + } +); + +testconfig!( + search_rabinkarp_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_rabin_karp(true); + } +); + +testconfig!( + search_rabinkarp_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_rabin_karp(true).match_kind(MatchKind::LeftmostLongest); + } +); + +#[test] +fn search_tests_have_unique_names() { + let assert = |constname, tests: &[SearchTest]| { + let mut seen = HashMap::new(); // map from test name to position + for (i, test) in tests.iter().enumerate() { + if !seen.contains_key(test.name) { + seen.insert(test.name, i); + } else { + let last = seen[test.name]; + panic!( + "{} tests have duplicate names at positions {} and {}", + constname, last, i + ); + } + } + }; + assert("BASICS", BASICS); + assert("LEFTMOST", LEFTMOST); + assert("LEFTMOST_FIRST", LEFTMOST_FIRST); + assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); + assert("REGRESSION", REGRESSION); + assert("TEDDY", TEDDY); +} + +fn run_search_tests Vec>( + which: TestCollection, + mut f: F, +) { + let get_match_triples = + |matches: Vec| -> Vec<(usize, usize, usize)> { + matches + .into_iter() + .map(|m| (m.pattern(), m.start(), m.end())) + .collect() + }; + for &tests in which { + for spec in tests { + for test in spec.variations() { + assert_eq!( + test.matches, + get_match_triples(f(&test)).as_slice(), + "test: {}, patterns: {:?}, haystack: {:?}, offset: {:?}", + test.name, + test.patterns, + test.haystack, + test.offset, + ); + } + } + } +} diff --git a/vendor/aho-corasick/src/packed/vector.rs b/vendor/aho-corasick/src/packed/vector.rs new file mode 100644 index 0000000000..ca6c2b0f97 --- /dev/null +++ b/vendor/aho-corasick/src/packed/vector.rs @@ -0,0 +1,181 @@ +// This file contains a set of fairly generic utility functions when working +// with SIMD vectors. +// +// SAFETY: All of the routines below are unsafe to call because they assume +// the necessary CPU target features in order to use particular vendor +// intrinsics. Calling these routines when the underlying CPU does not support +// the appropriate target features is NOT safe. Callers must ensure this +// themselves. +// +// Note that it may not look like this safety invariant is being upheld when +// these routines are called. Namely, the CPU feature check is typically pretty +// far away from when these routines are used. Instead, we rely on the fact +// that certain types serve as a guaranteed receipt that pertinent target +// features are enabled. For example, the only way TeddySlim3Mask256 can be +// constructed is if the AVX2 CPU feature is available. Thus, any code running +// inside of TeddySlim3Mask256 can use any of the functions below without any +// additional checks: its very existence *is* the check. + +use std::arch::x86_64::*; + +/// Shift `a` to the left by two bytes (removing its two most significant +/// bytes), and concatenate it with the the two most significant bytes of `b`. +#[target_feature(enable = "avx2")] +pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR + // instructions, which is not what we want, so we need to do some extra + // shuffling. + + // This permute gives us the low 16 bytes of a concatenated with the high + // 16 bytes of b, in order of most significant to least significant. So + // `v = a[15:0] b[31:16]`. + let v = _mm256_permute2x128_si256(b, a, 0x21); + // This effectively does this (where we deal in terms of byte-indexing + // and byte-shifting, and use inclusive ranges): + // + // ret[15:0] := ((a[15:0] << 16) | v[15:0]) >> 14 + // = ((a[15:0] << 16) | b[31:16]) >> 14 + // ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14 + // = ((a[31:16] << 16) | a[15:0]) >> 14 + // + // Which therefore results in: + // + // ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30] + // + // The end result is that we've effectively done this: + // + // (a << 2) | (b >> 30) + // + // When `A` and `B` are strings---where the beginning of the string is in + // the least significant bits---we effectively result in the following + // semantic operation: + // + // (A >> 2) | (B << 30) + // + // The reversal being attributed to the fact that we are in little-endian. + _mm256_alignr_epi8(a, v, 14) +} + +/// Shift `a` to the left by one byte (removing its most significant byte), and +/// concatenate it with the the most significant byte of `b`. +#[target_feature(enable = "avx2")] +pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i { + // For explanation, see alignr256_14. + let v = _mm256_permute2x128_si256(b, a, 0x21); + _mm256_alignr_epi8(a, v, 15) +} + +/// Unpack the given 128-bit vector into its 64-bit components. The first +/// element of the array returned corresponds to the least significant 64-bit +/// lane in `a`. +#[target_feature(enable = "ssse3")] +pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] { + [ + _mm_cvtsi128_si64(a) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64, + ] +} + +/// Unpack the given 256-bit vector into its 64-bit components. The first +/// element of the array returned corresponds to the least significant 64-bit +/// lane in `a`. +#[target_feature(enable = "avx2")] +pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] { + // Using transmute here is precisely equivalent, but actually slower. It's + // not quite clear why. + let lo = _mm256_extracti128_si256(a, 0); + let hi = _mm256_extracti128_si256(a, 1); + [ + _mm_cvtsi128_si64(lo) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, + _mm_cvtsi128_si64(hi) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, + ] +} + +/// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit +/// integers. +/// +/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element +/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits, +/// then the return value is `b2 b1 a2 a1`. +#[target_feature(enable = "avx2")] +pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] { + let lo = _mm256_castsi256_si128(a); + let hi = _mm256_castsi256_si128(b); + [ + _mm_cvtsi128_si64(lo) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, + _mm_cvtsi128_si64(hi) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, + ] +} + +/// Returns true if and only if all bits in the given 128-bit vector are 0. +#[target_feature(enable = "ssse3")] +pub unsafe fn is_all_zeroes128(a: __m128i) -> bool { + let cmp = _mm_cmpeq_epi8(a, zeroes128()); + _mm_movemask_epi8(cmp) as u32 == 0xFFFF +} + +/// Returns true if and only if all bits in the given 256-bit vector are 0. +#[target_feature(enable = "avx2")] +pub unsafe fn is_all_zeroes256(a: __m256i) -> bool { + let cmp = _mm256_cmpeq_epi8(a, zeroes256()); + _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF +} + +/// Load a 128-bit vector from slice at the given position. The slice does +/// not need to be unaligned. +/// +/// Since this code assumes little-endian (there is no big-endian x86), the +/// bytes starting in `slice[at..]` will be at the least significant bits of +/// the returned vector. This is important for the surrounding code, since for +/// example, shifting the resulting vector right is equivalent to logically +/// shifting the bytes in `slice` left. +#[target_feature(enable = "sse2")] +pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i { + let ptr = slice.get_unchecked(at..).as_ptr(); + _mm_loadu_si128(ptr as *const u8 as *const __m128i) +} + +/// Load a 256-bit vector from slice at the given position. The slice does +/// not need to be unaligned. +/// +/// Since this code assumes little-endian (there is no big-endian x86), the +/// bytes starting in `slice[at..]` will be at the least significant bits of +/// the returned vector. This is important for the surrounding code, since for +/// example, shifting the resulting vector right is equivalent to logically +/// shifting the bytes in `slice` left. +#[target_feature(enable = "avx2")] +pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i { + let ptr = slice.get_unchecked(at..).as_ptr(); + _mm256_loadu_si256(ptr as *const u8 as *const __m256i) +} + +/// Returns a 128-bit vector with all bits set to 0. +#[target_feature(enable = "sse2")] +pub unsafe fn zeroes128() -> __m128i { + _mm_set1_epi8(0) +} + +/// Returns a 256-bit vector with all bits set to 0. +#[target_feature(enable = "avx2")] +pub unsafe fn zeroes256() -> __m256i { + _mm256_set1_epi8(0) +} + +/// Returns a 128-bit vector with all bits set to 1. +#[target_feature(enable = "sse2")] +pub unsafe fn ones128() -> __m128i { + _mm_set1_epi8(0xFF as u8 as i8) +} + +/// Returns a 256-bit vector with all bits set to 1. +#[target_feature(enable = "avx2")] +pub unsafe fn ones256() -> __m256i { + _mm256_set1_epi8(0xFF as u8 as i8) +} diff --git a/vendor/aho-corasick/src/prefilter.rs b/vendor/aho-corasick/src/prefilter.rs new file mode 100644 index 0000000000..ef814117cc --- /dev/null +++ b/vendor/aho-corasick/src/prefilter.rs @@ -0,0 +1,1057 @@ +use std::cmp; +use std::fmt; +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::u8; + +use memchr::{memchr, memchr2, memchr3}; + +use crate::ahocorasick::MatchKind; +use crate::packed; +use crate::Match; + +/// A candidate is the result of running a prefilter on a haystack at a +/// particular position. The result is either no match, a confirmed match or +/// a possible match. +/// +/// When no match is returned, the prefilter is guaranteeing that no possible +/// match can be found in the haystack, and the caller may trust this. That is, +/// all correct prefilters must never report false negatives. +/// +/// In some cases, a prefilter can confirm a match very quickly, in which case, +/// the caller may use this to stop what it's doing and report the match. In +/// this case, prefilter implementations must never report a false positive. +/// In other cases, the prefilter can only report a potential match, in which +/// case the callers must attempt to confirm the match. In this case, prefilter +/// implementations are permitted to return false positives. +#[derive(Clone, Debug)] +pub enum Candidate { + None, + Match(Match), + PossibleStartOfMatch(usize), +} + +impl Candidate { + /// Convert this candidate into an option. This is useful when callers + /// do not distinguish between true positives and false positives (i.e., + /// the caller must always confirm the match in order to update some other + /// state). + pub fn into_option(self) -> Option { + match self { + Candidate::None => None, + Candidate::Match(ref m) => Some(m.start()), + Candidate::PossibleStartOfMatch(start) => Some(start), + } + } +} + +/// A prefilter describes the behavior of fast literal scanners for quickly +/// skipping past bytes in the haystack that we know cannot possibly +/// participate in a match. +pub trait Prefilter: + Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug +{ + /// Returns the next possible match candidate. This may yield false + /// positives, so callers must confirm a match starting at the position + /// returned. This, however, must never produce false negatives. That is, + /// this must, at minimum, return the starting position of the next match + /// in the given haystack after or at the given position. + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate; + + /// A method for cloning a prefilter, to work-around the fact that Clone + /// is not object-safe. + fn clone_prefilter(&self) -> Box; + + /// Returns the approximate total amount of heap used by this prefilter, in + /// units of bytes. + fn heap_bytes(&self) -> usize; + + /// Returns true if and only if this prefilter never returns false + /// positives. This is useful for completely avoiding the automaton + /// when the prefilter can quickly confirm its own matches. + /// + /// By default, this returns true, which is conservative; it is always + /// correct to return `true`. Returning `false` here and reporting a false + /// positive will result in incorrect searches. + fn reports_false_positives(&self) -> bool { + true + } + + /// Returns true if and only if this prefilter may look for a non-starting + /// position of a match. + /// + /// This is useful in a streaming context where prefilters that don't look + /// for a starting position of a match can be quite difficult to deal with. + /// + /// This returns false by default. + fn looks_for_non_start_of_match(&self) -> bool { + false + } +} + +impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { + #[inline] + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + (**self).next_candidate(state, haystack, at) + } + + fn clone_prefilter(&self) -> Box { + (**self).clone_prefilter() + } + + fn heap_bytes(&self) -> usize { + (**self).heap_bytes() + } + + fn reports_false_positives(&self) -> bool { + (**self).reports_false_positives() + } +} + +/// A convenience object for representing any type that implements Prefilter +/// and is cloneable. +#[derive(Debug)] +pub struct PrefilterObj(Box); + +impl Clone for PrefilterObj { + fn clone(&self) -> Self { + PrefilterObj(self.0.clone_prefilter()) + } +} + +impl PrefilterObj { + /// Create a new prefilter object. + pub fn new(t: T) -> PrefilterObj { + PrefilterObj(Box::new(t)) + } + + /// Return the underlying prefilter trait object. + pub fn as_ref(&self) -> &dyn Prefilter { + &*self.0 + } +} + +/// PrefilterState tracks state associated with the effectiveness of a +/// prefilter. It is used to track how many bytes, on average, are skipped by +/// the prefilter. If this average dips below a certain threshold over time, +/// then the state renders the prefilter inert and stops using it. +/// +/// A prefilter state should be created for each search. (Where creating an +/// iterator via, e.g., `find_iter`, is treated as a single search.) +#[derive(Clone, Debug)] +pub struct PrefilterState { + /// The number of skips that has been executed. + skips: usize, + /// The total number of bytes that have been skipped. + skipped: usize, + /// The maximum length of a match. This is used to help determine how many + /// bytes on average should be skipped in order for a prefilter to be + /// effective. + max_match_len: usize, + /// Once this heuristic has been deemed permanently ineffective, it will be + /// inert throughout the rest of its lifetime. This serves as a cheap way + /// to check inertness. + inert: bool, + /// The last (absolute) position at which a prefilter scanned to. + /// Prefilters can use this position to determine whether to re-scan or + /// not. + /// + /// Unlike other things that impact effectiveness, this is a fleeting + /// condition. That is, a prefilter can be considered ineffective if it is + /// at a position before `last_scan_at`, but can become effective again + /// once the search moves past `last_scan_at`. + /// + /// The utility of this is to both avoid additional overhead from calling + /// the prefilter and to avoid quadratic behavior. This ensures that a + /// prefilter will scan any particular byte at most once. (Note that some + /// prefilters, like the start-byte prefilter, do not need to use this + /// field at all, since it only looks for starting bytes.) + last_scan_at: usize, +} + +impl PrefilterState { + /// The minimum number of skip attempts to try before considering whether + /// a prefilter is effective or not. + const MIN_SKIPS: usize = 40; + + /// The minimum amount of bytes that skipping must average, expressed as a + /// factor of the multiple of the length of a possible match. + /// + /// That is, after MIN_SKIPS have occurred, if the average number of bytes + /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the + /// prefilter outed to be rendered inert. + const MIN_AVG_FACTOR: usize = 2; + + /// Create a fresh prefilter state. + pub fn new(max_match_len: usize) -> PrefilterState { + PrefilterState { + skips: 0, + skipped: 0, + max_match_len, + inert: false, + last_scan_at: 0, + } + } + + /// Create a prefilter state that always disables the prefilter. + pub fn disabled() -> PrefilterState { + PrefilterState { + skips: 0, + skipped: 0, + max_match_len: 0, + inert: true, + last_scan_at: 0, + } + } + + /// Update this state with the number of bytes skipped on the last + /// invocation of the prefilter. + #[inline] + fn update_skipped_bytes(&mut self, skipped: usize) { + self.skips += 1; + self.skipped += skipped; + } + + /// Updates the position at which the last scan stopped. This may be + /// greater than the position of the last candidate reported. For example, + /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz` + /// will report a candidate at position `0`, but the end of its last scan + /// will be at position `3`. + /// + /// This position factors into the effectiveness of this prefilter. If the + /// current position is less than the last position at which a scan ended, + /// then the prefilter should not be re-run until the search moves past + /// that position. + #[inline] + fn update_at(&mut self, at: usize) { + if at > self.last_scan_at { + self.last_scan_at = at; + } + } + + /// Return true if and only if this state indicates that a prefilter is + /// still effective. + /// + /// The given pos should correspond to the current starting position of the + /// search. + #[inline] + pub fn is_effective(&mut self, at: usize) -> bool { + if self.inert { + return false; + } + if at < self.last_scan_at { + return false; + } + if self.skips < PrefilterState::MIN_SKIPS { + return true; + } + + let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len; + if self.skipped >= min_avg * self.skips { + return true; + } + + // We're inert. + self.inert = true; + false + } +} + +/// A builder for constructing the best possible prefilter. When constructed, +/// this builder will heuristically select the best prefilter it can build, +/// if any, and discard the rest. +#[derive(Debug)] +pub struct Builder { + count: usize, + ascii_case_insensitive: bool, + start_bytes: StartBytesBuilder, + rare_bytes: RareBytesBuilder, + packed: Option, +} + +impl Builder { + /// Create a new builder for constructing the best possible prefilter. + pub fn new(kind: MatchKind) -> Builder { + let pbuilder = kind + .as_packed() + .map(|kind| packed::Config::new().match_kind(kind).builder()); + Builder { + count: 0, + ascii_case_insensitive: false, + start_bytes: StartBytesBuilder::new(), + rare_bytes: RareBytesBuilder::new(), + packed: pbuilder, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder { + self.ascii_case_insensitive = yes; + self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); + self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); + self + } + + /// Return a prefilter suitable for quickly finding potential matches. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + pub fn build(&self) -> Option { + // match (self.start_bytes.build(), self.rare_bytes.build()) { + match (self.start_bytes.build(), self.rare_bytes.build()) { + // If we could build both start and rare prefilters, then there are + // a few cases in which we'd want to use the start-byte prefilter + // over the rare-byte prefilter, since the former has lower + // overhead. + (prestart @ Some(_), prerare @ Some(_)) => { + // If the start-byte prefilter can scan for a smaller number + // of bytes than the rare-byte prefilter, then it's probably + // faster. + let has_fewer_bytes = + self.start_bytes.count < self.rare_bytes.count; + // Otherwise, if the combined frequency rank of the detected + // bytes in the start-byte prefilter is "close" to the combined + // frequency rank of the rare-byte prefilter, then we pick + // the start-byte prefilter even if the rare-byte prefilter + // heuristically searches for rare bytes. This is because the + // rare-byte prefilter has higher constant costs, so we tend to + // prefer the start-byte prefilter when we can. + let has_rarer_bytes = + self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; + if has_fewer_bytes || has_rarer_bytes { + prestart + } else { + prerare + } + } + (prestart @ Some(_), None) => prestart, + (None, prerare @ Some(_)) => prerare, + (None, None) if self.ascii_case_insensitive => None, + (None, None) => self + .packed + .as_ref() + .and_then(|b| b.build()) + .map(|s| PrefilterObj::new(Packed(s))), + } + } + + /// Add a literal string to this prefilter builder. + pub fn add(&mut self, bytes: &[u8]) { + self.count += 1; + self.start_bytes.add(bytes); + self.rare_bytes.add(bytes); + if let Some(ref mut pbuilder) = self.packed { + pbuilder.add(bytes); + } + } +} + +/// A type that wraps a packed searcher and implements the `Prefilter` +/// interface. +#[derive(Clone, Debug)] +struct Packed(packed::Searcher); + +impl Prefilter for Packed { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + self.0.heap_bytes() + } + + fn reports_false_positives(&self) -> bool { + false + } +} + +/// A builder for constructing a rare byte prefilter. +/// +/// A rare byte prefilter attempts to pick out a small set of rare bytes that +/// occurr in the patterns, and then quickly scan to matches of those rare +/// bytes. +#[derive(Clone, Debug)] +struct RareBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// A set of rare bytes, indexed by byte value. + rare_set: ByteSet, + /// A set of byte offsets associated with bytes in a pattern. An entry + /// corresponds to a particular bytes (its index) and is only non-zero if + /// the byte occurred at an offset greater than 0 in at least one pattern. + /// + /// If a byte's offset is not representable in 8 bits, then the rare bytes + /// prefilter becomes inert. + byte_offsets: RareByteOffsets, + /// Whether this is available as a prefilter or not. This can be set to + /// false during construction if a condition is seen that invalidates the + /// use of the rare-byte prefilter. + available: bool, + /// The number of bytes set to an active value in `byte_offsets`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +/// A set of bytes. +#[derive(Clone, Copy)] +struct ByteSet([bool; 256]); + +impl ByteSet { + fn empty() -> ByteSet { + ByteSet([false; 256]) + } + + fn insert(&mut self, b: u8) -> bool { + let new = !self.contains(b); + self.0[b as usize] = true; + new + } + + fn contains(&self, b: u8) -> bool { + self.0[b as usize] + } +} + +impl fmt::Debug for ByteSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut bytes = vec![]; + for b in 0..=255 { + if self.contains(b) { + bytes.push(b); + } + } + f.debug_struct("ByteSet").field("set", &bytes).finish() + } +} + +/// A set of byte offsets, keyed by byte. +#[derive(Clone, Copy)] +struct RareByteOffsets { + /// Each entry corresponds to the maximum offset of the corresponding + /// byte across all patterns seen. + set: [RareByteOffset; 256], +} + +impl RareByteOffsets { + /// Create a new empty set of rare byte offsets. + pub fn empty() -> RareByteOffsets { + RareByteOffsets { set: [RareByteOffset::default(); 256] } + } + + /// Add the given offset for the given byte to this set. If the offset is + /// greater than the existing offset, then it overwrites the previous + /// value and returns false. If there is no previous value set, then this + /// sets it and returns true. + pub fn set(&mut self, byte: u8, off: RareByteOffset) { + self.set[byte as usize].max = + cmp::max(self.set[byte as usize].max, off.max); + } +} + +impl fmt::Debug for RareByteOffsets { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut offsets = vec![]; + for off in self.set.iter() { + if off.max > 0 { + offsets.push(off); + } + } + f.debug_struct("RareByteOffsets").field("set", &offsets).finish() + } +} + +/// Offsets associated with an occurrence of a "rare" byte in any of the +/// patterns used to construct a single Aho-Corasick automaton. +#[derive(Clone, Copy, Debug)] +struct RareByteOffset { + /// The maximum offset at which a particular byte occurs from the start + /// of any pattern. This is used as a shift amount. That is, when an + /// occurrence of this byte is found, the candidate position reported by + /// the prefilter is `position_of_byte - max`, such that the automaton + /// will begin its search at a position that is guaranteed to observe a + /// match. + /// + /// To avoid accidentally quadratic behavior, a prefilter is considered + /// ineffective when it is asked to start scanning from a position that it + /// has already scanned past. + /// + /// Using a `u8` here means that if we ever see a pattern that's longer + /// than 255 bytes, then the entire rare byte prefilter is disabled. + max: u8, +} + +impl Default for RareByteOffset { + fn default() -> RareByteOffset { + RareByteOffset { max: 0 } + } +} + +impl RareByteOffset { + /// Create a new rare byte offset. If the given offset is too big, then + /// None is returned. In that case, callers should render the rare bytes + /// prefilter inert. + fn new(max: usize) -> Option { + if max > u8::MAX as usize { + None + } else { + Some(RareByteOffset { max: max as u8 }) + } + } +} + +impl RareBytesBuilder { + /// Create a new builder for constructing a rare byte prefilter. + fn new() -> RareBytesBuilder { + RareBytesBuilder { + ascii_case_insensitive: false, + rare_set: ByteSet::empty(), + byte_offsets: RareByteOffsets::empty(), + available: true, + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the rare bytes prefilter. + /// + /// If there are more than 3 distinct starting bytes, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. + fn build(&self) -> Option { + if !self.available || self.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..=255 { + if self.rare_set.contains(b) { + bytes[len] = b as u8; + len += 1; + } + } + match len { + 0 => None, + 1 => Some(PrefilterObj::new(RareBytesOne { + byte1: bytes[0], + offset: self.byte_offsets.set[bytes[0] as usize], + })), + 2 => Some(PrefilterObj::new(RareBytesTwo { + offsets: self.byte_offsets, + byte1: bytes[0], + byte2: bytes[1], + })), + 3 => Some(PrefilterObj::new(RareBytesThree { + offsets: self.byte_offsets, + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + })), + _ => unreachable!(), + } + } + + /// Add a byte string to this builder. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + // If we've already given up, then do nothing. + if !self.available { + return; + } + // If we've already blown our budget, then don't waste time looking + // for more rare bytes. + if self.count > 3 { + self.available = false; + return; + } + // If the pattern is too long, then our offset table is bunk, so + // give up. + if bytes.len() >= 256 { + self.available = false; + return; + } + let mut rarest = match bytes.get(0) { + None => return, + Some(&b) => (b, freq_rank(b)), + }; + // The idea here is to look for the rarest byte in each pattern, and + // add that to our set. As a special exception, if we see a byte that + // we've already added, then we immediately stop and choose that byte, + // even if there's another rare byte in the pattern. This helps us + // apply the rare byte optimization in more cases by attempting to pick + // bytes that are in common between patterns. So for example, if we + // were searching for `Sherlock` and `lockjaw`, then this would pick + // `k` for both patterns, resulting in the use of `memchr` instead of + // `memchr2` for `k` and `j`. + let mut found = false; + for (pos, &b) in bytes.iter().enumerate() { + self.set_offset(pos, b); + if found { + continue; + } + if self.rare_set.contains(b) { + found = true; + continue; + } + let rank = freq_rank(b); + if rank < rarest.1 { + rarest = (b, rank); + } + } + if !found { + self.add_rare_byte(rarest.0); + } + } + + fn set_offset(&mut self, pos: usize, byte: u8) { + // This unwrap is OK because pos is never bigger than our max. + let offset = RareByteOffset::new(pos).unwrap(); + self.byte_offsets.set(byte, offset); + if self.ascii_case_insensitive { + self.byte_offsets.set(opposite_ascii_case(byte), offset); + } + } + + fn add_rare_byte(&mut self, byte: u8) { + self.add_one_rare_byte(byte); + if self.ascii_case_insensitive { + self.add_one_rare_byte(opposite_ascii_case(byte)); + } + } + + fn add_one_rare_byte(&mut self, byte: u8) { + if self.rare_set.insert(byte) { + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single "rare" byte. +#[derive(Clone, Debug)] +struct RareBytesOne { + byte1: u8, + offset: RareByteOffset, +} + +impl Prefilter for RareBytesOne { + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr(self.byte1, &haystack[at..]) + .map(|i| { + let pos = at + i; + state.last_scan_at = pos; + cmp::max(at, pos.saturating_sub(self.offset.max as usize)) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: It should be possible to use a rare byte prefilter in a + // streaming context. The main problem is that we usually assume that + // if a prefilter has scanned some text and not found anything, then no + // match *starts* in that text. This doesn't matter in non-streaming + // contexts, but in a streaming context, if we're looking for a byte + // that doesn't start at the beginning of a match and don't find it, + // then it's still possible for a match to start at the end of the + // current buffer content. In order to fix this, the streaming searcher + // would need to become aware of prefilters that do this and use the + // appropriate offset in various places. It is quite a delicate change + // and probably shouldn't be attempted until streaming search has a + // better testing strategy. In particular, we'd really like to be able + // to vary the buffer size to force strange cases that occur at the + // edge of the buffer. If we make the buffer size minimal, then these + // cases occur more frequently and easier. + // + // This is also a bummer because this means that if the prefilter + // builder chose a rare byte prefilter, then a streaming search won't + // use any prefilter at all because the builder doesn't know how it's + // going to be used. Assuming we don't make streaming search aware of + // these special types of prefilters as described above, we could fix + // this by building a "backup" prefilter that could be used when the + // rare byte prefilter could not. But that's a bandaide. Sigh. + true + } +} + +/// A prefilter for scanning for two "rare" bytes. +#[derive(Clone, Debug)] +struct RareBytesTwo { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, +} + +impl Prefilter for RareBytesTwo { + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr2(self.byte1, self.byte2, &haystack[at..]) + .map(|i| { + let pos = at + i; + state.update_at(pos); + let offset = self.offsets.set[haystack[pos] as usize].max; + cmp::max(at, pos.saturating_sub(offset as usize)) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } +} + +/// A prefilter for scanning for three "rare" bytes. +#[derive(Clone, Debug)] +struct RareBytesThree { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, + byte3: u8, +} + +impl Prefilter for RareBytesThree { + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) + .map(|i| { + let pos = at + i; + state.update_at(pos); + let offset = self.offsets.set[haystack[pos] as usize].max; + cmp::max(at, pos.saturating_sub(offset as usize)) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } +} + +/// A builder for constructing a starting byte prefilter. +/// +/// A starting byte prefilter is a simplistic prefilter that looks for possible +/// matches by reporting all positions corresponding to a particular byte. This +/// generally only takes affect when there are at most 3 distinct possible +/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two +/// distinct starting bytes (`f` and `b`), and this prefilter returns all +/// occurrences of either `f` or `b`. +/// +/// In some cases, a heuristic frequency analysis may determine that it would +/// be better not to use this prefilter even when there are 3 or fewer distinct +/// starting bytes. +#[derive(Clone, Debug)] +struct StartBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// The set of starting bytes observed. + byteset: Vec, + /// The number of bytes set to true in `byteset`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +impl StartBytesBuilder { + /// Create a new builder for constructing a start byte prefilter. + fn new() -> StartBytesBuilder { + StartBytesBuilder { + ascii_case_insensitive: false, + byteset: vec![false; 256], + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the starting bytes prefilter. + /// + /// If there are more than 3 distinct starting bytes, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. + fn build(&self) -> Option { + if self.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..256 { + if !self.byteset[b] { + continue; + } + // We don't handle non-ASCII bytes for now. Getting non-ASCII + // bytes right is trickier, since we generally don't want to put + // a leading UTF-8 code unit into a prefilter that isn't ASCII, + // since they can frequently. Instead, it would be better to use a + // continuation byte, but this requires more sophisticated analysis + // of the automaton and a richer prefilter API. + if b > 0x7F { + return None; + } + bytes[len] = b as u8; + len += 1; + } + match len { + 0 => None, + 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })), + 2 => Some(PrefilterObj::new(StartBytesTwo { + byte1: bytes[0], + byte2: bytes[1], + })), + 3 => Some(PrefilterObj::new(StartBytesThree { + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + })), + _ => unreachable!(), + } + } + + /// Add a byte string to this builder. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + if self.count > 3 { + return; + } + if let Some(&byte) = bytes.get(0) { + self.add_one_byte(byte); + if self.ascii_case_insensitive { + self.add_one_byte(opposite_ascii_case(byte)); + } + } + } + + fn add_one_byte(&mut self, byte: u8) { + if !self.byteset[byte as usize] { + self.byteset[byte as usize] = true; + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single starting byte. +#[derive(Clone, Debug)] +struct StartBytesOne { + byte1: u8, +} + +impl Prefilter for StartBytesOne { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr(self.byte1, &haystack[at..]) + .map(|i| at + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } +} + +/// A prefilter for scanning for two starting bytes. +#[derive(Clone, Debug)] +struct StartBytesTwo { + byte1: u8, + byte2: u8, +} + +impl Prefilter for StartBytesTwo { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr2(self.byte1, self.byte2, &haystack[at..]) + .map(|i| at + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } +} + +/// A prefilter for scanning for three starting bytes. +#[derive(Clone, Debug)] +struct StartBytesThree { + byte1: u8, + byte2: u8, + byte3: u8, +} + +impl Prefilter for StartBytesThree { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) + .map(|i| at + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } +} + +/// Return the next candidate reported by the given prefilter while +/// simultaneously updating the given prestate. +/// +/// The caller is responsible for checking the prestate before deciding whether +/// to initiate a search. +#[inline] +pub fn next( + prestate: &mut PrefilterState, + prefilter: P, + haystack: &[u8], + at: usize, +) -> Candidate { + let cand = prefilter.next_candidate(prestate, haystack, at); + match cand { + Candidate::None => { + prestate.update_skipped_bytes(haystack.len() - at); + } + Candidate::Match(ref m) => { + prestate.update_skipped_bytes(m.start() - at); + } + Candidate::PossibleStartOfMatch(i) => { + prestate.update_skipped_bytes(i - at); + } + } + cand +} + +/// If the given byte is an ASCII letter, then return it in the opposite case. +/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns +/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. +pub fn opposite_ascii_case(b: u8) -> u8 { + if b'A' <= b && b <= b'Z' { + b.to_ascii_lowercase() + } else if b'a' <= b && b <= b'z' { + b.to_ascii_uppercase() + } else { + b + } +} + +/// Return the frequency rank of the given byte. The higher the rank, the more +/// common the byte (heuristically speaking). +fn freq_rank(b: u8) -> u8 { + use crate::byte_frequencies::BYTE_FREQUENCIES; + BYTE_FREQUENCIES[b as usize] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scratch() { + let mut b = Builder::new(MatchKind::LeftmostFirst); + b.add(b"Sherlock"); + b.add(b"locjaw"); + // b.add(b"Sherlock"); + // b.add(b"Holmes"); + // b.add(b"Watson"); + // b.add("Шерлок Холмс".as_bytes()); + // b.add("Джон Уотсон".as_bytes()); + + let s = b.build().unwrap(); + println!("{:?}", s); + } +} diff --git a/vendor/aho-corasick/src/state_id.rs b/vendor/aho-corasick/src/state_id.rs new file mode 100644 index 0000000000..8973806b29 --- /dev/null +++ b/vendor/aho-corasick/src/state_id.rs @@ -0,0 +1,192 @@ +use std::fmt::Debug; +use std::hash::Hash; + +use crate::error::{Error, Result}; + +// NOTE: Most of this code was copied from regex-automata, but without the +// (de)serialization specific stuff. + +/// Check that the premultiplication of the given state identifier can +/// fit into the representation indicated by `S`. If it cannot, or if it +/// overflows `usize` itself, then an error is returned. +pub fn premultiply_overflow_error( + last_state: S, + alphabet_len: usize, +) -> Result<()> { + let requested = match last_state.to_usize().checked_mul(alphabet_len) { + Some(requested) => requested, + None => return Err(Error::premultiply_overflow(0, 0)), + }; + if requested > S::max_id() { + return Err(Error::premultiply_overflow(S::max_id(), requested)); + } + Ok(()) +} + +/// Convert the given `usize` to the chosen state identifier +/// representation. If the given value cannot fit in the chosen +/// representation, then an error is returned. +pub fn usize_to_state_id(value: usize) -> Result { + if value > S::max_id() { + Err(Error::state_id_overflow(S::max_id())) + } else { + Ok(S::from_usize(value)) + } +} + +/// Return the unique identifier for an automaton's fail state in the chosen +/// representation indicated by `S`. +pub fn fail_id() -> S { + S::from_usize(0) +} + +/// Return the unique identifier for an automaton's fail state in the chosen +/// representation indicated by `S`. +pub fn dead_id() -> S { + S::from_usize(1) +} + +mod private { + /// Sealed stops crates other than aho-corasick from implementing any + /// traits that use it. + pub trait Sealed {} + impl Sealed for u8 {} + impl Sealed for u16 {} + impl Sealed for u32 {} + impl Sealed for u64 {} + impl Sealed for usize {} +} + +/// A trait describing the representation of an automaton's state identifier. +/// +/// The purpose of this trait is to safely express both the possible state +/// identifier representations that can be used in an automaton and to convert +/// between state identifier representations and types that can be used to +/// efficiently index memory (such as `usize`). +/// +/// In general, one should not need to implement this trait explicitly. Indeed, +/// for now, this trait is sealed such that it cannot be implemented by any +/// other type. In particular, this crate provides implementations for `u8`, +/// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for +/// targets that can represent all corresponding values in a `usize`.) +pub trait StateID: + private::Sealed + + Clone + + Copy + + Debug + + Eq + + Hash + + PartialEq + + PartialOrd + + Ord +{ + /// Convert from a `usize` to this implementation's representation. + /// + /// Implementors may assume that `n <= Self::max_id`. That is, implementors + /// do not need to check whether `n` can fit inside this implementation's + /// representation. + fn from_usize(n: usize) -> Self; + + /// Convert this implementation's representation to a `usize`. + /// + /// Implementors must not return a `usize` value greater than + /// `Self::max_id` and must not permit overflow when converting between the + /// implementor's representation and `usize`. In general, the preferred + /// way for implementors to achieve this is to simply not provide + /// implementations of `StateID` that cannot fit into the target platform's + /// `usize`. + fn to_usize(self) -> usize; + + /// Return the maximum state identifier supported by this representation. + /// + /// Implementors must return a correct bound. Doing otherwise may result + /// in unspecified behavior (but will not violate memory safety). + fn max_id() -> usize; +} + +impl StateID for usize { + #[inline] + fn from_usize(n: usize) -> usize { + n + } + + #[inline] + fn to_usize(self) -> usize { + self + } + + #[inline] + fn max_id() -> usize { + ::std::usize::MAX + } +} + +impl StateID for u8 { + #[inline] + fn from_usize(n: usize) -> u8 { + n as u8 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u8::MAX as usize + } +} + +impl StateID for u16 { + #[inline] + fn from_usize(n: usize) -> u16 { + n as u16 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u16::MAX as usize + } +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +impl StateID for u32 { + #[inline] + fn from_usize(n: usize) -> u32 { + n as u32 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u32::MAX as usize + } +} + +#[cfg(target_pointer_width = "64")] +impl StateID for u64 { + #[inline] + fn from_usize(n: usize) -> u64 { + n as u64 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u64::MAX as usize + } +} diff --git a/vendor/aho-corasick/src/tests.rs b/vendor/aho-corasick/src/tests.rs new file mode 100644 index 0000000000..20cd3d15dd --- /dev/null +++ b/vendor/aho-corasick/src/tests.rs @@ -0,0 +1,1254 @@ +use std::collections::HashMap; +use std::io; +use std::usize; + +use crate::{AhoCorasickBuilder, Match, MatchKind}; + +/// A description of a single test against an Aho-Corasick automaton. +/// +/// A single test may not necessarily pass on every configuration of an +/// Aho-Corasick automaton. The tests are categorized and grouped appropriately +/// below. +#[derive(Clone, Debug, Eq, PartialEq)] +struct SearchTest { + /// The name of this test, for debugging. + name: &'static str, + /// The patterns to search for. + patterns: &'static [&'static str], + /// The text to search. + haystack: &'static str, + /// Each match is a triple of (pattern_index, start, end), where + /// pattern_index is an index into `patterns` and `start`/`end` are indices + /// into `haystack`. + matches: &'static [(usize, usize, usize)], +} + +/// Short-hand constructor for SearchTest. We use it a lot below. +macro_rules! t { + ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { + SearchTest { + name: stringify!($name), + patterns: $patterns, + haystack: $haystack, + matches: $matches, + } + }; +} + +/// A collection of test groups. +type TestCollection = &'static [&'static [SearchTest]]; + +// Define several collections corresponding to the different type of match +// semantics supported by Aho-Corasick. These collections have some overlap, +// but each collection should have some tests that no other collection has. + +/// Tests for Aho-Corasick's standard non-overlapping match semantics. +const AC_STANDARD_NON_OVERLAPPING: TestCollection = + &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION]; + +/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics. +const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection = + &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED]; + +/// Tests for Aho-Corasick's standard overlapping match semantics. +const AC_STANDARD_OVERLAPPING: TestCollection = + &[BASICS, OVERLAPPING, REGRESSION]; + +/// Tests for Aho-Corasick's anchored standard overlapping match semantics. +const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection = + &[ANCHORED_BASICS, ANCHORED_OVERLAPPING]; + +/// Tests for Aho-Corasick's leftmost-first match semantics. +const AC_LEFTMOST_FIRST: TestCollection = + &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION]; + +/// Tests for Aho-Corasick's anchored leftmost-first match semantics. +const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[ + ANCHORED_BASICS, + ANCHORED_NON_OVERLAPPING, + ANCHORED_LEFTMOST, + ANCHORED_LEFTMOST_FIRST, +]; + +/// Tests for Aho-Corasick's leftmost-longest match semantics. +const AC_LEFTMOST_LONGEST: TestCollection = + &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION]; + +/// Tests for Aho-Corasick's anchored leftmost-longest match semantics. +const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[ + ANCHORED_BASICS, + ANCHORED_NON_OVERLAPPING, + ANCHORED_LEFTMOST, + ANCHORED_LEFTMOST_LONGEST, +]; + +// Now define the individual tests that make up the collections above. + +/// A collection of tests for the Aho-Corasick algorithm that should always be +/// true regardless of match semantics. That is, all combinations of +/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} +/// should produce the same answer. +const BASICS: &'static [SearchTest] = &[ + t!(basic000, &[], "", &[]), + t!(basic001, &["a"], "", &[]), + t!(basic010, &["a"], "a", &[(0, 0, 1)]), + t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), + t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), + t!(basic050, &["a"], "bba", &[(0, 2, 3)]), + t!(basic060, &["a"], "bbb", &[]), + t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), + t!(basic100, &["aa"], "", &[]), + t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), + t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), + t!(basic130, &["aa"], "abbab", &[]), + t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), + t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), + t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), + t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), + t!(basic300, &["a", "b"], "", &[]), + t!(basic310, &["a", "b"], "z", &[]), + t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), + t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), + t!( + basic340, + &["a", "b"], + "abba", + &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] + ), + t!( + basic350, + &["b", "a"], + "abba", + &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] + ), + t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), + t!(basic400, &["foo", "bar"], "", &[]), + t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), + t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), + t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), + t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), + t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), + t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), + t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), + t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), + t!(basic600, &[""], "", &[(0, 0, 0)]), + t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), + t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), + t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), + t!( + basic720, + &["yabcdef", "bcdeyabc", "abcdezghi"], + "yabcdezghi", + &[(2, 1, 10),] + ), +]; + +/// A collection of *anchored* tests for the Aho-Corasick algorithm that should +/// always be true regardless of match semantics. That is, all combinations of +/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should +/// produce the same answer. +const ANCHORED_BASICS: &'static [SearchTest] = &[ + t!(abasic000, &[], "", &[]), + t!(abasic010, &[""], "", &[(0, 0, 0)]), + t!(abasic020, &[""], "a", &[(0, 0, 0)]), + t!(abasic030, &[""], "abc", &[(0, 0, 0)]), + t!(abasic100, &["a"], "a", &[(0, 0, 1)]), + t!(abasic110, &["a"], "aa", &[(0, 0, 1)]), + t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1)]), + t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1)]), + t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]), + t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]), +]; + +/// Tests for non-overlapping standard match semantics. +/// +/// These tests generally shouldn't pass for leftmost-{first,longest}, although +/// some do in order to write clearer tests. For example, standard000 will +/// pass with leftmost-first semantics, but standard010 will not. We write +/// both to emphasize how the match semantics work. +const STANDARD: &'static [SearchTest] = &[ + t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), + t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), + t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), + t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), + t!( + standard400, + &["abcd", "bcd", "cd", "b"], + "abcd", + &[(3, 1, 2), (2, 2, 4),] + ), + t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]), + t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]), + t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), +]; + +/// Like STANDARD, but for anchored searches. +const STANDARD_ANCHORED: &'static [SearchTest] = &[ + t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), + t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), + t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), + t!(astandard040, &["a", ""], "a", &[(1, 0, 0)]), + t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(astandard410, &["", "a"], "a", &[(0, 0, 0)]), + t!(astandard420, &["", "a"], "aa", &[(0, 0, 0)]), + t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0)]), + t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0)]), + t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0)]), +]; + +/// Tests for non-overlapping leftmost match semantics. These should pass for +/// both leftmost-first and leftmost-longest match kinds. Stated differently, +/// among ambiguous matches, the longest match and the match that appeared +/// first when constructing the automaton should always be the same. +const LEFTMOST: &'static [SearchTest] = &[ + t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftmost010, &["a", ""], "a", &[(0, 0, 1), (1, 1, 1)]), + t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), + t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), + t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), + t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + leftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8), (0, 8, 9),] + ), +]; + +/// Like LEFTMOST, but for anchored searches. +const ANCHORED_LEFTMOST: &'static [SearchTest] = &[ + t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]), + t!(aleftmost020, &["", ""], "a", &[(0, 0, 0)]), + t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1)]), + t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1)]), + t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]), + t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]), + t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]), + t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]), + t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]), + t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + aleftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + aleftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + aleftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8)] + ), +]; + +/// Tests for non-overlapping leftmost-first match semantics. These tests +/// should generally be specific to leftmost-first, which means they should +/// generally fail under leftmost-longest semantics. +const LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]), + t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), + t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), + t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), + t!( + leftfirst310, + &["abcd", "b", "bce", "ce"], + "abce", + &[(1, 1, 2), (3, 2, 4),] + ), + t!( + leftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1), (2, 7, 9),] + ), + t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), + t!(leftfirst400, &["amwix", "samwise", "sam"], "Zsamwix", &[(2, 1, 4)]), +]; + +/// Like LEFTMOST_FIRST, but for anchored searches. +const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0)]), + t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0)]), + t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]), + t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0)]), + t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]), + t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), + t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), + t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]), + t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]), + t!( + aleftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1)] + ), + t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]), + t!(aleftfirst400, &["wise", "samwise", "sam"], "samwix", &[(2, 0, 3)]), +]; + +/// Tests for non-overlapping leftmost-longest match semantics. These tests +/// should generally be specific to leftmost-longest, which means they should +/// generally fail under leftmost-first semantics. +const LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(leftlong020, &["", "a"], "a", &[(1, 0, 1), (0, 1, 1),]), + t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1), (0, 1, 1),]), + t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]), + t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1), (0, 1, 1),]), + t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2), (0, 2, 2),]), + t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), + t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), + t!( + leftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), + t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), +]; + +/// Like LEFTMOST_LONGEST, but for anchored searches. +const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]), + t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]), + t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]), + t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]), + t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1)]), + t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), + t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), + t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]), + t!( + aleftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]), + t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]), +]; + +/// Tests for non-overlapping match semantics. +/// +/// Generally these tests shouldn't pass when using overlapping semantics. +/// These should pass for both standard and leftmost match semantics. +const NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), + t!( + nover100, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] + ), + t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), + t!(nover300, &["", ""], "", &[(0, 0, 0),]), + t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]), +]; + +/// Like NON_OVERLAPPING, but for anchored searches. +const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(anover030, &["abc", "bc"], "zazabcz", &[]), + t!(anover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), + t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]), + t!(anover300, &["", ""], "", &[(0, 0, 0),]), + t!(anover310, &["", ""], "a", &[(0, 0, 0)]), +]; + +/// Tests for overlapping match semantics. +/// +/// This only supports standard match semantics, since leftmost-{first,longest} +/// do not support overlapping matches. +const OVERLAPPING: &'static [SearchTest] = &[ + t!( + over000, + &["abcd", "bcd", "cd", "b"], + "abcd", + &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),] + ), + t!( + over010, + &["bcd", "cd", "b", "abcd"], + "abcd", + &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),] + ), + t!( + over020, + &["abcd", "bcd", "cd"], + "abcd", + &[(0, 0, 4), (1, 1, 4), (2, 2, 4),] + ), + t!( + over030, + &["bcd", "abcd", "cd"], + "abcd", + &[(1, 0, 4), (0, 1, 4), (2, 2, 4),] + ), + t!( + over040, + &["bcd", "cd", "abcd"], + "abcd", + &[(2, 0, 4), (0, 1, 4), (1, 2, 4),] + ), + t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]), + t!( + over100, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),] + ), + t!( + over200, + &["foo", "foo"], + "foobarfoo", + &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),] + ), + t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), + t!( + over310, + &["", ""], + "a", + &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),] + ), + t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]), + t!( + over330, + &["", "a", ""], + "a", + &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),] + ), + t!( + over340, + &["a", "", ""], + "a", + &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),] + ), + t!( + over350, + &["", "", "a"], + "a", + &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),] + ), + t!( + over360, + &["foo", "foofoo"], + "foofoo", + &[(0, 0, 3), (1, 0, 6), (0, 3, 6)] + ), +]; + +/// Like OVERLAPPING, but for anchored searches. +const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[ + t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]), + t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]), + t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]), + t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]), + t!(aover050, &["abc", "bc"], "zazabcz", &[]), + t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), + t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]), + t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), + t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]), + t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]), + t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]), + t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]), + t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]), + t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]), +]; + +/// Tests for ASCII case insensitivity. +/// +/// These tests should all have the same behavior regardless of match semantics +/// or whether the search is overlapping. +const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[ + t!(acasei000, &["a"], "A", &[(0, 0, 1)]), + t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]), + t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]), + t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests. +const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]), + t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]), + t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests. +const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + // This is a regression test from: + // https://github.com/BurntSushi/aho-corasick/issues/68 + // Previously, it was reporting a duplicate (1, 3, 6) match. + t!( + acasei010, + &["abc", "def", "abcdef"], + "abcdef", + &[(0, 0, 3), (2, 0, 6), (1, 3, 6)] + ), +]; + +/// Regression tests that are applied to all Aho-Corasick combinations. +/// +/// If regression tests are needed for specific match semantics, then add them +/// to the appropriate group above. +const REGRESSION: &'static [SearchTest] = &[ + t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), + t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), + t!( + regression030, + &["libcore/", "libstd/"], + "libcore/char/methods.rs", + &[(0, 0, 8),] + ), + t!( + regression040, + &["libstd/", "libcore/"], + "libcore/char/methods.rs", + &[(1, 0, 8),] + ), + t!( + regression050, + &["\x00\x00\x01", "\x00\x00\x00"], + "\x00\x00\x00", + &[(1, 0, 3),] + ), + t!( + regression060, + &["\x00\x00\x00", "\x00\x00\x01"], + "\x00\x00\x00", + &[(0, 0, 3),] + ), +]; + +// Now define a test for each combination of things above that we want to run. +// Since there are a few different combinations for each collection of tests, +// we define a couple of macros to avoid repetition drudgery. The testconfig +// macro constructs the automaton from a given match kind, and runs the search +// tests one-by-one over the given collection. The `with` parameter allows one +// to configure the builder with additional parameters. The testcombo macro +// invokes testconfig in precisely this way: it sets up several tests where +// each one turns a different knob on AhoCorasickBuilder. + +macro_rules! testconfig { + (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasickBuilder::new(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .find_overlapping_iter(test.haystack) + .collect() + }); + } + }; + (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let buf = + io::BufReader::with_capacity(1, test.haystack.as_bytes()); + let mut builder = AhoCorasickBuilder::new(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .stream_find_iter(buf) + .map(|result| result.unwrap()) + .collect() + }); + } + }; + ($name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasickBuilder::new(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .find_iter(test.haystack) + .collect() + }); + } + }; +} + +macro_rules! testcombo { + ($name:ident, $collection:expr, $kind:ident) => { + mod $name { + use super::*; + + testconfig!(nfa_default, $collection, $kind, |_| ()); + testconfig!( + nfa_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.prefilter(false); + } + ); + testconfig!( + nfa_all_sparse, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(0); + } + ); + testconfig!( + nfa_all_dense, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(usize::MAX); + } + ); + testconfig!( + dfa_default, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true); + } + ); + testconfig!( + dfa_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).prefilter(false); + } + ); + testconfig!( + dfa_all_sparse, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(0); + } + ); + testconfig!( + dfa_all_dense, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(usize::MAX); + } + ); + testconfig!( + dfa_no_byte_class, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false); + } + ); + testconfig!( + dfa_no_premultiply, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).premultiply(false); + } + ); + testconfig!( + dfa_no_byte_class_no_premultiply, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when options are removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false).premultiply(false); + } + ); + } + }; +} + +// Write out the combinations. +testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest); +testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst); +testcombo!( + search_standard_nonoverlapping, + AC_STANDARD_NON_OVERLAPPING, + Standard +); + +// Write out the overlapping combo by hand since there is only one of them. +testconfig!( + overlapping, + search_standard_overlapping_nfa_default, + AC_STANDARD_OVERLAPPING, + Standard, + |_| () +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_all_sparse, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(0); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_all_dense, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(usize::MAX); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_default, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_all_sparse, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(0); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_all_dense, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(usize::MAX); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_byte_class, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_premultiply, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).premultiply(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_byte_class_no_premultiply, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when options are removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false).premultiply(false); + } +); + +// Also write out tests manually for streams, since we only test the standard +// match semantics. We also don't bother testing different automaton +// configurations, since those are well covered by tests above. +testconfig!( + stream, + search_standard_stream_nfa_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |_| () +); +testconfig!( + stream, + search_standard_stream_dfa_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true); + } +); + +// Same thing for anchored searches. Write them out manually. +testconfig!( + search_standard_anchored_nfa_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + search_standard_anchored_dfa_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); +testconfig!( + overlapping, + search_standard_anchored_overlapping_nfa_default, + AC_STANDARD_ANCHORED_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + overlapping, + search_standard_anchored_overlapping_dfa_default, + AC_STANDARD_ANCHORED_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); +testconfig!( + search_leftmost_first_anchored_nfa_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + search_leftmost_first_anchored_dfa_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); +testconfig!( + search_leftmost_longest_anchored_nfa_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + search_leftmost_longest_anchored_dfa_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); + +// And also write out the test combinations for ASCII case insensitivity. +testconfig!( + acasei_standard_nfa_default, + &[ASCII_CASE_INSENSITIVE], + Standard, + |b: &mut AhoCorasickBuilder| { + b.prefilter(false).ascii_case_insensitive(true); + } +); +testconfig!( + acasei_standard_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_nfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + overlapping, + acasei_standard_overlapping_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], + Standard, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); +testconfig!( + acasei_leftmost_first_nfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_first_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); +testconfig!( + acasei_leftmost_longest_nfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true); + } +); +testconfig!( + acasei_leftmost_longest_dfa_default, + &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.ascii_case_insensitive(true).dfa(true); + } +); + +fn run_search_tests Vec>( + which: TestCollection, + mut f: F, +) { + let get_match_triples = + |matches: Vec| -> Vec<(usize, usize, usize)> { + matches + .into_iter() + .map(|m| (m.pattern(), m.start(), m.end())) + .collect() + }; + for &tests in which { + for test in tests { + assert_eq!( + test.matches, + get_match_triples(f(&test)).as_slice(), + "test: {}, patterns: {:?}, haystack: {:?}", + test.name, + test.patterns, + test.haystack + ); + } + } +} + +#[test] +fn search_tests_have_unique_names() { + let assert = |constname, tests: &[SearchTest]| { + let mut seen = HashMap::new(); // map from test name to position + for (i, test) in tests.iter().enumerate() { + if !seen.contains_key(test.name) { + seen.insert(test.name, i); + } else { + let last = seen[test.name]; + panic!( + "{} tests have duplicate names at positions {} and {}", + constname, last, i + ); + } + } + }; + assert("BASICS", BASICS); + assert("STANDARD", STANDARD); + assert("LEFTMOST", LEFTMOST); + assert("LEFTMOST_FIRST", LEFTMOST_FIRST); + assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); + assert("NON_OVERLAPPING", NON_OVERLAPPING); + assert("OVERLAPPING", OVERLAPPING); + assert("REGRESSION", REGRESSION); +} + +#[test] +#[should_panic] +fn stream_not_allowed_leftmost_first() { + let fsm = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .build(None::); + assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); +} + +#[test] +#[should_panic] +fn stream_not_allowed_leftmost_longest() { + let fsm = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostLongest) + .build(None::); + assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); +} + +#[test] +#[should_panic] +fn overlapping_not_allowed_leftmost_first() { + let fsm = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .build(None::); + assert_eq!(fsm.find_overlapping_iter("").count(), 0); +} + +#[test] +#[should_panic] +fn overlapping_not_allowed_leftmost_longest() { + let fsm = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostLongest) + .build(None::); + assert_eq!(fsm.find_overlapping_iter("").count(), 0); +} + +#[test] +fn state_id_too_small() { + let mut patterns = vec![]; + for c1 in (b'a'..b'z').map(|b| b as char) { + for c2 in (b'a'..b'z').map(|b| b as char) { + for c3 in (b'a'..b'z').map(|b| b as char) { + patterns.push(format!("{}{}{}", c1, c2, c3)); + } + } + } + let result = + AhoCorasickBuilder::new().build_with_size::(&patterns); + assert!(result.is_err()); +} + +// See: https://github.com/BurntSushi/aho-corasick/issues/44 +// +// In short, this test ensures that enabling ASCII case insensitivity does not +// visit an exponential number of states when filling in failure transitions. +#[test] +fn regression_ascii_case_insensitive_no_exponential() { + let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"]); + assert!(ac.find("").is_none()); +} + +// See: https://github.com/BurntSushi/aho-corasick/issues/53 +// +// This test ensures that the rare byte prefilter works in a particular corner +// case. In particular, the shift offset detected for '/' in the patterns below +// was incorrect, leading to a false negative. +#[test] +fn regression_rare_byte_prefilter() { + use crate::AhoCorasick; + + let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]); + assert!(ac.is_match("ab/j/")); +} + +#[test] +fn regression_case_insensitive_prefilter() { + use crate::AhoCorasickBuilder; + + for c in b'a'..b'z' { + for c2 in b'a'..b'z' { + let c = c as char; + let c2 = c2 as char; + let needle = format!("{}{}", c, c2).to_lowercase(); + let haystack = needle.to_uppercase(); + let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .prefilter(true) + .build(&[&needle]); + assert_eq!( + 1, + ac.find_iter(&haystack).count(), + "failed to find {:?} in {:?}\n\nautomaton:\n{:?}", + needle, + haystack, + ac, + ); + } + } +} + +// See: https://github.com/BurntSushi/aho-corasick/issues/64 +// +// This occurs when the rare byte prefilter is active. +#[test] +fn regression_stream_rare_byte_prefilter() { + use std::io::Read; + + // NOTE: The test only fails if this ends with j. + const MAGIC: [u8; 5] = *b"1234j"; + + // NOTE: The test fails for value in 8188..=8191 These value put the string + // to search accross two call to read because the buffer size is 8192 by + // default. + const BEGIN: usize = 8191; + + /// This is just a structure that implements Reader. The reader + /// implementation will simulate a file filled with 0, except for the MAGIC + /// string at offset BEGIN. + #[derive(Default)] + struct R { + read: usize, + } + + impl Read for R { + fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result { + //dbg!(buf.len()); + if self.read > 100000 { + return Ok(0); + } + let mut from = 0; + if self.read < BEGIN { + from = buf.len().min(BEGIN - self.read); + for x in 0..from { + buf[x] = 0; + } + self.read += from; + } + if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() { + let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from); + if to > from { + buf[from..to].copy_from_slice( + &MAGIC + [self.read - BEGIN..self.read - BEGIN + to - from], + ); + self.read += to - from; + from = to; + } + } + for x in from..buf.len() { + buf[x] = 0; + self.read += 1; + } + Ok(buf.len()) + } + } + + fn run() -> ::std::io::Result<()> { + let aut = AhoCorasickBuilder::new().build(&[&MAGIC]); + + // While reading from a vector, it works: + let mut buf = vec![]; + R::default().read_to_end(&mut buf)?; + let from_whole = aut.find_iter(&buf).next().unwrap().start(); + + //But using stream_find_iter fails! + let mut file = R::default(); + let begin = aut + .stream_find_iter(&mut file) + .next() + .expect("NOT FOUND!!!!")? // Panic here + .start(); + assert_eq!(from_whole, begin); + Ok(()) + } + + run().unwrap() +} diff --git a/vendor/regex-syntax/.cargo-checksum.json b/vendor/regex-syntax/.cargo-checksum.json new file mode 100644 index 0000000000..09ad1af6c0 --- /dev/null +++ b/vendor/regex-syntax/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"64cd89fcae533a9869fd71526374a61adeb14a0e4fd29eb8b81d6744525a817c","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"67a3e673a9da6826fd4db5be6902841c821b52b98dc22c300f6e327872392b0a","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"a306c08610968d850a4666a9775fb7e8cf6646c02bab766440e6a93ab2e38f58","src/ast/parse.rs":"150b42e944f766fdca70d654dbe32f8a17498432729c78b9eb50b73ae7f91f86","src/ast/print.rs":"d12f2cc75cd62f35623e1eb7a77ab8ac804b971752082700d2c4f550f834b249","src/ast/visitor.rs":"1a7b473147e4f6b89623ef1744a9e87f665bcf160fe08a33ce8e35011811ba71","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"b3c5903a8937d2aff229a3ec65d4571d01ec4d9874c9a242ed6562c32702bcbd","src/hir/interval.rs":"e767fed363bebe4bbda0d78b8f07e73f321eaf4f837e2d7bd14a1617387e9a89","src/hir/literal/mod.rs":"ffe9a0aff7827f97bffd29eb2f4ba96627b16953161dce6c50a2f760e76bbd98","src/hir/mod.rs":"3a6d4e0b9ec221a98fead13ba885c8a1efbbd5601e6445a2e928aa37e7716549","src/hir/print.rs":"651b5d9776532a78612a5f9081372a57bad693890639ac19e3128b4defa96662","src/hir/translate.rs":"c7cd9693f73760263fd49a968714d27e7985ebe840211b2d83bca6686b0602a8","src/hir/visitor.rs":"e5bf7f8c09f6155e59c9d676fe25437f7e3700f9bf5d91101d7e246a64c11d5a","src/lib.rs":"a004f65196dd5745b3112e4acc8c467b18495cecac64a58d6608b35de67371cb","src/parser.rs":"0dfb553a152e008b2755f115663e553ed99c4b8e6a4dcbcad1662737534de49d","src/unicode.rs":"2ad48193433fefbede0837bd645f4288f6b39b1facb59dbb7d541bce7bf19109","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238eff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f9666187","src/utf8.rs":"de854b3bfb3f7dbefc422f6a25935aaeef55ead2c35386c712a1fe9bf81a7b6f","test":"8a9bd1bd9fb389e08288f951319a9bbb0d4c5284a2ba63cbdab7f6afa2c2f76e"},"package":"f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"} \ No newline at end of file diff --git a/vendor/regex-syntax/Cargo.toml b/vendor/regex-syntax/Cargo.toml new file mode 100644 index 0000000000..72e985621f --- /dev/null +++ b/vendor/regex-syntax/Cargo.toml @@ -0,0 +1,41 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "regex-syntax" +version = "0.6.29" +authors = ["The Rust Project Developers"] +description = "A regular expression parser." +homepage = "https://github.com/rust-lang/regex" +documentation = "https://docs.rs/regex-syntax" +readme = "README.md" +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/regex" + +[features] +default = ["unicode"] +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", +] +unicode-age = [] +unicode-bool = [] +unicode-case = [] +unicode-gencat = [] +unicode-perl = [] +unicode-script = [] +unicode-segment = [] diff --git a/vendor/regex-syntax/LICENSE-APACHE b/vendor/regex-syntax/LICENSE-APACHE new file mode 100644 index 0000000000..16fe87b06e --- /dev/null +++ b/vendor/regex-syntax/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/regex-syntax/LICENSE-MIT b/vendor/regex-syntax/LICENSE-MIT new file mode 100644 index 0000000000..39d4bdb5ac --- /dev/null +++ b/vendor/regex-syntax/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/vendor/regex-syntax/README.md b/vendor/regex-syntax/README.md new file mode 100644 index 0000000000..592f842686 --- /dev/null +++ b/vendor/regex-syntax/README.md @@ -0,0 +1,98 @@ +regex-syntax +============ +This crate provides a robust regular expression parser. + +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) +[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) +[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) + + +### Documentation + +https://docs.rs/regex-syntax + + +### Overview + +There are two primary types exported by this crate: `Ast` and `Hir`. The former +is a faithful abstract syntax of a regular expression, and can convert regular +expressions back to their concrete syntax while mostly preserving its original +form. The latter type is a high level intermediate representation of a regular +expression that is amenable to analysis and compilation into byte codes or +automata. An `Hir` achieves this by drastically simplifying the syntactic +structure of the regular expression. While an `Hir` can be converted back to +its equivalent concrete syntax, the result is unlikely to resemble the original +concrete syntax that produced the `Hir`. + + +### Example + +This example shows how to parse a pattern string into its HIR: + +```rust +use regex_syntax::Parser; +use regex_syntax::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +### Safety + +This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's +possible this crate could use `unsafe` code in the future, the standard +for doing so is extremely high. In general, most code in this crate is not +performance critical, since it tends to be dwarfed by the time it takes to +compile a regular expression into an automaton. Therefore, there is little need +for extreme optimization, and therefore, use of `unsafe`. + +The standard for using `unsafe` in this crate is extremely high because this +crate is intended to be reasonably safe to use with user supplied regular +expressions. Therefore, while there may be bugs in the regex parser itself, +they should _never_ result in memory unsafety unless there is either a bug +in the compiler or the standard library. (Since `regex-syntax` has zero +dependencies.) + + +### Crate features + +By default, this crate bundles a fairly large amount of Unicode data tables +(a source size of ~750KB). Because of their large size, one can disable some +or all of these data tables. If a regular expression attempts to use Unicode +data that is not available, then an error will occur when translating the `Ast` +to the `Hir`. + +The full set of features one can disable are +[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features). + + +### Testing + +Simply running `cargo test` will give you very good coverage. However, because +of the large number of features exposed by this crate, a `test` script is +included in this directory which will test several feature combinations. This +is the same script that is run in CI. + + +### Motivation + +The primary purpose of this crate is to provide the parser used by `regex`. +Specifically, this crate is treated as an implementation detail of the `regex`, +and is primarily developed for the needs of `regex`. + +Since this crate is an implementation detail of `regex`, it may experience +breaking change releases at a different cadence from `regex`. This is only +possible because this crate is _not_ a public dependency of `regex`. + +Another consequence of this de-coupling is that there is no direct way to +compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must +first convert the `Hir` to a string (via its `std::fmt::Display`) and then +compile that via `Regex::new`. While this does repeat some work, compilation +typically takes much longer than parsing. + +Stated differently, the coupling between `regex` and `regex-syntax` exists only +at the level of the concrete syntax. diff --git a/vendor/regex-syntax/benches/bench.rs b/vendor/regex-syntax/benches/bench.rs new file mode 100644 index 0000000000..d4703d4fc1 --- /dev/null +++ b/vendor/regex-syntax/benches/bench.rs @@ -0,0 +1,63 @@ +#![feature(test)] + +extern crate test; + +use regex_syntax::Parser; +use test::Bencher; + +#[bench] +fn parse_simple1(b: &mut Bencher) { + b.iter(|| { + let re = r"^bc(d|e)*$"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_simple2(b: &mut Bencher) { + b.iter(|| { + let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_small1(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}|\p{N}|\s|.|\d"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium1(b: &mut Bencher) { + b.iter(|| { + let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium2(b: &mut Bencher) { + b.iter(|| { + let re = r"\s\S\w\W\d\D"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_medium3(b: &mut Bencher) { + b.iter(|| { + let re = + r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL"; + Parser::new().parse(re).unwrap() + }); +} + +#[bench] +fn parse_huge(b: &mut Bencher) { + b.iter(|| { + let re = r"\p{L}{100}"; + Parser::new().parse(re).unwrap() + }); +} diff --git a/vendor/regex-syntax/src/ast/mod.rs b/vendor/regex-syntax/src/ast/mod.rs new file mode 100644 index 0000000000..9db9afaf17 --- /dev/null +++ b/vendor/regex-syntax/src/ast/mod.rs @@ -0,0 +1,1513 @@ +/*! +Defines an abstract syntax for regular expressions. +*/ + +use std::cmp::Ordering; +use std::error; +use std::fmt; + +pub use crate::ast::visitor::{visit, Visitor}; + +pub mod parse; +pub mod print; +mod visitor; + +/// An error that occurred while parsing a regular expression into an abstract +/// syntax tree. +/// +/// Note that not all ASTs represents a valid regular expression. For example, +/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a +/// valid Unicode property name. That particular error is reported when +/// translating an AST to the high-level intermediate representation (`HIR`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the parser generated the error from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } + + /// Return an auxiliary span. This span exists only for some errors that + /// benefit from being able to point to two locations in the original + /// regular expression. For example, "duplicate" errors will have the + /// main error position set to the duplicate occurrence while its + /// auxiliary span will be set to the initial occurrence. + pub fn auxiliary_span(&self) -> Option<&Span> { + use self::ErrorKind::*; + match self.kind { + FlagDuplicate { ref original } => Some(original), + FlagRepeatedNegation { ref original, .. } => Some(original), + GroupNameDuplicate { ref original, .. } => Some(original), + _ => None, + } + } +} + +/// The type of an error that occurred while building an AST. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// The capturing group limit was exceeded. + /// + /// Note that this represents a limit on the total number of capturing + /// groups in a regex and not necessarily the number of nested capturing + /// groups. That is, the nest limit can be low and it is still possible for + /// this error to occur. + CaptureLimitExceeded, + /// An invalid escape sequence was found in a character class set. + ClassEscapeInvalid, + /// An invalid character class range was found. An invalid range is any + /// range where the start is greater than the end. + ClassRangeInvalid, + /// An invalid range boundary was found in a character class. Range + /// boundaries must be a single literal codepoint, but this error indicates + /// that something else was found, such as a nested class. + ClassRangeLiteral, + /// An opening `[` was found with no corresponding closing `]`. + ClassUnclosed, + /// Note that this error variant is no longer used. Namely, a decimal + /// number can only appear as a repetition quantifier. When the number + /// in a repetition quantifier is empty, then it gets its own specialized + /// error, `RepetitionCountDecimalEmpty`. + DecimalEmpty, + /// An invalid decimal number was given where one was expected. + DecimalInvalid, + /// A bracketed hex literal was empty. + EscapeHexEmpty, + /// A bracketed hex literal did not correspond to a Unicode scalar value. + EscapeHexInvalid, + /// An invalid hexadecimal digit was found. + EscapeHexInvalidDigit, + /// EOF was found before an escape sequence was completed. + EscapeUnexpectedEof, + /// An unrecognized escape sequence. + EscapeUnrecognized, + /// A dangling negation was used when setting flags, e.g., `i-`. + FlagDanglingNegation, + /// A flag was used twice, e.g., `i-i`. + FlagDuplicate { + /// The position of the original flag. The error position + /// points to the duplicate flag. + original: Span, + }, + /// The negation operator was used twice, e.g., `-i-s`. + FlagRepeatedNegation { + /// The position of the original negation operator. The error position + /// points to the duplicate negation operator. + original: Span, + }, + /// Expected a flag but got EOF, e.g., `(?`. + FlagUnexpectedEof, + /// Unrecognized flag, e.g., `a`. + FlagUnrecognized, + /// A duplicate capture name was found. + GroupNameDuplicate { + /// The position of the initial occurrence of the capture name. The + /// error position itself points to the duplicate occurrence. + original: Span, + }, + /// A capture group name is empty, e.g., `(?P<>abc)`. + GroupNameEmpty, + /// An invalid character was seen for a capture group name. This includes + /// errors where the first character is a digit (even though subsequent + /// characters are allowed to be digits). + GroupNameInvalid, + /// A closing `>` could not be found for a capture group name. + GroupNameUnexpectedEof, + /// An unclosed group, e.g., `(ab`. + /// + /// The span of this error corresponds to the unclosed parenthesis. + GroupUnclosed, + /// An unopened group, e.g., `ab)`. + GroupUnopened, + /// The nest limit was exceeded. The limit stored here is the limit + /// configured in the parser. + NestLimitExceeded(u32), + /// The range provided in a counted repetition operator is invalid. The + /// range is invalid if the start is greater than the end. + RepetitionCountInvalid, + /// An opening `{` was not followed by a valid decimal value. + /// For example, `x{}` or `x{]}` would fail. + RepetitionCountDecimalEmpty, + /// An opening `{` was found with no corresponding closing `}`. + RepetitionCountUnclosed, + /// A repetition operator was applied to a missing sub-expression. This + /// occurs, for example, in the regex consisting of just a `*` or even + /// `(?i)*`. It is, however, possible to create a repetition operating on + /// an empty sub-expression. For example, `()*` is still considered valid. + RepetitionMissing, + /// The Unicode class is not valid. This typically occurs when a `\p` is + /// followed by something other than a `{`. + UnicodeClassInvalid, + /// When octal support is disabled, this error is produced when an octal + /// escape is used. The octal escape is assumed to be an invocation of + /// a backreference, which is the common case. + UnsupportedBackreference, + /// When syntax similar to PCRE's look-around is used, this error is + /// returned. Some example syntaxes that are rejected include, but are + /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and + /// `(? &str { + use self::ErrorKind::*; + match self.kind { + CaptureLimitExceeded => "capture group limit exceeded", + ClassEscapeInvalid => "invalid escape sequence in character class", + ClassRangeInvalid => "invalid character class range", + ClassRangeLiteral => "invalid range boundary, must be a literal", + ClassUnclosed => "unclosed character class", + DecimalEmpty => "empty decimal literal", + DecimalInvalid => "invalid decimal literal", + EscapeHexEmpty => "empty hexadecimal literal", + EscapeHexInvalid => "invalid hexadecimal literal", + EscapeHexInvalidDigit => "invalid hexadecimal digit", + EscapeUnexpectedEof => "unexpected eof (escape sequence)", + EscapeUnrecognized => "unrecognized escape sequence", + FlagDanglingNegation => "dangling flag negation operator", + FlagDuplicate { .. } => "duplicate flag", + FlagRepeatedNegation { .. } => "repeated negation", + FlagUnexpectedEof => "unexpected eof (flag)", + FlagUnrecognized => "unrecognized flag", + GroupNameDuplicate { .. } => "duplicate capture group name", + GroupNameEmpty => "empty capture group name", + GroupNameInvalid => "invalid capture group name", + GroupNameUnexpectedEof => "unclosed capture group name", + GroupUnclosed => "unclosed group", + GroupUnopened => "unopened group", + NestLimitExceeded(_) => "nest limit exceeded", + RepetitionCountInvalid => "invalid repetition count range", + RepetitionCountUnclosed => "unclosed counted repetition", + RepetitionMissing => "repetition operator missing expression", + UnicodeClassInvalid => "invalid Unicode character class", + UnsupportedBackreference => "backreferences are not supported", + UnsupportedLookAround => "look-around is not supported", + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::ErrorKind::*; + match *self { + CaptureLimitExceeded => write!( + f, + "exceeded the maximum number of \ + capturing groups ({})", + ::std::u32::MAX + ), + ClassEscapeInvalid => { + write!(f, "invalid escape sequence found in character class") + } + ClassRangeInvalid => write!( + f, + "invalid character class range, \ + the start must be <= the end" + ), + ClassRangeLiteral => { + write!(f, "invalid range boundary, must be a literal") + } + ClassUnclosed => write!(f, "unclosed character class"), + DecimalEmpty => write!(f, "decimal literal empty"), + DecimalInvalid => write!(f, "decimal literal invalid"), + EscapeHexEmpty => write!(f, "hexadecimal literal empty"), + EscapeHexInvalid => { + write!(f, "hexadecimal literal is not a Unicode scalar value") + } + EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"), + EscapeUnexpectedEof => write!( + f, + "incomplete escape sequence, \ + reached end of pattern prematurely" + ), + EscapeUnrecognized => write!(f, "unrecognized escape sequence"), + FlagDanglingNegation => { + write!(f, "dangling flag negation operator") + } + FlagDuplicate { .. } => write!(f, "duplicate flag"), + FlagRepeatedNegation { .. } => { + write!(f, "flag negation operator repeated") + } + FlagUnexpectedEof => { + write!(f, "expected flag but got end of regex") + } + FlagUnrecognized => write!(f, "unrecognized flag"), + GroupNameDuplicate { .. } => { + write!(f, "duplicate capture group name") + } + GroupNameEmpty => write!(f, "empty capture group name"), + GroupNameInvalid => write!(f, "invalid capture group character"), + GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), + GroupUnclosed => write!(f, "unclosed group"), + GroupUnopened => write!(f, "unopened group"), + NestLimitExceeded(limit) => write!( + f, + "exceed the maximum number of \ + nested parentheses/brackets ({})", + limit + ), + RepetitionCountInvalid => write!( + f, + "invalid repetition count range, \ + the start must be <= the end" + ), + RepetitionCountDecimalEmpty => { + write!(f, "repetition quantifier expects a valid decimal") + } + RepetitionCountUnclosed => { + write!(f, "unclosed counted repetition") + } + RepetitionMissing => { + write!(f, "repetition operator missing expression") + } + UnicodeClassInvalid => { + write!(f, "invalid Unicode character class") + } + UnsupportedBackreference => { + write!(f, "backreferences are not supported") + } + UnsupportedLookAround => write!( + f, + "look-around, including look-ahead and look-behind, \ + is not supported" + ), + _ => unreachable!(), + } + } +} + +/// Span represents the position information of a single AST item. +/// +/// All span positions are absolute byte offsets that can be used on the +/// original regular expression that was parsed. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Span { + /// The start byte offset. + pub start: Position, + /// The end byte offset. + pub end: Position, +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Span({:?}, {:?})", self.start, self.end) + } +} + +impl Ord for Span { + fn cmp(&self, other: &Span) -> Ordering { + (&self.start, &self.end).cmp(&(&other.start, &other.end)) + } +} + +impl PartialOrd for Span { + fn partial_cmp(&self, other: &Span) -> Option { + Some(self.cmp(other)) + } +} + +/// A single position in a regular expression. +/// +/// A position encodes one half of a span, and include the byte offset, line +/// number and column number. +#[derive(Clone, Copy, Eq, PartialEq)] +pub struct Position { + /// The absolute offset of this position, starting at `0` from the + /// beginning of the regular expression pattern string. + pub offset: usize, + /// The line number, starting at `1`. + pub line: usize, + /// The approximate column number, starting at `1`. + pub column: usize, +} + +impl fmt::Debug for Position { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Position(o: {:?}, l: {:?}, c: {:?})", + self.offset, self.line, self.column + ) + } +} + +impl Ord for Position { + fn cmp(&self, other: &Position) -> Ordering { + self.offset.cmp(&other.offset) + } +} + +impl PartialOrd for Position { + fn partial_cmp(&self, other: &Position) -> Option { + Some(self.cmp(other)) + } +} + +impl Span { + /// Create a new span with the given positions. + pub fn new(start: Position, end: Position) -> Span { + Span { start, end } + } + + /// Create a new span using the given position as the start and end. + pub fn splat(pos: Position) -> Span { + Span::new(pos, pos) + } + + /// Create a new span by replacing the starting the position with the one + /// given. + pub fn with_start(self, pos: Position) -> Span { + Span { start: pos, ..self } + } + + /// Create a new span by replacing the ending the position with the one + /// given. + pub fn with_end(self, pos: Position) -> Span { + Span { end: pos, ..self } + } + + /// Returns true if and only if this span occurs on a single line. + pub fn is_one_line(&self) -> bool { + self.start.line == self.end.line + } + + /// Returns true if and only if this span is empty. That is, it points to + /// a single position in the concrete syntax of a regular expression. + pub fn is_empty(&self) -> bool { + self.start.offset == self.end.offset + } +} + +impl Position { + /// Create a new position with the given information. + /// + /// `offset` is the absolute offset of the position, starting at `0` from + /// the beginning of the regular expression pattern string. + /// + /// `line` is the line number, starting at `1`. + /// + /// `column` is the approximate column number, starting at `1`. + pub fn new(offset: usize, line: usize, column: usize) -> Position { + Position { offset, line, column } + } +} + +/// An abstract syntax tree for a singular expression along with comments +/// found. +/// +/// Comments are not stored in the tree itself to avoid complexity. Each +/// comment contains a span of precisely where it occurred in the original +/// regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct WithComments { + /// The actual ast. + pub ast: Ast, + /// All comments found in the original regular expression. + pub comments: Vec, +} + +/// A comment from a regular expression with an associated span. +/// +/// A regular expression can only contain comments when the `x` flag is +/// enabled. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Comment { + /// The span of this comment, including the beginning `#` and ending `\n`. + pub span: Span, + /// The comment text, starting with the first character following the `#` + /// and ending with the last character preceding the `\n`. + pub comment: String, +} + +/// An abstract syntax tree for a single regular expression. +/// +/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap +/// space proportional to the size of the `Ast`. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the `Ast`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Ast { + /// An empty regex that matches everything. + Empty(Span), + /// A set of flags, e.g., `(?is)`. + Flags(SetFlags), + /// A single character literal, which includes escape sequences. + Literal(Literal), + /// The "any character" class. + Dot(Span), + /// A single zero-width assertion. + Assertion(Assertion), + /// A single character class. This includes all forms of character classes + /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`. + Class(Class), + /// A repetition operator applied to an arbitrary regular expression. + Repetition(Repetition), + /// A grouped regular expression. + Group(Group), + /// An alternation of regular expressions. + Alternation(Alternation), + /// A concatenation of regular expressions. + Concat(Concat), +} + +impl Ast { + /// Return the span of this abstract syntax tree. + pub fn span(&self) -> &Span { + match *self { + Ast::Empty(ref span) => span, + Ast::Flags(ref x) => &x.span, + Ast::Literal(ref x) => &x.span, + Ast::Dot(ref span) => span, + Ast::Assertion(ref x) => &x.span, + Ast::Class(ref x) => x.span(), + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + } + } + + /// Return true if and only if this Ast is empty. + pub fn is_empty(&self) -> bool { + match *self { + Ast::Empty(_) => true, + _ => false, + } + } + + /// Returns true if and only if this AST has any (including possibly empty) + /// subexpressions. + fn has_subexprs(&self) -> bool { + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) => false, + Ast::Class(_) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => true, + } + } +} + +/// Print a display representation of this Ast. +/// +/// This does not preserve any of the original whitespace formatting that may +/// have originally been present in the concrete syntax from which this Ast +/// was generated. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Ast`. +impl fmt::Display for Ast { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::ast::print::Printer; + Printer::new().print(self, f) + } +} + +/// An alternation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Alternation { + /// The span of this alternation. + pub span: Span, + /// The alternate regular expressions. + pub asts: Vec, +} + +impl Alternation { + /// Return this alternation as an AST. + /// + /// If this alternation contains zero ASTs, then Ast::Empty is + /// returned. If this alternation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Alternation is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Alternation(self), + } + } +} + +/// A concatenation of regular expressions. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Concat { + /// The span of this concatenation. + pub span: Span, + /// The concatenation regular expressions. + pub asts: Vec, +} + +impl Concat { + /// Return this concatenation as an AST. + /// + /// If this concatenation contains zero ASTs, then Ast::Empty is + /// returned. If this concatenation contains exactly 1 AST, then the + /// corresponding AST is returned. Otherwise, Ast::Concat is returned. + pub fn into_ast(mut self) -> Ast { + match self.asts.len() { + 0 => Ast::Empty(self.span), + 1 => self.asts.pop().unwrap(), + _ => Ast::Concat(self), + } + } +} + +/// A single literal expression. +/// +/// A literal corresponds to a single Unicode scalar value. Literals may be +/// represented in their literal form, e.g., `a` or in their escaped form, +/// e.g., `\x61`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Literal { + /// The span of this literal. + pub span: Span, + /// The kind of this literal. + pub kind: LiteralKind, + /// The Unicode scalar value corresponding to this literal. + pub c: char, +} + +impl Literal { + /// If this literal was written as a `\x` hex escape, then this returns + /// the corresponding byte value. Otherwise, this returns `None`. + pub fn byte(&self) -> Option { + let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); + if self.c as u32 <= 255 && self.kind == short_hex { + Some(self.c as u8) + } else { + None + } + } +} + +/// The kind of a single literal expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum LiteralKind { + /// The literal is written verbatim, e.g., `a` or `☃`. + Verbatim, + /// The literal is written as an escape because it is punctuation, e.g., + /// `\*` or `\[`. + Punctuation, + /// The literal is written as an octal escape, e.g., `\141`. + Octal, + /// The literal is written as a hex code with a fixed number of digits + /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or + /// `\U00000061`. + HexFixed(HexLiteralKind), + /// The literal is written as a hex code with a bracketed number of + /// digits. The only restriction is that the bracketed hex code must refer + /// to a valid Unicode scalar value. + HexBrace(HexLiteralKind), + /// The literal is written as a specially recognized escape, e.g., `\f` + /// or `\n`. + Special(SpecialLiteralKind), +} + +/// The type of a special literal. +/// +/// A special literal is a special escape sequence recognized by the regex +/// parser, e.g., `\f` or `\n`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SpecialLiteralKind { + /// Bell, spelled `\a` (`\x07`). + Bell, + /// Form feed, spelled `\f` (`\x0C`). + FormFeed, + /// Tab, spelled `\t` (`\x09`). + Tab, + /// Line feed, spelled `\n` (`\x0A`). + LineFeed, + /// Carriage return, spelled `\r` (`\x0D`). + CarriageReturn, + /// Vertical tab, spelled `\v` (`\x0B`). + VerticalTab, + /// Space, spelled `\ ` (`\x20`). Note that this can only appear when + /// parsing in verbose mode. + Space, +} + +/// The type of a Unicode hex literal. +/// +/// Note that all variants behave the same when used with brackets. They only +/// differ when used without brackets in the number of hex digits that must +/// follow. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HexLiteralKind { + /// A `\x` prefix. When used without brackets, this form is limited to + /// two digits. + X, + /// A `\u` prefix. When used without brackets, this form is limited to + /// four digits. + UnicodeShort, + /// A `\U` prefix. When used without brackets, this form is limited to + /// eight digits. + UnicodeLong, +} + +impl HexLiteralKind { + /// The number of digits that must be used with this literal form when + /// used without brackets. When used with brackets, there is no + /// restriction on the number of digits. + pub fn digits(&self) -> u32 { + match *self { + HexLiteralKind::X => 2, + HexLiteralKind::UnicodeShort => 4, + HexLiteralKind::UnicodeLong => 8, + } + } +} + +/// A single character class expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(ClassBracketed), +} + +impl Class { + /// Return the span of this character class. + pub fn span(&self) -> &Span { + match *self { + Class::Perl(ref x) => &x.span, + Class::Unicode(ref x) => &x.span, + Class::Bracketed(ref x) => &x.span, + } + } +} + +/// A Perl character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassPerl { + /// The span of this class. + pub span: Span, + /// The kind of Perl class. + pub kind: ClassPerlKind, + /// Whether the class is negated or not. e.g., `\d` is not negated but + /// `\D` is. + pub negated: bool, +} + +/// The available Perl character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassPerlKind { + /// Decimal numbers. + Digit, + /// Whitespace. + Space, + /// Word characters. + Word, +} + +/// An ASCII character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassAscii { + /// The span of this class. + pub span: Span, + /// The kind of ASCII class. + pub kind: ClassAsciiKind, + /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated + /// but `[[:^alpha:]]` is. + pub negated: bool, +} + +/// The available ASCII character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassAsciiKind { + /// `[0-9A-Za-z]` + Alnum, + /// `[A-Za-z]` + Alpha, + /// `[\x00-\x7F]` + Ascii, + /// `[ \t]` + Blank, + /// `[\x00-\x1F\x7F]` + Cntrl, + /// `[0-9]` + Digit, + /// `[!-~]` + Graph, + /// `[a-z]` + Lower, + /// `[ -~]` + Print, + /// `[!-/:-@\[-`{-~]` + Punct, + /// `[\t\n\v\f\r ]` + Space, + /// `[A-Z]` + Upper, + /// `[0-9A-Za-z_]` + Word, + /// `[0-9A-Fa-f]` + Xdigit, +} + +impl ClassAsciiKind { + /// Return the corresponding ClassAsciiKind variant for the given name. + /// + /// The name given should correspond to the lowercase version of the + /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. + /// + /// If no variant with the corresponding name exists, then `None` is + /// returned. + pub fn from_name(name: &str) -> Option { + use self::ClassAsciiKind::*; + match name { + "alnum" => Some(Alnum), + "alpha" => Some(Alpha), + "ascii" => Some(Ascii), + "blank" => Some(Blank), + "cntrl" => Some(Cntrl), + "digit" => Some(Digit), + "graph" => Some(Graph), + "lower" => Some(Lower), + "print" => Some(Print), + "punct" => Some(Punct), + "space" => Some(Space), + "upper" => Some(Upper), + "word" => Some(Word), + "xdigit" => Some(Xdigit), + _ => None, + } + } +} + +/// A Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. + /// + /// Note: be careful when using this attribute. This specifically refers + /// to whether the class is written as `\p` or `\P`, where the latter + /// is `negated = true`. However, it also possible to write something like + /// `\P{scx!=Katakana}` which is actually equivalent to + /// `\p{scx=Katakana}` and is therefore not actually negated even though + /// `negated = true` here. To test whether this class is truly negated + /// or not, use the `is_negated` method. + pub negated: bool, + /// The kind of Unicode class. + pub kind: ClassUnicodeKind, +} + +impl ClassUnicode { + /// Returns true if this class has been negated. + /// + /// Note that this takes the Unicode op into account, if it's present. + /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. + pub fn is_negated(&self) -> bool { + match self.kind { + ClassUnicodeKind::NamedValue { + op: ClassUnicodeOpKind::NotEqual, + .. + } => !self.negated, + _ => self.negated, + } + } +} + +/// The available forms of Unicode character classes. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeKind { + /// A one letter abbreviated class, e.g., `\pN`. + OneLetter(char), + /// A binary property, general category or script. The string may be + /// empty. + Named(String), + /// A property name and an associated value. + NamedValue { + /// The type of Unicode op used to associate `name` with `value`. + op: ClassUnicodeOpKind, + /// The property name (which may be empty). + name: String, + /// The property value (which may be empty). + value: String, + }, +} + +/// The type of op used in a Unicode character class. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassUnicodeOpKind { + /// A property set to a specific value, e.g., `\p{scx=Katakana}`. + Equal, + /// A property set to a specific value using a colon, e.g., + /// `\p{scx:Katakana}`. + Colon, + /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. + NotEqual, +} + +impl ClassUnicodeOpKind { + /// Whether the op is an equality op or not. + pub fn is_equal(&self) -> bool { + match *self { + ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true, + _ => false, + } + } +} + +/// A bracketed character class, e.g., `[a-z0-9]`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBracketed { + /// The span of this class. + pub span: Span, + /// Whether this class is negated or not. e.g., `[a]` is not negated but + /// `[^a]` is. + pub negated: bool, + /// The type of this set. A set is either a normal union of things, e.g., + /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. + pub kind: ClassSet, +} + +/// A character class set. +/// +/// This type corresponds to the internal structure of a bracketed character +/// class. That is, every bracketed character is one of two types: a union of +/// items (literals, ranges, other bracketed classes) or a tree of binary set +/// operations. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSet { + /// An item, which can be a single literal, range, nested character class + /// or a union of items. + Item(ClassSetItem), + /// A single binary operation (i.e., &&, -- or ~~). + BinaryOp(ClassSetBinaryOp), +} + +impl ClassSet { + /// Build a set from a union. + pub fn union(ast: ClassSetUnion) -> ClassSet { + ClassSet::Item(ClassSetItem::Union(ast)) + } + + /// Return the span of this character class set. + pub fn span(&self) -> &Span { + match *self { + ClassSet::Item(ref x) => x.span(), + ClassSet::BinaryOp(ref x) => &x.span, + } + } + + /// Return true if and only if this class set is empty. + fn is_empty(&self) -> bool { + match *self { + ClassSet::Item(ClassSetItem::Empty(_)) => true, + _ => false, + } + } +} + +/// A single component of a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ClassSetItem { + /// An empty item. + /// + /// Note that a bracketed character class cannot contain a single empty + /// item. Empty items can appear when using one of the binary operators. + /// For example, `[&&]` is the intersection of two empty classes. + Empty(Span), + /// A single literal. + Literal(Literal), + /// A range between two literals. + Range(ClassSetRange), + /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. + Ascii(ClassAscii), + /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. + Unicode(ClassUnicode), + /// A perl character class, e.g., `\d` or `\W`. + Perl(ClassPerl), + /// A bracketed character class set, which may contain zero or more + /// character ranges and/or zero or more nested classes. e.g., + /// `[a-zA-Z\pL]`. + Bracketed(Box), + /// A union of items. + Union(ClassSetUnion), +} + +impl ClassSetItem { + /// Return the span of this character class set item. + pub fn span(&self) -> &Span { + match *self { + ClassSetItem::Empty(ref span) => span, + ClassSetItem::Literal(ref x) => &x.span, + ClassSetItem::Range(ref x) => &x.span, + ClassSetItem::Ascii(ref x) => &x.span, + ClassSetItem::Perl(ref x) => &x.span, + ClassSetItem::Unicode(ref x) => &x.span, + ClassSetItem::Bracketed(ref x) => &x.span, + ClassSetItem::Union(ref x) => &x.span, + } + } +} + +/// A single character class range in a set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetRange { + /// The span of this range. + pub span: Span, + /// The start of this range. + pub start: Literal, + /// The end of this range. + pub end: Literal, +} + +impl ClassSetRange { + /// Returns true if and only if this character class range is valid. + /// + /// The only case where a range is invalid is if its start is greater than + /// its end. + pub fn is_valid(&self) -> bool { + self.start.c <= self.end.c + } +} + +/// A union of items inside a character class set. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetUnion { + /// The span of the items in this operation. e.g., the `a-z0-9` in + /// `[^a-z0-9]` + pub span: Span, + /// The sequence of items that make up this union. + pub items: Vec, +} + +impl ClassSetUnion { + /// Push a new item in this union. + /// + /// The ending position of this union's span is updated to the ending + /// position of the span of the item given. If the union is empty, then + /// the starting position of this union is set to the starting position + /// of this item. + /// + /// In other words, if you only use this method to add items to a union + /// and you set the spans on each item correctly, then you should never + /// need to adjust the span of the union directly. + pub fn push(&mut self, item: ClassSetItem) { + if self.items.is_empty() { + self.span.start = item.span().start; + } + self.span.end = item.span().end; + self.items.push(item); + } + + /// Return this union as a character class set item. + /// + /// If this union contains zero items, then an empty union is + /// returned. If this concatenation contains exactly 1 item, then the + /// corresponding item is returned. Otherwise, ClassSetItem::Union is + /// returned. + pub fn into_item(mut self) -> ClassSetItem { + match self.items.len() { + 0 => ClassSetItem::Empty(self.span), + 1 => self.items.pop().unwrap(), + _ => ClassSetItem::Union(self), + } + } +} + +/// A Unicode character class set operation. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassSetBinaryOp { + /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. + pub span: Span, + /// The type of this set operation. + pub kind: ClassSetBinaryOpKind, + /// The left hand side of the operation. + pub lhs: Box, + /// The right hand side of the operation. + pub rhs: Box, +} + +/// The type of a Unicode character class set operation. +/// +/// Note that this doesn't explicitly represent union since there is no +/// explicit union operator. Concatenation inside a character class corresponds +/// to the union operation. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ClassSetBinaryOpKind { + /// The intersection of two sets, e.g., `\pN&&[a-z]`. + Intersection, + /// The difference of two sets, e.g., `\pN--[0-9]`. + Difference, + /// The symmetric difference of two sets. The symmetric difference is the + /// set of elements belonging to one but not both sets. + /// e.g., `[\pL~~[:ascii:]]`. + SymmetricDifference, +} + +/// A single zero-width assertion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Assertion { + /// The span of this assertion. + pub span: Span, + /// The assertion kind, e.g., `\b` or `^`. + pub kind: AssertionKind, +} + +/// An assertion kind. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum AssertionKind { + /// `^` + StartLine, + /// `$` + EndLine, + /// `\A` + StartText, + /// `\z` + EndText, + /// `\b` + WordBoundary, + /// `\B` + NotWordBoundary, +} + +/// A repetition operation applied to a regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The span of this operation. + pub span: Span, + /// The actual operation. + pub op: RepetitionOp, + /// Whether this operation was applied greedily or not. + pub greedy: bool, + /// The regular expression under repetition. + pub ast: Box, +} + +/// The repetition operator itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RepetitionOp { + /// The span of this operator. This includes things like `+`, `*?` and + /// `{m,n}`. + pub span: Span, + /// The type of operation. + pub kind: RepetitionKind, +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionKind { + /// `?` + ZeroOrOne, + /// `*` + ZeroOrMore, + /// `+` + OneOrMore, + /// `{m,n}` + Range(RepetitionRange), +} + +/// A range repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionRange { + /// `{m}` + Exactly(u32), + /// `{m,}` + AtLeast(u32), + /// `{m,n}` + Bounded(u32, u32), +} + +impl RepetitionRange { + /// Returns true if and only if this repetition range is valid. + /// + /// The only case where a repetition range is invalid is if it is bounded + /// and its start is greater than its end. + pub fn is_valid(&self) -> bool { + match *self { + RepetitionRange::Bounded(s, e) if s > e => false, + _ => true, + } + } +} + +/// A grouped regular expression. +/// +/// This includes both capturing and non-capturing groups. This does **not** +/// include flag-only groups like `(?is)`, but does contain any group that +/// contains a sub-expression, e.g., `(a)`, `(?Pa)`, `(?:a)` and +/// `(?is:a)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The span of this group. + pub span: Span, + /// The kind of this group. + pub kind: GroupKind, + /// The regular expression in this group. + pub ast: Box, +} + +impl Group { + /// If this group is non-capturing, then this returns the (possibly empty) + /// set of flags. Otherwise, `None` is returned. + pub fn flags(&self) -> Option<&Flags> { + match self.kind { + GroupKind::NonCapturing(ref flags) => Some(flags), + _ => None, + } + } + + /// Returns true if and only if this group is capturing. + pub fn is_capturing(&self) -> bool { + match self.kind { + GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::NonCapturing(_) => false, + } + } + + /// Returns the capture index of this group, if this is a capturing group. + /// + /// This returns a capture index precisely when `is_capturing` is `true`. + pub fn capture_index(&self) -> Option { + match self.kind { + GroupKind::CaptureIndex(i) => Some(i), + GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::NonCapturing(_) => None, + } + } +} + +/// The kind of a group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// `(a)` + CaptureIndex(u32), + /// `(?Pa)` + CaptureName(CaptureName), + /// `(?:a)` and `(?i:a)` + NonCapturing(Flags), +} + +/// A capture name. +/// +/// This corresponds to the name itself between the angle brackets in, e.g., +/// `(?Pexpr)`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CaptureName { + /// The span of this capture name. + pub span: Span, + /// The capture name. + pub name: String, + /// The capture index. + pub index: u32, +} + +/// A group of flags that is not applied to a particular regular expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct SetFlags { + /// The span of these flags, including the grouping parentheses. + pub span: Span, + /// The actual sequence of flags. + pub flags: Flags, +} + +/// A group of flags. +/// +/// This corresponds only to the sequence of flags themselves, e.g., `is-u`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Flags { + /// The span of this group of flags. + pub span: Span, + /// A sequence of flag items. Each item is either a flag or a negation + /// operator. + pub items: Vec, +} + +impl Flags { + /// Add the given item to this sequence of flags. + /// + /// If the item was added successfully, then `None` is returned. If the + /// given item is a duplicate, then `Some(i)` is returned, where + /// `items[i].kind == item.kind`. + pub fn add_item(&mut self, item: FlagsItem) -> Option { + for (i, x) in self.items.iter().enumerate() { + if x.kind == item.kind { + return Some(i); + } + } + self.items.push(item); + None + } + + /// Returns the state of the given flag in this set. + /// + /// If the given flag is in the set but is negated, then `Some(false)` is + /// returned. + /// + /// If the given flag is in the set and is not negated, then `Some(true)` + /// is returned. + /// + /// Otherwise, `None` is returned. + pub fn flag_state(&self, flag: Flag) -> Option { + let mut negated = false; + for x in &self.items { + match x.kind { + FlagsItemKind::Negation => { + negated = true; + } + FlagsItemKind::Flag(ref xflag) if xflag == &flag => { + return Some(!negated); + } + _ => {} + } + } + None + } +} + +/// A single item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct FlagsItem { + /// The span of this item. + pub span: Span, + /// The kind of this item. + pub kind: FlagsItemKind, +} + +/// The kind of an item in a group of flags. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum FlagsItemKind { + /// A negation operator applied to all subsequent flags in the enclosing + /// group. + Negation, + /// A single flag in a group. + Flag(Flag), +} + +impl FlagsItemKind { + /// Returns true if and only if this item is a negation operator. + pub fn is_negation(&self) -> bool { + match *self { + FlagsItemKind::Negation => true, + _ => false, + } + } +} + +/// A single flag. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Flag { + /// `i` + CaseInsensitive, + /// `m` + MultiLine, + /// `s` + DotMatchesNewLine, + /// `U` + SwapGreed, + /// `u` + Unicode, + /// `x` + IgnoreWhitespace, +} + +/// A custom `Drop` impl is used for `Ast` such that it uses constant stack +/// space but heap space proportional to the depth of the `Ast`. +impl Drop for Ast { + fn drop(&mut self) { + use std::mem; + + match *self { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. + | Ast::Class(_) => return, + Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, + Ast::Group(ref x) if !x.ast.has_subexprs() => return, + Ast::Alternation(ref x) if x.asts.is_empty() => return, + Ast::Concat(ref x) if x.asts.is_empty() => return, + _ => {} + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_ast = || Ast::Empty(empty_span()); + let mut stack = vec![mem::replace(self, empty_ast())]; + while let Some(mut ast) = stack.pop() { + match ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + // Classes are recursive, so they get their own Drop impl. + | Ast::Class(_) => {} + Ast::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Group(ref mut x) => { + stack.push(mem::replace(&mut x.ast, empty_ast())); + } + Ast::Alternation(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + Ast::Concat(ref mut x) => { + stack.extend(x.asts.drain(..)); + } + } + } + } +} + +/// A custom `Drop` impl is used for `ClassSet` such that it uses constant +/// stack space but heap space proportional to the depth of the `ClassSet`. +impl Drop for ClassSet { + fn drop(&mut self) { + use std::mem; + + match *self { + ClassSet::Item(ref item) => match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => return, + ClassSetItem::Bracketed(ref x) => { + if x.kind.is_empty() { + return; + } + } + ClassSetItem::Union(ref x) => { + if x.items.is_empty() { + return; + } + } + }, + ClassSet::BinaryOp(ref op) => { + if op.lhs.is_empty() && op.rhs.is_empty() { + return; + } + } + } + + let empty_span = || Span::splat(Position::new(0, 0, 0)); + let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); + let mut stack = vec![mem::replace(self, empty_set())]; + while let Some(mut set) = stack.pop() { + match set { + ClassSet::Item(ref mut item) => match *item { + ClassSetItem::Empty(_) + | ClassSetItem::Literal(_) + | ClassSetItem::Range(_) + | ClassSetItem::Ascii(_) + | ClassSetItem::Unicode(_) + | ClassSetItem::Perl(_) => {} + ClassSetItem::Bracketed(ref mut x) => { + stack.push(mem::replace(&mut x.kind, empty_set())); + } + ClassSetItem::Union(ref mut x) => { + stack.extend(x.items.drain(..).map(ClassSet::Item)); + } + }, + ClassSet::BinaryOp(ref mut op) => { + stack.push(mem::replace(&mut op.lhs, empty_set())); + stack.push(mem::replace(&mut op.rhs, empty_set())); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // We use a thread with an explicit stack size to test that our destructor + // for Ast can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let span = || Span::splat(Position::new(0, 0, 0)); + let mut ast = Ast::Empty(span()); + for i in 0..200 { + ast = Ast::Group(Group { + span: span(), + kind: GroupKind::CaptureIndex(i), + ast: Box::new(ast), + }); + } + assert!(!ast.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + // + // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD) + // will just barf with very small stack sizes. So we bump this up a bit + // to give more room to breath. When I did this, I confirmed that if + // I remove the custom `Drop` impl for `Ast`, then this test does + // indeed still fail with a stack overflow. (At the time of writing, I + // had to bump it all the way up to 32K before the test would pass even + // without the custom `Drop` impl. So 16K seems like a safe number + // here.) + // + // See: https://github.com/rust-lang/regex/issues/967 + thread::Builder::new() + .stack_size(16 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs new file mode 100644 index 0000000000..6e9c9aca06 --- /dev/null +++ b/vendor/regex-syntax/src/ast/parse.rs @@ -0,0 +1,5930 @@ +/*! +This module provides a regular expression parser. +*/ + +use std::borrow::Borrow; +use std::cell::{Cell, RefCell}; +use std::mem; +use std::result; + +use crate::ast::{self, Ast, Position, Span}; +use crate::either::Either; + +use crate::is_meta_character; + +type Result = result::Result; + +/// A primitive is an expression with no sub-expressions. This includes +/// literals, assertions and non-set character classes. This representation +/// is used as intermediate state in the parser. +/// +/// This does not include ASCII character classes, since they can only appear +/// within a set character class. +#[derive(Clone, Debug, Eq, PartialEq)] +enum Primitive { + Literal(ast::Literal), + Assertion(ast::Assertion), + Dot(Span), + Perl(ast::ClassPerl), + Unicode(ast::ClassUnicode), +} + +impl Primitive { + /// Return the span of this primitive. + fn span(&self) -> &Span { + match *self { + Primitive::Literal(ref x) => &x.span, + Primitive::Assertion(ref x) => &x.span, + Primitive::Dot(ref span) => span, + Primitive::Perl(ref x) => &x.span, + Primitive::Unicode(ref x) => &x.span, + } + } + + /// Convert this primitive into a proper AST. + fn into_ast(self) -> Ast { + match self { + Primitive::Literal(lit) => Ast::Literal(lit), + Primitive::Assertion(assert) => Ast::Assertion(assert), + Primitive::Dot(span) => Ast::Dot(span), + Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)), + Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)), + } + } + + /// Convert this primitive into an item in a character class. + /// + /// If this primitive is not a legal item (i.e., an assertion or a dot), + /// then return an error. + fn into_class_set_item>( + self, + p: &ParserI<'_, P>, + ) -> Result { + use self::Primitive::*; + use crate::ast::ClassSetItem; + + match self { + Literal(lit) => Ok(ClassSetItem::Literal(lit)), + Perl(cls) => Ok(ClassSetItem::Perl(cls)), + Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), + } + } + + /// Convert this primitive into a literal in a character class. In + /// particular, literals are the only valid items that can appear in + /// ranges. + /// + /// If this primitive is not a legal item (i.e., a class, assertion or a + /// dot), then return an error. + fn into_class_literal>( + self, + p: &ParserI<'_, P>, + ) -> Result { + use self::Primitive::*; + + match self { + Literal(lit) => Ok(lit), + x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), + } + } +} + +/// Returns true if the given character is a hexadecimal digit. +fn is_hex(c: char) -> bool { + ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') +} + +/// Returns true if the given character is a valid in a capture group name. +/// +/// If `first` is true, then `c` is treated as the first character in the +/// group name (which must be alphabetic or underscore). +fn is_capture_char(c: char, first: bool) -> bool { + c == '_' + || (!first + && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']')) + || ('A' <= c && c <= 'Z') + || ('a' <= c && c <= 'z') +} + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +#[derive(Clone, Debug)] +pub struct ParserBuilder { + ignore_whitespace: bool, + nest_limit: u32, + octal: bool, +} + +impl Default for ParserBuilder { + fn default() -> ParserBuilder { + ParserBuilder::new() + } +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder { + ignore_whitespace: false, + nest_limit: 250, + octal: false, + } + } + + /// Build a parser from this configuration with the given pattern. + pub fn build(&self) -> Parser { + Parser { + pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), + capture_index: Cell::new(0), + nest_limit: self.nest_limit, + octal: self.octal, + initial_ignore_whitespace: self.ignore_whitespace, + ignore_whitespace: Cell::new(self.ignore_whitespace), + comments: RefCell::new(vec![]), + stack_group: RefCell::new(vec![]), + stack_class: RefCell::new(vec![]), + capture_names: RefCell::new(vec![]), + scratch: RefCell::new(String::new()), + } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.nest_limit = limit; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.octal = yes; + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ignore_whitespace = yes; + self + } +} + +/// A regular expression parser. +/// +/// This parses a string representation of a regular expression into an +/// abstract syntax tree. The size of the tree is proportional to the length +/// of the regular expression pattern. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + /// The current position of the parser. + pos: Cell, + /// The current capture index. + capture_index: Cell, + /// The maximum number of open parens/brackets allowed. If the parser + /// exceeds this number, then an error is returned. + nest_limit: u32, + /// Whether to support octal syntax or not. When `false`, the parser will + /// return an error helpfully pointing out that backreferences are not + /// supported. + octal: bool, + /// The initial setting for `ignore_whitespace` as provided by + /// `ParserBuilder`. It is used when resetting the parser's state. + initial_ignore_whitespace: bool, + /// Whether whitespace should be ignored. When enabled, comments are + /// also permitted. + ignore_whitespace: Cell, + /// A list of comments, in order of appearance. + comments: RefCell>, + /// A stack of grouped sub-expressions, including alternations. + stack_group: RefCell>, + /// A stack of nested character classes. This is only non-empty when + /// parsing a class. + stack_class: RefCell>, + /// A sorted sequence of capture names. This is used to detect duplicate + /// capture names and report an error if one is detected. + capture_names: RefCell>, + /// A scratch buffer used in various places. Mostly this is used to + /// accumulate relevant characters from parts of a pattern. + scratch: RefCell, +} + +/// ParserI is the internal parser implementation. +/// +/// We use this separate type so that we can carry the provided pattern string +/// along with us. In particular, a `Parser` internal state is not tied to any +/// one pattern, but `ParserI` is. +/// +/// This type also lets us use `ParserI<&Parser>` in production code while +/// retaining the convenience of `ParserI` for tests, which sometimes +/// work against the internal interface of the parser. +#[derive(Clone, Debug)] +struct ParserI<'s, P> { + /// The parser state/configuration. + parser: P, + /// The full regular expression provided by the user. + pattern: &'s str, +} + +/// GroupState represents a single stack frame while parsing nested groups +/// and alternations. Each frame records the state up to an opening parenthesis +/// or a alternating bracket `|`. +#[derive(Clone, Debug)] +enum GroupState { + /// This state is pushed whenever an opening group is found. + Group { + /// The concatenation immediately preceding the opening group. + concat: ast::Concat, + /// The group that has been opened. Its sub-AST is always empty. + group: ast::Group, + /// Whether this group has the `x` flag enabled or not. + ignore_whitespace: bool, + }, + /// This state is pushed whenever a new alternation branch is found. If + /// an alternation branch is found and this state is at the top of the + /// stack, then this state should be modified to include the new + /// alternation. + Alternation(ast::Alternation), +} + +/// ClassState represents a single stack frame while parsing character classes. +/// Each frame records the state up to an intersection, difference, symmetric +/// difference or nested class. +/// +/// Note that a parser's character class stack is only non-empty when parsing +/// a character class. In all other cases, it is empty. +#[derive(Clone, Debug)] +enum ClassState { + /// This state is pushed whenever an opening bracket is found. + Open { + /// The union of class items immediately preceding this class. + union: ast::ClassSetUnion, + /// The class that has been opened. Typically this just corresponds + /// to the `[`, but it can also include `[^` since `^` indicates + /// negation of the class. + set: ast::ClassBracketed, + }, + /// This state is pushed when a operator is seen. When popped, the stored + /// set becomes the left hand side of the operator. + Op { + /// The type of the operation, i.e., &&, -- or ~~. + kind: ast::ClassSetBinaryOpKind, + /// The left-hand side of the operator. + lhs: ast::ClassSet, + }, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with either the `parse` or `parse_with_comments` + /// methods. The parse methods return an abstract syntax tree. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into an abstract syntax tree. + pub fn parse(&mut self, pattern: &str) -> Result { + ParserI::new(self, pattern).parse() + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + pub fn parse_with_comments( + &mut self, + pattern: &str, + ) -> Result { + ParserI::new(self, pattern).parse_with_comments() + } + + /// Reset the internal state of a parser. + /// + /// This is called at the beginning of every parse. This prevents the + /// parser from running with inconsistent state (say, if a previous + /// invocation returned an error and the parser is reused). + fn reset(&self) { + // These settings should be in line with the construction + // in `ParserBuilder::build`. + self.pos.set(Position { offset: 0, line: 1, column: 1 }); + self.ignore_whitespace.set(self.initial_ignore_whitespace); + self.comments.borrow_mut().clear(); + self.stack_group.borrow_mut().clear(); + self.stack_class.borrow_mut().clear(); + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Build an internal parser from a parser configuration and a pattern. + fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { + ParserI { parser, pattern } + } + + /// Return a reference to the parser state. + fn parser(&self) -> &Parser { + self.parser.borrow() + } + + /// Return a reference to the pattern being parsed. + fn pattern(&self) -> &str { + self.pattern.borrow() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { + ast::Error { kind, pattern: self.pattern().to_string(), span } + } + + /// Return the current offset of the parser. + /// + /// The offset starts at `0` from the beginning of the regular expression + /// pattern string. + fn offset(&self) -> usize { + self.parser().pos.get().offset + } + + /// Return the current line number of the parser. + /// + /// The line number starts at `1`. + fn line(&self) -> usize { + self.parser().pos.get().line + } + + /// Return the current column of the parser. + /// + /// The column number starts at `1` and is reset whenever a `\n` is seen. + fn column(&self) -> usize { + self.parser().pos.get().column + } + + /// Return the next capturing index. Each subsequent call increments the + /// internal index. + /// + /// The span given should correspond to the location of the opening + /// parenthesis. + /// + /// If the capture limit is exceeded, then an error is returned. + fn next_capture_index(&self, span: Span) -> Result { + let current = self.parser().capture_index.get(); + let i = current.checked_add(1).ok_or_else(|| { + self.error(span, ast::ErrorKind::CaptureLimitExceeded) + })?; + self.parser().capture_index.set(i); + Ok(i) + } + + /// Adds the given capture name to this parser. If this capture name has + /// already been used, then an error is returned. + fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { + let mut names = self.parser().capture_names.borrow_mut(); + match names + .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) + { + Err(i) => { + names.insert(i, cap.clone()); + Ok(()) + } + Ok(i) => Err(self.error( + cap.span, + ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, + )), + } + } + + /// Return whether the parser should ignore whitespace or not. + fn ignore_whitespace(&self) -> bool { + self.parser().ignore_whitespace.get() + } + + /// Return the character at the current position of the parser. + /// + /// This panics if the current position does not point to a valid char. + fn char(&self) -> char { + self.char_at(self.offset()) + } + + /// Return the character at the given position. + /// + /// This panics if the given position does not point to a valid char. + fn char_at(&self, i: usize) -> char { + self.pattern()[i..] + .chars() + .next() + .unwrap_or_else(|| panic!("expected char at offset {}", i)) + } + + /// Bump the parser to the next Unicode scalar value. + /// + /// If the end of the input has been reached, then `false` is returned. + fn bump(&self) -> bool { + if self.is_eof() { + return false; + } + let Position { mut offset, mut line, mut column } = self.pos(); + if self.char() == '\n' { + line = line.checked_add(1).unwrap(); + column = 1; + } else { + column = column.checked_add(1).unwrap(); + } + offset += self.char().len_utf8(); + self.parser().pos.set(Position { offset, line, column }); + self.pattern()[self.offset()..].chars().next().is_some() + } + + /// If the substring starting at the current position of the parser has + /// the given prefix, then bump the parser to the character immediately + /// following the prefix and return true. Otherwise, don't bump the parser + /// and return false. + fn bump_if(&self, prefix: &str) -> bool { + if self.pattern()[self.offset()..].starts_with(prefix) { + for _ in 0..prefix.chars().count() { + self.bump(); + } + true + } else { + false + } + } + + /// Returns true if and only if the parser is positioned at a look-around + /// prefix. The conditions under which this returns true must always + /// correspond to a regular expression that would otherwise be consider + /// invalid. + /// + /// This should only be called immediately after parsing the opening of + /// a group or a set of flags. + fn is_lookaround_prefix(&self) -> bool { + self.bump_if("?=") + || self.bump_if("?!") + || self.bump_if("?<=") + || self.bump_if("? bool { + if !self.bump() { + return false; + } + self.bump_space(); + !self.is_eof() + } + + /// If the `x` flag is enabled (i.e., whitespace insensitivity with + /// comments), then this will advance the parser through all whitespace + /// and comments to the next non-whitespace non-comment byte. + /// + /// If the `x` flag is disabled, then this is a no-op. + /// + /// This should be used selectively throughout the parser where + /// arbitrary whitespace is permitted when the `x` flag is enabled. For + /// example, `{ 5 , 6}` is equivalent to `{5,6}`. + fn bump_space(&self) { + if !self.ignore_whitespace() { + return; + } + while !self.is_eof() { + if self.char().is_whitespace() { + self.bump(); + } else if self.char() == '#' { + let start = self.pos(); + let mut comment_text = String::new(); + self.bump(); + while !self.is_eof() { + let c = self.char(); + self.bump(); + if c == '\n' { + break; + } + comment_text.push(c); + } + let comment = ast::Comment { + span: Span::new(start, self.pos()), + comment: comment_text, + }; + self.parser().comments.borrow_mut().push(comment); + } else { + break; + } + } + } + + /// Peek at the next character in the input without advancing the parser. + /// + /// If the input has been exhausted, then this returns `None`. + fn peek(&self) -> Option { + if self.is_eof() { + return None; + } + self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() + } + + /// Like peek, but will ignore spaces when the parser is in whitespace + /// insensitive mode. + fn peek_space(&self) -> Option { + if !self.ignore_whitespace() { + return self.peek(); + } + if self.is_eof() { + return None; + } + let mut start = self.offset() + self.char().len_utf8(); + let mut in_comment = false; + for (i, c) in self.pattern()[start..].char_indices() { + if c.is_whitespace() { + continue; + } else if !in_comment && c == '#' { + in_comment = true; + } else if in_comment && c == '\n' { + in_comment = false; + } else { + start += i; + break; + } + } + self.pattern()[start..].chars().next() + } + + /// Returns true if the next call to `bump` would return false. + fn is_eof(&self) -> bool { + self.offset() == self.pattern().len() + } + + /// Return the current position of the parser, which includes the offset, + /// line and column. + fn pos(&self) -> Position { + self.parser().pos.get() + } + + /// Create a span at the current position of the parser. Both the start + /// and end of the span are set. + fn span(&self) -> Span { + Span::splat(self.pos()) + } + + /// Create a span that covers the current character. + fn span_char(&self) -> Span { + let mut next = Position { + offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), + line: self.line(), + column: self.column().checked_add(1).unwrap(), + }; + if self.char() == '\n' { + next.line += 1; + next.column = 1; + } + Span::new(self.pos(), next) + } + + /// Parse and push a single alternation on to the parser's internal stack. + /// If the top of the stack already has an alternation, then add to that + /// instead of pushing a new one. + /// + /// The concatenation given corresponds to a single alternation branch. + /// The concatenation returned starts the next branch and is empty. + /// + /// This assumes the parser is currently positioned at `|` and will advance + /// the parser to the character following `|`. + #[inline(never)] + fn push_alternate(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '|'); + concat.span.end = self.pos(); + self.push_or_add_alternation(concat); + self.bump(); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + + /// Pushes or adds the given branch of an alternation to the parser's + /// internal stack of state. + fn push_or_add_alternation(&self, concat: ast::Concat) { + use self::GroupState::*; + + let mut stack = self.parser().stack_group.borrow_mut(); + if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { + alts.asts.push(concat.into_ast()); + return; + } + stack.push(Alternation(ast::Alternation { + span: Span::new(concat.span.start, self.pos()), + asts: vec![concat.into_ast()], + })); + } + + /// Parse and push a group AST (and its parent concatenation) on to the + /// parser's internal stack. Return a fresh concatenation corresponding + /// to the group's sub-AST. + /// + /// If a set of flags was found (with no group), then the concatenation + /// is returned with that set of flags added. + /// + /// This assumes that the parser is currently positioned on the opening + /// parenthesis. It advances the parser to the character at the start + /// of the sub-expression (or adjoining expression). + /// + /// If there was a problem parsing the start of the group, then an error + /// is returned. + #[inline(never)] + fn push_group(&self, mut concat: ast::Concat) -> Result { + assert_eq!(self.char(), '('); + match self.parse_group()? { + Either::Left(set) => { + let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); + if let Some(v) = ignore { + self.parser().ignore_whitespace.set(v); + } + + concat.asts.push(Ast::Flags(set)); + Ok(concat) + } + Either::Right(group) => { + let old_ignore_whitespace = self.ignore_whitespace(); + let new_ignore_whitespace = group + .flags() + .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) + .unwrap_or(old_ignore_whitespace); + self.parser().stack_group.borrow_mut().push( + GroupState::Group { + concat, + group, + ignore_whitespace: old_ignore_whitespace, + }, + ); + self.parser().ignore_whitespace.set(new_ignore_whitespace); + Ok(ast::Concat { span: self.span(), asts: vec![] }) + } + } + } + + /// Pop a group AST from the parser's internal stack and set the group's + /// AST to the given concatenation. Return the concatenation containing + /// the group. + /// + /// This assumes that the parser is currently positioned on the closing + /// parenthesis and advances the parser to the character following the `)`. + /// + /// If no such group could be popped, then an unopened group error is + /// returned. + #[inline(never)] + fn pop_group(&self, mut group_concat: ast::Concat) -> Result { + use self::GroupState::*; + + assert_eq!(self.char(), ')'); + let mut stack = self.parser().stack_group.borrow_mut(); + let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack + .pop() + { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, None) + } + Some(Alternation(alt)) => match stack.pop() { + Some(Group { concat, group, ignore_whitespace }) => { + (concat, group, ignore_whitespace, Some(alt)) + } + None | Some(Alternation(_)) => { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupUnopened, + )); + } + }, + None => { + return Err(self + .error(self.span_char(), ast::ErrorKind::GroupUnopened)); + } + }; + self.parser().ignore_whitespace.set(ignore_whitespace); + group_concat.span.end = self.pos(); + self.bump(); + group.span.end = self.pos(); + match alt { + Some(mut alt) => { + alt.span.end = group_concat.span.end; + alt.asts.push(group_concat.into_ast()); + group.ast = Box::new(alt.into_ast()); + } + None => { + group.ast = Box::new(group_concat.into_ast()); + } + } + prior_concat.asts.push(Ast::Group(group)); + Ok(prior_concat) + } + + /// Pop the last state from the parser's internal stack, if it exists, and + /// add the given concatenation to it. There either must be no state or a + /// single alternation item on the stack. Any other scenario produces an + /// error. + /// + /// This assumes that the parser has advanced to the end. + #[inline(never)] + fn pop_group_end(&self, mut concat: ast::Concat) -> Result { + concat.span.end = self.pos(); + let mut stack = self.parser().stack_group.borrow_mut(); + let ast = match stack.pop() { + None => Ok(concat.into_ast()), + Some(GroupState::Alternation(mut alt)) => { + alt.span.end = self.pos(); + alt.asts.push(concat.into_ast()); + Ok(Ast::Alternation(alt)) + } + Some(GroupState::Group { group, .. }) => { + return Err( + self.error(group.span, ast::ErrorKind::GroupUnclosed) + ); + } + }; + // If we try to pop again, there should be nothing. + match stack.pop() { + None => ast, + Some(GroupState::Alternation(_)) => { + // This unreachable is unfortunate. This case can't happen + // because the only way we can be here is if there were two + // `GroupState::Alternation`s adjacent in the parser's stack, + // which we guarantee to never happen because we never push a + // `GroupState::Alternation` if one is already at the top of + // the stack. + unreachable!() + } + Some(GroupState::Group { group, .. }) => { + Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) + } + } + } + + /// Parse the opening of a character class and push the current class + /// parsing context onto the parser's stack. This assumes that the parser + /// is positioned at an opening `[`. The given union should correspond to + /// the union of set items built up before seeing the `[`. + /// + /// If there was a problem parsing the opening of the class, then an error + /// is returned. Otherwise, a new union of set items for the class is + /// returned (which may be populated with either a `]` or a `-`). + #[inline(never)] + fn push_class_open( + &self, + parent_union: ast::ClassSetUnion, + ) -> Result { + assert_eq!(self.char(), '['); + + let (nested_set, nested_union) = self.parse_set_class_open()?; + self.parser() + .stack_class + .borrow_mut() + .push(ClassState::Open { union: parent_union, set: nested_set }); + Ok(nested_union) + } + + /// Parse the end of a character class set and pop the character class + /// parser stack. The union given corresponds to the last union built + /// before seeing the closing `]`. The union returned corresponds to the + /// parent character class set with the nested class added to it. + /// + /// This assumes that the parser is positioned at a `]` and will advance + /// the parser to the byte immediately following the `]`. + /// + /// If the stack is empty after popping, then this returns the final + /// "top-level" character class AST (where a "top-level" character class + /// is one that is not nested inside any other character class). + /// + /// If there is no corresponding opening bracket on the parser's stack, + /// then an error is returned. + #[inline(never)] + fn pop_class( + &self, + nested_union: ast::ClassSetUnion, + ) -> Result> { + assert_eq!(self.char(), ']'); + + let item = ast::ClassSet::Item(nested_union.into_item()); + let prevset = self.pop_class_op(item); + let mut stack = self.parser().stack_class.borrow_mut(); + match stack.pop() { + None => { + // We can never observe an empty stack: + // + // 1) We are guaranteed to start with a non-empty stack since + // the character class parser is only initiated when it sees + // a `[`. + // 2) If we ever observe an empty stack while popping after + // seeing a `]`, then we signal the character class parser + // to terminate. + panic!("unexpected empty character class stack") + } + Some(ClassState::Op { .. }) => { + // This panic is unfortunate, but this case is impossible + // since we already popped the Op state if one exists above. + // Namely, every push to the class parser stack is guarded by + // whether an existing Op is already on the top of the stack. + // If it is, the existing Op is modified. That is, the stack + // can never have consecutive Op states. + panic!("unexpected ClassState::Op") + } + Some(ClassState::Open { mut union, mut set }) => { + self.bump(); + set.span.end = self.pos(); + set.kind = prevset; + if stack.is_empty() { + Ok(Either::Right(ast::Class::Bracketed(set))) + } else { + union.push(ast::ClassSetItem::Bracketed(Box::new(set))); + Ok(Either::Left(union)) + } + } + } + } + + /// Return an "unclosed class" error whose span points to the most + /// recently opened class. + /// + /// This should only be called while parsing a character class. + #[inline(never)] + fn unclosed_class_error(&self) -> ast::Error { + for state in self.parser().stack_class.borrow().iter().rev() { + if let ClassState::Open { ref set, .. } = *state { + return self.error(set.span, ast::ErrorKind::ClassUnclosed); + } + } + // We are guaranteed to have a non-empty stack with at least + // one open bracket, so we should never get here. + panic!("no open character class found") + } + + /// Push the current set of class items on to the class parser's stack as + /// the left hand side of the given operator. + /// + /// A fresh set union is returned, which should be used to build the right + /// hand side of this operator. + #[inline(never)] + fn push_class_op( + &self, + next_kind: ast::ClassSetBinaryOpKind, + next_union: ast::ClassSetUnion, + ) -> ast::ClassSetUnion { + let item = ast::ClassSet::Item(next_union.into_item()); + let new_lhs = self.pop_class_op(item); + self.parser() + .stack_class + .borrow_mut() + .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); + ast::ClassSetUnion { span: self.span(), items: vec![] } + } + + /// Pop a character class set from the character class parser stack. If the + /// top of the stack is just an item (not an operation), then return the + /// given set unchanged. If the top of the stack is an operation, then the + /// given set will be used as the rhs of the operation on the top of the + /// stack. In that case, the binary operation is returned as a set. + #[inline(never)] + fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { + let mut stack = self.parser().stack_class.borrow_mut(); + let (kind, lhs) = match stack.pop() { + Some(ClassState::Op { kind, lhs }) => (kind, lhs), + Some(state @ ClassState::Open { .. }) => { + stack.push(state); + return rhs; + } + None => unreachable!(), + }; + let span = Span::new(lhs.span().start, rhs.span().end); + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } +} + +impl<'s, P: Borrow> ParserI<'s, P> { + /// Parse the regular expression into an abstract syntax tree. + fn parse(&self) -> Result { + self.parse_with_comments().map(|astc| astc.ast) + } + + /// Parse the regular expression and return an abstract syntax tree with + /// all of the comments found in the pattern. + fn parse_with_comments(&self) -> Result { + assert_eq!(self.offset(), 0, "parser can only be used once"); + self.parser().reset(); + let mut concat = ast::Concat { span: self.span(), asts: vec![] }; + loop { + self.bump_space(); + if self.is_eof() { + break; + } + match self.char() { + '(' => concat = self.push_group(concat)?, + ')' => concat = self.pop_group(concat)?, + '|' => concat = self.push_alternate(concat)?, + '[' => { + let class = self.parse_set_class()?; + concat.asts.push(Ast::Class(class)); + } + '?' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::ZeroOrOne, + )?; + } + '*' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::ZeroOrMore, + )?; + } + '+' => { + concat = self.parse_uncounted_repetition( + concat, + ast::RepetitionKind::OneOrMore, + )?; + } + '{' => { + concat = self.parse_counted_repetition(concat)?; + } + _ => concat.asts.push(self.parse_primitive()?.into_ast()), + } + } + let ast = self.pop_group_end(concat)?; + NestLimiter::new(self).check(&ast)?; + Ok(ast::WithComments { + ast, + comments: mem::replace( + &mut *self.parser().comments.borrow_mut(), + vec![], + ), + }) + } + + /// Parses an uncounted repetition operation. An uncounted repetition + /// operator includes ?, * and +, but does not include the {m,n} syntax. + /// The given `kind` should correspond to the operator observed by the + /// caller. + /// + /// This assumes that the parser is currently positioned at the repetition + /// operator and advances the parser to the first character after the + /// operator. (Note that the operator may include a single additional `?`, + /// which makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + #[inline(never)] + fn parse_uncounted_repetition( + &self, + mut concat: ast::Concat, + kind: ast::RepetitionKind, + ) -> Result { + assert!( + self.char() == '?' || self.char() == '*' || self.char() == '+' + ); + let op_start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + }; + match ast { + Ast::Empty(_) | Ast::Flags(_) => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + _ => {} + } + let mut greedy = true; + if self.bump() && self.char() == '?' { + greedy = false; + self.bump(); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: Span::new(op_start, self.pos()), + kind, + }, + greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parses a counted repetition operation. A counted repetition operator + /// corresponds to the {m,n} syntax, and does not include the ?, * or + + /// operators. + /// + /// This assumes that the parser is currently positioned at the opening `{` + /// and advances the parser to the first character after the operator. + /// (Note that the operator may include a single additional `?`, which + /// makes the operator ungreedy.) + /// + /// The caller should include the concatenation that is being built. The + /// concatenation returned includes the repetition operator applied to the + /// last expression in the given concatenation. + #[inline(never)] + fn parse_counted_repetition( + &self, + mut concat: ast::Concat, + ) -> Result { + assert!(self.char() == '{'); + let start = self.pos(); + let ast = match concat.asts.pop() { + Some(ast) => ast, + None => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + }; + match ast { + Ast::Empty(_) | Ast::Flags(_) => { + return Err( + self.error(self.span(), ast::ErrorKind::RepetitionMissing) + ) + } + _ => {} + } + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + let count_start = specialize_err( + self.parse_decimal(), + ast::ErrorKind::DecimalEmpty, + ast::ErrorKind::RepetitionCountDecimalEmpty, + )?; + let mut range = ast::RepetitionRange::Exactly(count_start); + if self.is_eof() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() == ',' { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + if self.char() != '}' { + let count_end = specialize_err( + self.parse_decimal(), + ast::ErrorKind::DecimalEmpty, + ast::ErrorKind::RepetitionCountDecimalEmpty, + )?; + range = ast::RepetitionRange::Bounded(count_start, count_end); + } else { + range = ast::RepetitionRange::AtLeast(count_start); + } + } + if self.is_eof() || self.char() != '}' { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::RepetitionCountUnclosed, + )); + } + + let mut greedy = true; + if self.bump_and_bump_space() && self.char() == '?' { + greedy = false; + self.bump(); + } + + let op_span = Span::new(start, self.pos()); + if !range.is_valid() { + return Err( + self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) + ); + } + concat.asts.push(Ast::Repetition(ast::Repetition { + span: ast.span().with_end(self.pos()), + op: ast::RepetitionOp { + span: op_span, + kind: ast::RepetitionKind::Range(range), + }, + greedy, + ast: Box::new(ast), + })); + Ok(concat) + } + + /// Parse a group (which contains a sub-expression) or a set of flags. + /// + /// If a group was found, then it is returned with an empty AST. If a set + /// of flags is found, then that set is returned. + /// + /// The parser should be positioned at the opening parenthesis. + /// + /// This advances the parser to the character before the start of the + /// sub-expression (in the case of a group) or to the closing parenthesis + /// immediately following the set of flags. + /// + /// # Errors + /// + /// If flags are given and incorrectly specified, then a corresponding + /// error is returned. + /// + /// If a capture name is given and it is incorrectly specified, then a + /// corresponding error is returned. + #[inline(never)] + fn parse_group(&self) -> Result> { + assert_eq!(self.char(), '('); + let open_span = self.span_char(); + self.bump(); + self.bump_space(); + if self.is_lookaround_prefix() { + return Err(self.error( + Span::new(open_span.start, self.span().end), + ast::ErrorKind::UnsupportedLookAround, + )); + } + let inner_span = self.span(); + if self.bump_if("?P<") { + let capture_index = self.next_capture_index(open_span)?; + let cap = self.parse_capture_name(capture_index)?; + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureName(cap), + ast: Box::new(Ast::Empty(self.span())), + })) + } else if self.bump_if("?") { + if self.is_eof() { + return Err( + self.error(open_span, ast::ErrorKind::GroupUnclosed) + ); + } + let flags = self.parse_flags()?; + let char_end = self.char(); + self.bump(); + if char_end == ')' { + // We don't allow empty flags, e.g., `(?)`. We instead + // interpret it as a repetition operator missing its argument. + if flags.items.is_empty() { + return Err(self.error( + inner_span, + ast::ErrorKind::RepetitionMissing, + )); + } + Ok(Either::Left(ast::SetFlags { + span: Span { end: self.pos(), ..open_span }, + flags, + })) + } else { + assert_eq!(char_end, ':'); + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::NonCapturing(flags), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } else { + let capture_index = self.next_capture_index(open_span)?; + Ok(Either::Right(ast::Group { + span: open_span, + kind: ast::GroupKind::CaptureIndex(capture_index), + ast: Box::new(Ast::Empty(self.span())), + })) + } + } + + /// Parses a capture group name. Assumes that the parser is positioned at + /// the first character in the name following the opening `<` (and may + /// possibly be EOF). This advances the parser to the first character + /// following the closing `>`. + /// + /// The caller must provide the capture index of the group for this name. + #[inline(never)] + fn parse_capture_name( + &self, + capture_index: u32, + ) -> Result { + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); + } + let start = self.pos(); + loop { + if self.char() == '>' { + break; + } + if !is_capture_char(self.char(), self.pos() == start) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::GroupNameInvalid, + )); + } + if !self.bump() { + break; + } + } + let end = self.pos(); + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); + } + assert_eq!(self.char(), '>'); + self.bump(); + let name = &self.pattern()[start.offset..end.offset]; + if name.is_empty() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::GroupNameEmpty, + )); + } + let capname = ast::CaptureName { + span: Span::new(start, end), + name: name.to_string(), + index: capture_index, + }; + self.add_capture_name(&capname)?; + Ok(capname) + } + + /// Parse a sequence of flags starting at the current character. + /// + /// This advances the parser to the character immediately following the + /// flags, which is guaranteed to be either `:` or `)`. + /// + /// # Errors + /// + /// If any flags are duplicated, then an error is returned. + /// + /// If the negation operator is used more than once, then an error is + /// returned. + /// + /// If no flags could be found or if the negation operation is not followed + /// by any flags, then an error is returned. + #[inline(never)] + fn parse_flags(&self) -> Result { + let mut flags = ast::Flags { span: self.span(), items: vec![] }; + let mut last_was_negation = None; + while self.char() != ':' && self.char() != ')' { + if self.char() == '-' { + last_was_negation = Some(self.span_char()); + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Negation, + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagRepeatedNegation { + original: flags.items[i].span, + }, + )); + } + } else { + last_was_negation = None; + let item = ast::FlagsItem { + span: self.span_char(), + kind: ast::FlagsItemKind::Flag(self.parse_flag()?), + }; + if let Some(i) = flags.add_item(item) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::FlagDuplicate { + original: flags.items[i].span, + }, + )); + } + } + if !self.bump() { + return Err( + self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) + ); + } + } + if let Some(span) = last_was_negation { + return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); + } + flags.span.end = self.pos(); + Ok(flags) + } + + /// Parse the current character as a flag. Do not advance the parser. + /// + /// # Errors + /// + /// If the flag is not recognized, then an error is returned. + #[inline(never)] + fn parse_flag(&self) -> Result { + match self.char() { + 'i' => Ok(ast::Flag::CaseInsensitive), + 'm' => Ok(ast::Flag::MultiLine), + 's' => Ok(ast::Flag::DotMatchesNewLine), + 'U' => Ok(ast::Flag::SwapGreed), + 'u' => Ok(ast::Flag::Unicode), + 'x' => Ok(ast::Flag::IgnoreWhitespace), + _ => { + Err(self + .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) + } + } + } + + /// Parse a primitive AST. e.g., A literal, non-set character class or + /// assertion. + /// + /// This assumes that the parser expects a primitive at the current + /// location. i.e., All other non-primitive cases have been handled. + /// For example, if the parser's position is at `|`, then `|` will be + /// treated as a literal (e.g., inside a character class). + /// + /// This advances the parser to the first character immediately following + /// the primitive. + fn parse_primitive(&self) -> Result { + match self.char() { + '\\' => self.parse_escape(), + '.' => { + let ast = Primitive::Dot(self.span_char()); + self.bump(); + Ok(ast) + } + '^' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::StartLine, + }); + self.bump(); + Ok(ast) + } + '$' => { + let ast = Primitive::Assertion(ast::Assertion { + span: self.span_char(), + kind: ast::AssertionKind::EndLine, + }); + self.bump(); + Ok(ast) + } + c => { + let ast = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c, + }); + self.bump(); + Ok(ast) + } + } + } + + /// Parse an escape sequence as a primitive AST. + /// + /// This assumes the parser is positioned at the start of the escape + /// sequence, i.e., `\`. It advances the parser to the first position + /// immediately following the escape sequence. + #[inline(never)] + fn parse_escape(&self) -> Result { + assert_eq!(self.char(), '\\'); + let start = self.pos(); + if !self.bump() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let c = self.char(); + // Put some of the more complicated routines into helpers. + match c { + '0'..='7' => { + if !self.parser().octal { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + let mut lit = self.parse_octal(); + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + '8'..='9' if !self.parser().octal => { + return Err(self.error( + Span::new(start, self.span_char().end), + ast::ErrorKind::UnsupportedBackreference, + )); + } + 'x' | 'u' | 'U' => { + let mut lit = self.parse_hex()?; + lit.span.start = start; + return Ok(Primitive::Literal(lit)); + } + 'p' | 'P' => { + let mut cls = self.parse_unicode_class()?; + cls.span.start = start; + return Ok(Primitive::Unicode(cls)); + } + 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { + let mut cls = self.parse_perl_class(); + cls.span.start = start; + return Ok(Primitive::Perl(cls)); + } + _ => {} + } + + // Handle all of the one letter sequences inline. + self.bump(); + let span = Span::new(start, self.pos()); + if is_meta_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Punctuation, + c, + })); + } + let special = |kind, c| { + Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Special(kind), + c, + })) + }; + match c { + 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), + 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), + 't' => special(ast::SpecialLiteralKind::Tab, '\t'), + 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), + 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), + 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), + ' ' if self.ignore_whitespace() => { + special(ast::SpecialLiteralKind::Space, ' ') + } + 'A' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::StartText, + })), + 'z' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::EndText, + })), + 'b' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::WordBoundary, + })), + 'B' => Ok(Primitive::Assertion(ast::Assertion { + span, + kind: ast::AssertionKind::NotWordBoundary, + })), + _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), + } + } + + /// Parse an octal representation of a Unicode codepoint up to 3 digits + /// long. This expects the parser to be positioned at the first octal + /// digit and advances the parser to the first character immediately + /// following the octal number. This also assumes that parsing octal + /// escapes is enabled. + /// + /// Assuming the preconditions are met, this routine can never fail. + #[inline(never)] + fn parse_octal(&self) -> ast::Literal { + use std::char; + use std::u32; + + assert!(self.parser().octal); + assert!('0' <= self.char() && self.char() <= '7'); + let start = self.pos(); + // Parse up to two more digits. + while self.bump() + && '0' <= self.char() + && self.char() <= '7' + && self.pos().offset - start.offset <= 2 + {} + let end = self.pos(); + let octal = &self.pattern()[start.offset..end.offset]; + // Parsing the octal should never fail since the above guarantees a + // valid number. + let codepoint = + u32::from_str_radix(octal, 8).expect("valid octal number"); + // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no + // invalid Unicode scalar values. + let c = char::from_u32(codepoint).expect("Unicode scalar value"); + ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::Octal, + c, + } + } + + /// Parse a hex representation of a Unicode codepoint. This handles both + /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to + /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to + /// the first character immediately following the hexadecimal literal. + #[inline(never)] + fn parse_hex(&self) -> Result { + assert!( + self.char() == 'x' || self.char() == 'u' || self.char() == 'U' + ); + + let hex_kind = match self.char() { + 'x' => ast::HexLiteralKind::X, + 'u' => ast::HexLiteralKind::UnicodeShort, + _ => ast::HexLiteralKind::UnicodeLong, + }; + if !self.bump_and_bump_space() { + return Err( + self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) + ); + } + if self.char() == '{' { + self.parse_hex_brace(hex_kind) + } else { + self.parse_hex_digits(hex_kind) + } + } + + /// Parse an N-digit hex representation of a Unicode codepoint. This + /// expects the parser to be positioned at the first digit and will advance + /// the parser to the first character immediately following the escape + /// sequence. + /// + /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) + /// or 8 (for `\UNNNNNNNN`). + #[inline(never)] + fn parse_hex_digits( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let start = self.pos(); + for i in 0..kind.digits() { + if i > 0 && !self.bump_and_bump_space() { + return Err(self + .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); + } + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + // The final bump just moves the parser past the literal, which may + // be EOF. + self.bump_and_bump_space(); + let end = self.pos(); + let hex = scratch.as_str(); + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, end), + kind: ast::LiteralKind::HexFixed(kind), + c, + }), + } + } + + /// Parse a hex representation of any Unicode scalar value. This expects + /// the parser to be positioned at the opening brace `{` and will advance + /// the parser to the first character following the closing brace `}`. + #[inline(never)] + fn parse_hex_brace( + &self, + kind: ast::HexLiteralKind, + ) -> Result { + use std::char; + use std::u32; + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let brace_pos = self.pos(); + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + if !is_hex(self.char()) { + return Err(self.error( + self.span_char(), + ast::ErrorKind::EscapeHexInvalidDigit, + )); + } + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeUnexpectedEof, + )); + } + let end = self.pos(); + let hex = scratch.as_str(); + assert_eq!(self.char(), '}'); + self.bump_and_bump_space(); + + if hex.is_empty() { + return Err(self.error( + Span::new(brace_pos, self.pos()), + ast::ErrorKind::EscapeHexEmpty, + )); + } + match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { + None => Err(self.error( + Span::new(start, end), + ast::ErrorKind::EscapeHexInvalid, + )), + Some(c) => Ok(ast::Literal { + span: Span::new(start, self.pos()), + kind: ast::LiteralKind::HexBrace(kind), + c, + }), + } + } + + /// Parse a decimal number into a u32 while trimming leading and trailing + /// whitespace. + /// + /// This expects the parser to be positioned at the first position where + /// a decimal digit could occur. This will advance the parser to the byte + /// immediately following the last contiguous decimal digit. + /// + /// If no decimal digit could be found or if there was a problem parsing + /// the complete set of digits into a u32, then an error is returned. + fn parse_decimal(&self) -> Result { + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + while !self.is_eof() && self.char().is_whitespace() { + self.bump(); + } + let start = self.pos(); + while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { + scratch.push(self.char()); + self.bump_and_bump_space(); + } + let span = Span::new(start, self.pos()); + while !self.is_eof() && self.char().is_whitespace() { + self.bump_and_bump_space(); + } + let digits = scratch.as_str(); + if digits.is_empty() { + return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); + } + match u32::from_str_radix(digits, 10).ok() { + Some(n) => Ok(n), + None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), + } + } + + /// Parse a standard character class consisting primarily of characters or + /// character ranges, but can also contain nested character classes of + /// any type (sans `.`). + /// + /// This assumes the parser is positioned at the opening `[`. If parsing + /// is successful, then the parser is advanced to the position immediately + /// following the closing `]`. + #[inline(never)] + fn parse_set_class(&self) -> Result { + assert_eq!(self.char(), '['); + + let mut union = + ast::ClassSetUnion { span: self.span(), items: vec![] }; + loop { + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + match self.char() { + '[' => { + // If we've already parsed the opening bracket, then + // attempt to treat this as the beginning of an ASCII + // class. If ASCII class parsing fails, then the parser + // backs up to `[`. + if !self.parser().stack_class.borrow().is_empty() { + if let Some(cls) = self.maybe_parse_ascii_class() { + union.push(ast::ClassSetItem::Ascii(cls)); + continue; + } + } + union = self.push_class_open(union)?; + } + ']' => match self.pop_class(union)? { + Either::Left(nested_union) => { + union = nested_union; + } + Either::Right(class) => return Ok(class), + }, + '&' if self.peek() == Some('&') => { + assert!(self.bump_if("&&")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Intersection, + union, + ); + } + '-' if self.peek() == Some('-') => { + assert!(self.bump_if("--")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::Difference, + union, + ); + } + '~' if self.peek() == Some('~') => { + assert!(self.bump_if("~~")); + union = self.push_class_op( + ast::ClassSetBinaryOpKind::SymmetricDifference, + union, + ); + } + _ => { + union.push(self.parse_set_class_range()?); + } + } + } + } + + /// Parse a single primitive item in a character class set. The item to + /// be parsed can either be one of a simple literal character, a range + /// between two simple literal characters or a "primitive" character + /// class like \w or \p{Greek}. + /// + /// If an invalid escape is found, or if a character class is found where + /// a simple literal is expected (e.g., in a range), then an error is + /// returned. + #[inline(never)] + fn parse_set_class_range(&self) -> Result { + let prim1 = self.parse_set_class_item()?; + self.bump_space(); + if self.is_eof() { + return Err(self.unclosed_class_error()); + } + // If the next char isn't a `-`, then we don't have a range. + // There are two exceptions. If the char after a `-` is a `]`, then + // `-` is interpreted as a literal `-`. Alternatively, if the char + // after a `-` is a `-`, then `--` corresponds to a "difference" + // operation. + if self.char() != '-' + || self.peek_space() == Some(']') + || self.peek_space() == Some('-') + { + return prim1.into_class_set_item(self); + } + // OK, now we're parsing a range, so bump past the `-` and parse the + // second half of the range. + if !self.bump_and_bump_space() { + return Err(self.unclosed_class_error()); + } + let prim2 = self.parse_set_class_item()?; + let range = ast::ClassSetRange { + span: Span::new(prim1.span().start, prim2.span().end), + start: prim1.into_class_literal(self)?, + end: prim2.into_class_literal(self)?, + }; + if !range.is_valid() { + return Err( + self.error(range.span, ast::ErrorKind::ClassRangeInvalid) + ); + } + Ok(ast::ClassSetItem::Range(range)) + } + + /// Parse a single item in a character class as a primitive, where the + /// primitive either consists of a verbatim literal or a single escape + /// sequence. + /// + /// This assumes the parser is positioned at the beginning of a primitive, + /// and advances the parser to the first position after the primitive if + /// successful. + /// + /// Note that it is the caller's responsibility to report an error if an + /// illegal primitive was parsed. + #[inline(never)] + fn parse_set_class_item(&self) -> Result { + if self.char() == '\\' { + self.parse_escape() + } else { + let x = Primitive::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: self.char(), + }); + self.bump(); + Ok(x) + } + } + + /// Parses the opening of a character class set. This includes the opening + /// bracket along with `^` if present to indicate negation. This also + /// starts parsing the opening set of unioned items if applicable, since + /// there are special rules applied to certain characters in the opening + /// of a character class. For example, `[^]]` is the class of all + /// characters not equal to `]`. (`]` would need to be escaped in any other + /// position.) Similarly for `-`. + /// + /// In all cases, the op inside the returned `ast::ClassBracketed` is an + /// empty union. This empty union should be replaced with the actual item + /// when it is popped from the parser's stack. + /// + /// This assumes the parser is positioned at the opening `[` and advances + /// the parser to the first non-special byte of the character class. + /// + /// An error is returned if EOF is found. + #[inline(never)] + fn parse_set_class_open( + &self, + ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { + assert_eq!(self.char(), '['); + let start = self.pos(); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + + let negated = if self.char() != '^' { + false + } else { + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + true + }; + // Accept any number of `-` as literal `-`. + let mut union = + ast::ClassSetUnion { span: self.span(), items: vec![] }; + while self.char() == '-' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: '-', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, start), + ast::ErrorKind::ClassUnclosed, + )); + } + } + // If `]` is the *first* char in a set, then interpret it as a literal + // `]`. That is, an empty class is impossible to write. + if union.items.is_empty() && self.char() == ']' { + union.push(ast::ClassSetItem::Literal(ast::Literal { + span: self.span_char(), + kind: ast::LiteralKind::Verbatim, + c: ']', + })); + if !self.bump_and_bump_space() { + return Err(self.error( + Span::new(start, self.pos()), + ast::ErrorKind::ClassUnclosed, + )); + } + } + let set = ast::ClassBracketed { + span: Span::new(start, self.pos()), + negated, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: Span::new(union.span.start, union.span.start), + items: vec![], + }), + }; + Ok((set, union)) + } + + /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. + /// + /// This assumes the parser is positioned at the opening `[`. + /// + /// If no valid ASCII character class could be found, then this does not + /// advance the parser and `None` is returned. Otherwise, the parser is + /// advanced to the first byte following the closing `]` and the + /// corresponding ASCII class is returned. + #[inline(never)] + fn maybe_parse_ascii_class(&self) -> Option { + // ASCII character classes are interesting from a parsing perspective + // because parsing cannot fail with any interesting error. For example, + // in order to use an ASCII character class, it must be enclosed in + // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think + // of it as "ASCII character characters have the syntax `[:NAME:]` + // which can only appear within character brackets." This means that + // things like `[[:lower:]A]` are legal constructs. + // + // However, if one types an incorrect ASCII character class, e.g., + // `[[:loower:]]`, then we treat that as a normal nested character + // class containing the characters `:elorw`. One might argue that we + // should return an error instead since the repeated colons give away + // the intent to write an ASCII class. But what if the user typed + // `[[:lower]]` instead? How can we tell that was intended to be an + // ASCII class and not just a normal nested class? + // + // Reasonable people can probably disagree over this, but for better + // or worse, we implement semantics that never fails at the expense + // of better failure modes. + assert_eq!(self.char(), '['); + // If parsing fails, then we back up the parser to this starting point. + let start = self.pos(); + let mut negated = false; + if !self.bump() || self.char() != ':' { + self.parser().pos.set(start); + return None; + } + if !self.bump() { + self.parser().pos.set(start); + return None; + } + if self.char() == '^' { + negated = true; + if !self.bump() { + self.parser().pos.set(start); + return None; + } + } + let name_start = self.offset(); + while self.char() != ':' && self.bump() {} + if self.is_eof() { + self.parser().pos.set(start); + return None; + } + let name = &self.pattern()[name_start..self.offset()]; + if !self.bump_if(":]") { + self.parser().pos.set(start); + return None; + } + let kind = match ast::ClassAsciiKind::from_name(name) { + Some(kind) => kind, + None => { + self.parser().pos.set(start); + return None; + } + }; + Some(ast::ClassAscii { + span: Span::new(start, self.pos()), + kind, + negated, + }) + } + + /// Parse a Unicode class in either the single character notation, `\pN` + /// or the multi-character bracketed notation, `\p{Greek}`. This assumes + /// the parser is positioned at the `p` (or `P` for negation) and will + /// advance the parser to the character immediately following the class. + /// + /// Note that this does not check whether the class name is valid or not. + #[inline(never)] + fn parse_unicode_class(&self) -> Result { + assert!(self.char() == 'p' || self.char() == 'P'); + + let mut scratch = self.parser().scratch.borrow_mut(); + scratch.clear(); + + let negated = self.char() == 'P'; + if !self.bump_and_bump_space() { + return Err( + self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) + ); + } + let (start, kind) = if self.char() == '{' { + let start = self.span_char().end; + while self.bump_and_bump_space() && self.char() != '}' { + scratch.push(self.char()); + } + if self.is_eof() { + return Err(self + .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); + } + assert_eq!(self.char(), '}'); + self.bump(); + + let name = scratch.as_str(); + if let Some(i) = name.find("!=") { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: name[..i].to_string(), + value: name[i + 2..].to_string(), + }, + ) + } else if let Some(i) = name.find(':') { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: name[..i].to_string(), + value: name[i + 1..].to_string(), + }, + ) + } else if let Some(i) = name.find('=') { + ( + start, + ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: name[..i].to_string(), + value: name[i + 1..].to_string(), + }, + ) + } else { + (start, ast::ClassUnicodeKind::Named(name.to_string())) + } + } else { + let start = self.pos(); + let c = self.char(); + if c == '\\' { + return Err(self.error( + self.span_char(), + ast::ErrorKind::UnicodeClassInvalid, + )); + } + self.bump_and_bump_space(); + let kind = ast::ClassUnicodeKind::OneLetter(c); + (start, kind) + }; + Ok(ast::ClassUnicode { + span: Span::new(start, self.pos()), + negated, + kind, + }) + } + + /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the + /// parser is currently at a valid character class name and will be + /// advanced to the character immediately following the class. + #[inline(never)] + fn parse_perl_class(&self) -> ast::ClassPerl { + let c = self.char(); + let span = self.span_char(); + self.bump(); + let (negated, kind) = match c { + 'd' => (false, ast::ClassPerlKind::Digit), + 'D' => (true, ast::ClassPerlKind::Digit), + 's' => (false, ast::ClassPerlKind::Space), + 'S' => (true, ast::ClassPerlKind::Space), + 'w' => (false, ast::ClassPerlKind::Word), + 'W' => (true, ast::ClassPerlKind::Word), + c => panic!("expected valid Perl class but got '{}'", c), + }; + ast::ClassPerl { span, kind, negated } + } +} + +/// A type that traverses a fully parsed Ast and checks whether its depth +/// exceeds the specified nesting limit. If it does, then an error is returned. +#[derive(Debug)] +struct NestLimiter<'p, 's, P> { + /// The parser that is checking the nest limit. + p: &'p ParserI<'s, P>, + /// The current depth while walking an Ast. + depth: u32, +} + +impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { + fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { + NestLimiter { p, depth: 0 } + } + + #[inline(never)] + fn check(self, ast: &Ast) -> Result<()> { + ast::visit(ast, self) + } + + fn increment_depth(&mut self, span: &Span) -> Result<()> { + let new = self.depth.checked_add(1).ok_or_else(|| { + self.p.error( + span.clone(), + ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), + ) + })?; + let limit = self.p.parser().nest_limit; + if new > limit { + return Err(self.p.error( + span.clone(), + ast::ErrorKind::NestLimitExceeded(limit), + )); + } + self.depth = new; + Ok(()) + } + + fn decrement_depth(&mut self) { + // Assuming the correctness of the visitor, this should never drop + // below 0. + self.depth = self.depth.checked_sub(1).unwrap(); + } +} + +impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { + type Output = (); + type Err = ast::Error; + + fn finish(self) -> Result<()> { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + let span = match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => { + // These are all base cases, so we don't increment depth. + return Ok(()); + } + Ast::Class(ast::Class::Bracketed(ref x)) => &x.span, + Ast::Repetition(ref x) => &x.span, + Ast::Group(ref x) => &x.span, + Ast::Alternation(ref x) => &x.span, + Ast::Concat(ref x) => &x.span, + }; + self.increment_depth(span) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Literal(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => { + // These are all base cases, so we don't decrement depth. + Ok(()) + } + Ast::Class(ast::Class::Bracketed(_)) + | Ast::Repetition(_) + | Ast::Group(_) + | Ast::Alternation(_) + | Ast::Concat(_) => { + self.decrement_depth(); + Ok(()) + } + } + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + let span = match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Literal(_) + | ast::ClassSetItem::Range(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => { + // These are all base cases, so we don't increment depth. + return Ok(()); + } + ast::ClassSetItem::Bracketed(ref x) => &x.span, + ast::ClassSetItem::Union(ref x) => &x.span, + }; + self.increment_depth(span) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Literal(_) + | ast::ClassSetItem::Range(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => { + // These are all base cases, so we don't decrement depth. + Ok(()) + } + ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => { + self.decrement_depth(); + Ok(()) + } + } + } + + fn visit_class_set_binary_op_pre( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<()> { + self.increment_depth(&ast.span) + } + + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<()> { + self.decrement_depth(); + Ok(()) + } +} + +/// When the result is an error, transforms the ast::ErrorKind from the source +/// Result into another one. This function is used to return clearer error +/// messages when possible. +fn specialize_err( + result: Result, + from: ast::ErrorKind, + to: ast::ErrorKind, +) -> Result { + if let Err(e) = result { + if e.kind == from { + Err(ast::Error { kind: to, pattern: e.pattern, span: e.span }) + } else { + Err(e) + } + } else { + result + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use super::{Parser, ParserBuilder, ParserI, Primitive}; + use crate::ast::{self, Ast, Position, Span}; + + // Our own assert_eq, which has slightly better formatting (but honestly + // still kind of crappy). + macro_rules! assert_eq { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + if !(*left_val == *right_val) { + panic!( + "assertion failed: `(left == right)`\n\n\ + left: `{:?}`\nright: `{:?}`\n\n", + left_val, right_val + ) + } + } + } + }}; + } + + // We create these errors to compare with real ast::Errors in the tests. + // We define equality between TestError and ast::Error to disregard the + // pattern string in ast::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: ast::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &ast::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for ast::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn s(str: &str) -> String { + str.to_string() + } + + fn parser(pattern: &str) -> ParserI<'_, Parser> { + ParserI::new(Parser::new(), pattern) + } + + fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { + let parser = ParserBuilder::new().octal(true).build(); + ParserI::new(parser, pattern) + } + + fn parser_nest_limit( + pattern: &str, + nest_limit: u32, + ) -> ParserI<'_, Parser> { + let p = ParserBuilder::new().nest_limit(nest_limit).build(); + ParserI::new(p, pattern) + } + + fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { + let p = ParserBuilder::new().ignore_whitespace(true).build(); + ParserI::new(p, pattern) + } + + /// Short alias for creating a new span. + fn nspan(start: Position, end: Position) -> Span { + Span::new(start, end) + } + + /// Short alias for creating a new position. + fn npos(offset: usize, line: usize, column: usize) -> Position { + Position::new(offset, line, column) + } + + /// Create a new span from the given offset range. This assumes a single + /// line and sets the columns based on the offsets. i.e., This only works + /// out of the box for ASCII, which is fine for most tests. + fn span(range: Range) -> Span { + let start = Position::new(range.start, 1, range.start + 1); + let end = Position::new(range.end, 1, range.end + 1); + Span::new(start, end) + } + + /// Create a new span for the corresponding byte range in the given string. + fn span_range(subject: &str, range: Range) -> Span { + let start = Position { + offset: range.start, + line: 1 + subject[..range.start].matches('\n').count(), + column: 1 + subject[..range.start] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.start].chars().count()), + }; + let end = Position { + offset: range.end, + line: 1 + subject[..range.end].matches('\n').count(), + column: 1 + subject[..range.end] + .chars() + .rev() + .position(|c| c == '\n') + .unwrap_or(subject[..range.end].chars().count()), + }; + Span::new(start, end) + } + + /// Create a verbatim literal starting at the given position. + fn lit(c: char, start: usize) -> Ast { + lit_with(c, span(start..start + c.len_utf8())) + } + + /// Create a punctuation literal starting at the given position. + fn punct_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Punctuation, + c, + }) + } + + /// Create a verbatim literal with the given span. + fn lit_with(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Verbatim, + c, + }) + } + + /// Create a concatenation with the given range. + fn concat(range: Range, asts: Vec) -> Ast { + concat_with(span(range), asts) + } + + /// Create a concatenation with the given span. + fn concat_with(span: Span, asts: Vec) -> Ast { + Ast::Concat(ast::Concat { span, asts }) + } + + /// Create an alternation with the given span. + fn alt(range: Range, asts: Vec) -> Ast { + Ast::Alternation(ast::Alternation { span: span(range), asts }) + } + + /// Create a capturing group with the given span. + fn group(range: Range, index: u32, ast: Ast) -> Ast { + Ast::Group(ast::Group { + span: span(range), + kind: ast::GroupKind::CaptureIndex(index), + ast: Box::new(ast), + }) + } + + /// Create an ast::SetFlags. + /// + /// The given pattern should be the full pattern string. The range given + /// should correspond to the byte offsets where the flag set occurs. + /// + /// If negated is true, then the set is interpreted as beginning with a + /// negation. + fn flag_set( + pat: &str, + range: Range, + flag: ast::Flag, + negated: bool, + ) -> Ast { + let mut items = vec![ast::FlagsItem { + span: span_range(pat, (range.end - 2)..(range.end - 1)), + kind: ast::FlagsItemKind::Flag(flag), + }]; + if negated { + items.insert( + 0, + ast::FlagsItem { + span: span_range(pat, (range.start + 2)..(range.end - 2)), + kind: ast::FlagsItemKind::Negation, + }, + ); + } + Ast::Flags(ast::SetFlags { + span: span_range(pat, range.clone()), + flags: ast::Flags { + span: span_range(pat, (range.start + 2)..(range.end - 1)), + items, + }, + }) + } + + #[test] + fn parse_nest_limit() { + // A nest limit of 0 still allows some types of regexes. + assert_eq!( + parser_nest_limit("", 0).parse(), + Ok(Ast::Empty(span(0..0))) + ); + assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); + + // Test repetition operations, which require one level of nesting. + assert_eq!( + parser_nest_limit("a+", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("a+", 1).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser_nest_limit("(a)+", 1).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("a+*", 1).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("a+*", 2).parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })), + })) + ); + + // Test concatenations. A concatenation requires one level of nesting. + assert_eq!( + parser_nest_limit("ab", 0).parse().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("ab", 1).parse(), + Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])) + ); + assert_eq!( + parser_nest_limit("abc", 1).parse(), + Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])) + ); + + // Test alternations. An alternation requires one level of nesting. + assert_eq!( + parser_nest_limit("a|b", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("a|b", 1).parse(), + Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])) + ); + assert_eq!( + parser_nest_limit("a|b|c", 1).parse(), + Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])) + ); + + // Test character classes. Classes form their own mini-recursive + // syntax! + assert_eq!( + parser_nest_limit("[a]", 0).parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::NestLimitExceeded(0), + } + ); + assert_eq!( + parser_nest_limit("[a]", 1).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( + ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: 'a', + } + )), + }))) + ); + assert_eq!( + parser_nest_limit("[ab]", 1).parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::NestLimitExceeded(2), + } + ); + assert_eq!( + parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(3), + } + ); + assert_eq!( + parser_nest_limit("[a--b]", 1).parse().unwrap_err(), + TestError { + span: span(1..5), + kind: ast::ErrorKind::NestLimitExceeded(1), + } + ); + assert_eq!( + parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), + TestError { + span: span(4..6), + kind: ast::ErrorKind::NestLimitExceeded(2), + } + ); + } + + #[test] + fn parse_comments() { + let pat = "(?x) +# This is comment 1. +foo # This is comment 2. + # This is comment 3. +bar +# This is comment 4."; + let astc = parser(pat).parse_with_comments().unwrap(); + assert_eq!( + astc.ast, + concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('f', span_range(pat, 26..27)), + lit_with('o', span_range(pat, 27..28)), + lit_with('o', span_range(pat, 28..29)), + lit_with('b', span_range(pat, 74..75)), + lit_with('a', span_range(pat, 75..76)), + lit_with('r', span_range(pat, 76..77)), + ] + ) + ); + assert_eq!( + astc.comments, + vec![ + ast::Comment { + span: span_range(pat, 5..26), + comment: s(" This is comment 1."), + }, + ast::Comment { + span: span_range(pat, 30..51), + comment: s(" This is comment 2."), + }, + ast::Comment { + span: span_range(pat, 53..74), + comment: s(" This is comment 3."), + }, + ast::Comment { + span: span_range(pat, 78..98), + comment: s(" This is comment 4."), + }, + ] + ); + } + + #[test] + fn parse_holistic() { + assert_eq!(parser("]").parse(), Ok(lit(']', 0))); + assert_eq!( + parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), + Ok(concat( + 0..36, + vec![ + punct_lit('\\', span(0..2)), + punct_lit('.', span(2..4)), + punct_lit('+', span(4..6)), + punct_lit('*', span(6..8)), + punct_lit('?', span(8..10)), + punct_lit('(', span(10..12)), + punct_lit(')', span(12..14)), + punct_lit('|', span(14..16)), + punct_lit('[', span(16..18)), + punct_lit(']', span(18..20)), + punct_lit('{', span(20..22)), + punct_lit('}', span(22..24)), + punct_lit('^', span(24..26)), + punct_lit('$', span(26..28)), + punct_lit('#', span(28..30)), + punct_lit('&', span(30..32)), + punct_lit('-', span(32..34)), + punct_lit('~', span(34..36)), + ] + )) + ); + } + + #[test] + fn parse_ignore_whitespace() { + // Test that basic whitespace insensitivity works. + let pat = "(?x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + nspan(npos(0, 1, 1), npos(7, 1, 8)), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + ] + )) + ); + + // Test that we can toggle whitespace insensitivity. + let pat = "(?x)a b(?-x)a b"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + nspan(npos(0, 1, 1), npos(15, 1, 16)), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), + flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), + lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), + lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), + lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), + ] + )) + ); + + // Test that nesting whitespace insensitive flags works. + let pat = "a (?x:a )a "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..11), + vec![ + lit_with('a', span_range(pat, 0..1)), + lit_with(' ', span_range(pat, 1..2)), + Ast::Group(ast::Group { + span: span_range(pat, 2..9), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 4..5), + items: vec![ast::FlagsItem { + span: span_range(pat, 4..5), + kind: ast::FlagsItemKind::Flag( + ast::Flag::IgnoreWhitespace + ), + },], + }), + ast: Box::new(lit_with('a', span_range(pat, 6..7))), + }), + lit_with('a', span_range(pat, 9..10)), + lit_with(' ', span_range(pat, 10..11)), + ] + )) + ); + + // Test that whitespace after an opening paren is insignificant. + let pat = "(?x)( ?P a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + }), + ast: Box::new(lit_with('a', span_range(pat, 14..15))), + }), + ] + )) + ); + let pat = "(?x)( a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::CaptureIndex(1), + ast: Box::new(lit_with('a', span_range(pat, 7..8))), + }), + ] + )) + ); + let pat = "(?x)( ?: a )"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Group(ast::Group { + span: span_range(pat, 4..pat.len()), + kind: ast::GroupKind::NonCapturing(ast::Flags { + span: span_range(pat, 8..8), + items: vec![], + }), + ast: Box::new(lit_with('a', span_range(pat, 11..12))), + }), + ] + )) + ); + let pat = r"(?x)\x { 53 }"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span(4..13), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::X + ), + c: 'S', + }), + ] + )) + ); + + // Test that whitespace after an escape is OK. + let pat = r"(?x)\ "; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), + Ast::Literal(ast::Literal { + span: span_range(pat, 4..6), + kind: ast::LiteralKind::Special( + ast::SpecialLiteralKind::Space + ), + c: ' ', + }), + ] + )) + ); + // ... but only when `x` mode is enabled. + let pat = r"\ "; + assert_eq!( + parser(pat).parse().unwrap_err(), + TestError { + span: span_range(pat, 0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_newlines() { + let pat = ".\n."; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..3), + vec![ + Ast::Dot(span_range(pat, 0..1)), + lit_with('\n', span_range(pat, 1..2)), + Ast::Dot(span_range(pat, 2..3)), + ] + )) + ); + + let pat = "foobar\nbaz\nquux\n"; + assert_eq!( + parser(pat).parse(), + Ok(concat_with( + span_range(pat, 0..pat.len()), + vec![ + lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), + lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), + lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), + lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), + lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), + lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), + lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), + lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), + lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), + lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), + lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), + lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), + lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), + lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), + lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), + lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), + ] + )) + ); + } + + #[test] + fn parse_uncounted_repetition() { + assert_eq!( + parser(r"a*").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a+").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::OneOrMore, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a??").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a?b").parse(), + Ok(concat( + 0..3, + vec![ + Ast::Repetition(ast::Repetition { + span: span(0..2), + op: ast::RepetitionOp { + span: span(1..2), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 0)), + }), + lit('b', 2), + ] + )) + ); + assert_eq!( + parser(r"a??b").parse(), + Ok(concat( + 0..4, + vec![ + Ast::Repetition(ast::Repetition { + span: span(0..3), + op: ast::RepetitionOp { + span: span(1..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: false, + ast: Box::new(lit('a', 0)), + }), + lit('b', 3), + ] + )) + ); + assert_eq!( + parser(r"ab?").parse(), + Ok(concat( + 0..3, + vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ] + )) + ); + assert_eq!( + parser(r"(ab)?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(4..5), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(group( + 0..4, + 1, + concat(1..3, vec![lit('a', 1), lit('b', 2),]) + )), + })) + ); + assert_eq!( + parser(r"|a?").parse(), + Ok(alt( + 0..3, + vec![ + Ast::Empty(span(0..0)), + Ast::Repetition(ast::Repetition { + span: span(1..3), + op: ast::RepetitionOp { + span: span(2..3), + kind: ast::RepetitionKind::ZeroOrOne, + }, + greedy: true, + ast: Box::new(lit('a', 1)), + }), + ] + )) + ); + + assert_eq!( + parser(r"*").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?i)*").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(*)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?:?)").parse().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"+").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"?").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?)").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|*").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|+").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|?").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + } + + #[test] + fn parse_counted_repetition() { + assert_eq!( + parser(r"a{5}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..4), + op: ast::RepetitionOp { + span: span(1..4), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5,}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::AtLeast(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5,9}").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{5}?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..5), + op: ast::RepetitionOp { + span: span(1..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"ab{5}").parse(), + Ok(concat( + 0..5, + vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + ] + )) + ); + assert_eq!( + parser(r"ab{5}c").parse(), + Ok(concat( + 0..6, + vec![ + lit('a', 0), + Ast::Repetition(ast::Repetition { + span: span(1..5), + op: ast::RepetitionOp { + span: span(2..5), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('b', 1)), + }), + lit('c', 5), + ] + )) + ); + + assert_eq!( + parser(r"a{ 5 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..6), + op: ast::RepetitionOp { + span: span(1..6), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Exactly(5) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser(r"a{ 5 , 9 }").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..10), + op: ast::RepetitionOp { + span: span(1..10), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: true, + ast: Box::new(lit('a', 0)), + })) + ); + assert_eq!( + parser_ignore_whitespace(r"a{5,9} ?").parse(), + Ok(Ast::Repetition(ast::Repetition { + span: span(0..8), + op: ast::RepetitionOp { + span: span(1..8), + kind: ast::RepetitionKind::Range( + ast::RepetitionRange::Bounded(5, 9) + ), + }, + greedy: false, + ast: Box::new(lit('a', 0)), + })) + ); + + assert_eq!( + parser(r"(?i){0}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"(?m){1,1}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"a{]}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{1,]}").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{}").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{a").parse().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{9999999999}").parse().unwrap_err(), + TestError { + span: span(2..12), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + assert_eq!( + parser(r"a{9").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{9,a").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::RepetitionCountDecimalEmpty, + } + ); + assert_eq!( + parser(r"a{9,9999999999}").parse().unwrap_err(), + TestError { + span: span(4..14), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + assert_eq!( + parser(r"a{9,").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{9,11").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountUnclosed, + } + ); + assert_eq!( + parser(r"a{2,1}").parse().unwrap_err(), + TestError { + span: span(1..6), + kind: ast::ErrorKind::RepetitionCountInvalid, + } + ); + assert_eq!( + parser(r"{5}").parse().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + assert_eq!( + parser(r"|{5}").parse().unwrap_err(), + TestError { + span: span(1..1), + kind: ast::ErrorKind::RepetitionMissing, + } + ); + } + + #[test] + fn parse_alternate() { + assert_eq!( + parser(r"a|b").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..3), + asts: vec![lit('a', 0), lit('b', 2)], + })) + ); + assert_eq!( + parser(r"(a|b)").parse(), + Ok(group( + 0..5, + 1, + Ast::Alternation(ast::Alternation { + span: span(1..4), + asts: vec![lit('a', 1), lit('b', 3)], + }) + )) + ); + + assert_eq!( + parser(r"a|b|c").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..5), + asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], + })) + ); + assert_eq!( + parser(r"ax|by|cz").parse(), + Ok(Ast::Alternation(ast::Alternation { + span: span(0..8), + asts: vec![ + concat(0..2, vec![lit('a', 0), lit('x', 1)]), + concat(3..5, vec![lit('b', 3), lit('y', 4)]), + concat(6..8, vec![lit('c', 6), lit('z', 7)]), + ], + })) + ); + assert_eq!( + parser(r"(ax|by|cz)").parse(), + Ok(group( + 0..10, + 1, + Ast::Alternation(ast::Alternation { + span: span(1..9), + asts: vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + concat(4..6, vec![lit('b', 4), lit('y', 5)]), + concat(7..9, vec![lit('c', 7), lit('z', 8)]), + ], + }) + )) + ); + assert_eq!( + parser(r"(ax|(by|(cz)))").parse(), + Ok(group( + 0..14, + 1, + alt( + 1..13, + vec![ + concat(1..3, vec![lit('a', 1), lit('x', 2)]), + group( + 4..13, + 2, + alt( + 5..12, + vec![ + concat( + 5..7, + vec![lit('b', 5), lit('y', 6)] + ), + group( + 8..12, + 3, + concat( + 9..11, + vec![lit('c', 9), lit('z', 10),] + ) + ), + ] + ) + ), + ] + ) + )) + ); + + assert_eq!( + parser(r"|").parse(), + Ok(alt( + 0..1, + vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),] + )) + ); + assert_eq!( + parser(r"||").parse(), + Ok(alt( + 0..2, + vec![ + Ast::Empty(span(0..0)), + Ast::Empty(span(1..1)), + Ast::Empty(span(2..2)), + ] + )) + ); + assert_eq!( + parser(r"a|").parse(), + Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),])) + ); + assert_eq!( + parser(r"|a").parse(), + Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),])) + ); + + assert_eq!( + parser(r"(|)").parse(), + Ok(group( + 0..3, + 1, + alt( + 1..2, + vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),] + ) + )) + ); + assert_eq!( + parser(r"(a|)").parse(), + Ok(group( + 0..4, + 1, + alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),]) + )) + ); + assert_eq!( + parser(r"(|a)").parse(), + Ok(group( + 0..4, + 1, + alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),]) + )) + ); + + assert_eq!( + parser(r"a|b)").parse().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::GroupUnopened, + } + ); + assert_eq!( + parser(r"(a|b").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::GroupUnclosed, + } + ); + } + + #[test] + fn parse_unsupported_lookaround() { + assert_eq!( + parser(r"(?=a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?!a)").parse().unwrap_err(), + TestError { + span: span(0..3), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?<=a)").parse().unwrap_err(), + TestError { + span: span(0..4), + kind: ast::ErrorKind::UnsupportedLookAround, + } + ); + assert_eq!( + parser(r"(?z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..8), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + }), + ast: Box::new(lit('z', 6)), + })) + ); + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..11), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + }), + ast: Box::new(lit('z', 9)), + })) + ); + + assert_eq!( + parser("(?P<").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameUnexpectedEof, + } + ); + assert_eq!( + parser("(?P<>z)").parse().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::GroupNameEmpty, + } + ); + assert_eq!( + parser("(?Py)(?Pz)").parse().unwrap_err(), + TestError { + span: span(12..13), + kind: ast::ErrorKind::GroupNameDuplicate { + original: span(4..5), + }, + } + ); + } + + #[test] + fn parse_flags() { + assert_eq!( + parser("i:").parse_flags(), + Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + }) + ); + assert_eq!( + parser("i)").parse_flags(), + Ok(ast::Flags { + span: span(0..1), + items: vec![ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), + }], + }) + ); + + assert_eq!( + parser("isU:").parse_flags(), + Ok(ast::Flags { + span: span(0..3), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + + assert_eq!( + parser("-isU:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + assert_eq!( + parser("i-sU:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), + }, + ], + }) + ); + + assert_eq!( + parser("isU").parse_flags().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::FlagUnexpectedEof, + } + ); + assert_eq!( + parser("isUa:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("isUi:").parse_flags().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, + } + ); + assert_eq!( + parser("i-sU-i:").parse_flags().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::FlagRepeatedNegation { + original: span(1..2), + }, + } + ); + assert_eq!( + parser("-)").parse_flags().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + assert_eq!( + parser("i-)").parse_flags().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + assert_eq!( + parser("iU-)").parse_flags().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::FlagDanglingNegation, + } + ); + } + + #[test] + fn parse_flag() { + assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); + assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); + assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); + assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); + assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); + + assert_eq!( + parser("a").parse_flag().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + assert_eq!( + parser("☃").parse_flag().unwrap_err(), + TestError { + span: span_range("☃", 0..3), + kind: ast::ErrorKind::FlagUnrecognized, + } + ); + } + + #[test] + fn parse_primitive_non_escape() { + assert_eq!( + parser(r".").parse_primitive(), + Ok(Primitive::Dot(span(0..1))) + ); + assert_eq!( + parser(r"^").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::StartLine, + })) + ); + assert_eq!( + parser(r"$").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..1), + kind: ast::AssertionKind::EndLine, + })) + ); + + assert_eq!( + parser(r"a").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: 'a', + })) + ); + assert_eq!( + parser(r"|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..1), + kind: ast::LiteralKind::Verbatim, + c: '|', + })) + ); + assert_eq!( + parser(r"☃").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span_range("☃", 0..3), + kind: ast::LiteralKind::Verbatim, + c: '☃', + })) + ); + } + + #[test] + fn parse_escape() { + assert_eq!( + parser(r"\|").parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Punctuation, + c: '|', + })) + ); + let specials = &[ + (r"\a", '\x07', ast::SpecialLiteralKind::Bell), + (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), + (r"\t", '\t', ast::SpecialLiteralKind::Tab), + (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), + (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), + (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), + ]; + for &(pat, c, ref kind) in specials { + assert_eq!( + parser(pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Special(kind.clone()), + c, + })) + ); + } + assert_eq!( + parser(r"\A").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::StartText, + })) + ); + assert_eq!( + parser(r"\z").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::EndText, + })) + ); + assert_eq!( + parser(r"\b").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::WordBoundary, + })) + ); + assert_eq!( + parser(r"\B").parse_primitive(), + Ok(Primitive::Assertion(ast::Assertion { + span: span(0..2), + kind: ast::AssertionKind::NotWordBoundary, + })) + ); + + assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\y").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_unsupported_backreference() { + assert_eq!( + parser(r"\0").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + } + ); + assert_eq!( + parser(r"\9").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::UnsupportedBackreference, + } + ); + } + + #[test] + fn parse_octal() { + for i in 0..511 { + let pat = format!(r"\{:o}", i); + assert_eq!( + parser_octal(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::Octal, + c: ::std::char::from_u32(i).unwrap(), + })) + ); + } + assert_eq!( + parser_octal(r"\778").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + })) + ); + assert_eq!( + parser_octal(r"\7777").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + })) + ); + assert_eq!( + parser_octal(r"\778").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..3), + kind: ast::LiteralKind::Octal, + c: '?', + }), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: '8', + }), + ], + })) + ); + assert_eq!( + parser_octal(r"\7777").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..5), + asts: vec![ + Ast::Literal(ast::Literal { + span: span(0..4), + kind: ast::LiteralKind::Octal, + c: '\u{01FF}', + }), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: '7', + }), + ], + })) + ); + + assert_eq!( + parser_octal(r"\8").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + } + + #[test] + fn parse_hex_two() { + for i in 0..256 { + let pat = format!(r"\x{:02x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), + c: ::std::char::from_u32(i).unwrap(), + })) + ); + } + + assert_eq!( + parser(r"\xF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\xG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\xFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + } + + #[test] + fn parse_hex_four() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\u{:04x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeShort + ), + c, + })) + ); + } + + assert_eq!( + parser(r"\uF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\uG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\uD800").parse_escape().unwrap_err(), + TestError { + span: span(2..6), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + } + + #[test] + fn parse_hex_eight() { + for i in 0..65536 { + let c = match ::std::char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let pat = format!(r"\U{:08x}", i); + assert_eq!( + parser(&pat).parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..pat.len()), + kind: ast::LiteralKind::HexFixed( + ast::HexLiteralKind::UnicodeLong + ), + c, + })) + ); + } + + assert_eq!( + parser(r"\UF").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\UG").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFG").parse_escape().unwrap_err(), + TestError { + span: span(3..4), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFG").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFG").parse_escape().unwrap_err(), + TestError { + span: span(5..6), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(6..7), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(7..8), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(8..9), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), + TestError { + span: span(9..10), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + } + + #[test] + fn parse_hex_brace() { + assert_eq!( + parser(r"\u{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeShort + ), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\U{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace( + ast::HexLiteralKind::UnicodeLong + ), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{26c4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{26C4}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..8), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '⛄', + })) + ); + assert_eq!( + parser(r"\x{10fFfF}").parse_escape(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..10), + kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), + c: '\u{10FFFF}', + })) + ); + + assert_eq!( + parser(r"\x").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{").parse_escape().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{FF").parse_escape().unwrap_err(), + TestError { + span: span(2..5), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\x{}").parse_escape().unwrap_err(), + TestError { + span: span(2..4), + kind: ast::ErrorKind::EscapeHexEmpty, + } + ); + assert_eq!( + parser(r"\x{FGF}").parse_escape().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::EscapeHexInvalidDigit, + } + ); + assert_eq!( + parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..9), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + assert_eq!( + parser(r"\x{D800}").parse_escape().unwrap_err(), + TestError { + span: span(3..7), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + assert_eq!( + parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), + TestError { + span: span(3..12), + kind: ast::ErrorKind::EscapeHexInvalid, + } + ); + } + + #[test] + fn parse_decimal() { + assert_eq!(parser("123").parse_decimal(), Ok(123)); + assert_eq!(parser("0").parse_decimal(), Ok(0)); + assert_eq!(parser("01").parse_decimal(), Ok(1)); + + assert_eq!( + parser("-1").parse_decimal().unwrap_err(), + TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } + ); + assert_eq!( + parser("").parse_decimal().unwrap_err(), + TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } + ); + assert_eq!( + parser("9999999999").parse_decimal().unwrap_err(), + TestError { + span: span(0..10), + kind: ast::ErrorKind::DecimalInvalid, + } + ); + } + + #[test] + fn parse_set_class() { + fn union(span: Span, items: Vec) -> ast::ClassSet { + ast::ClassSet::union(ast::ClassSetUnion { span, items }) + } + + fn intersection( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind: ast::ClassSetBinaryOpKind::Intersection, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn difference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind: ast::ClassSetBinaryOpKind::Difference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn symdifference( + span: Span, + lhs: ast::ClassSet, + rhs: ast::ClassSet, + ) -> ast::ClassSet { + ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { + span, + kind: ast::ClassSetBinaryOpKind::SymmetricDifference, + lhs: Box::new(lhs), + rhs: Box::new(rhs), + }) + } + + fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { + ast::ClassSet::Item(item) + } + + fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { + ast::ClassSetItem::Ascii(cls) + } + + fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { + ast::ClassSetItem::Unicode(cls) + } + + fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { + ast::ClassSetItem::Perl(cls) + } + + fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { + ast::ClassSetItem::Bracketed(Box::new(cls)) + } + + fn lit(span: Span, c: char) -> ast::ClassSetItem { + ast::ClassSetItem::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Verbatim, + c, + }) + } + + fn empty(span: Span) -> ast::ClassSetItem { + ast::ClassSetItem::Empty(span) + } + + fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { + let pos1 = Position { + offset: span.start.offset + start.len_utf8(), + column: span.start.column + 1, + ..span.start + }; + let pos2 = Position { + offset: span.end.offset - end.len_utf8(), + column: span.end.column - 1, + ..span.end + }; + ast::ClassSetItem::Range(ast::ClassSetRange { + span, + start: ast::Literal { + span: Span { end: pos1, ..span }, + kind: ast::LiteralKind::Verbatim, + c: start, + }, + end: ast::Literal { + span: Span { start: pos2, ..span }, + kind: ast::LiteralKind::Verbatim, + c: end, + }, + }) + } + + fn alnum(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated } + } + + fn lower(span: Span, negated: bool) -> ast::ClassAscii { + ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated } + } + + assert_eq!( + parser("[[:alnum:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..11), + negated: false, + kind: itemset(item_ascii(alnum(span(1..10), false))), + }))) + ); + assert_eq!( + parser("[[[:alnum:]]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..13), + negated: false, + kind: itemset(item_bracket(ast::ClassBracketed { + span: span(1..12), + negated: false, + kind: itemset(item_ascii(alnum(span(2..11), false))), + })), + }))) + ); + assert_eq!( + parser("[[:alnum:]&&[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: intersection( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + }))) + ); + assert_eq!( + parser("[[:alnum:]--[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: difference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + }))) + ); + assert_eq!( + parser("[[:alnum:]~~[:lower:]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..22), + negated: false, + kind: symdifference( + span(1..21), + itemset(item_ascii(alnum(span(1..10), false))), + itemset(item_ascii(lower(span(12..21), false))), + ), + }))) + ); + + assert_eq!( + parser("[a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), 'a')), + }))) + ); + assert_eq!( + parser(r"[a\]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union( + span(1..4), + vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: ']', + }), + ] + ), + }))) + ); + assert_eq!( + parser(r"[a\-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union( + span(1..5), + vec![ + lit(span(1..2), 'a'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '-', + }), + lit(span(4..5), 'z'), + ] + ), + }))) + ); + assert_eq!( + parser("[ab]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] + ), + }))) + ); + assert_eq!( + parser("[a-]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] + ), + }))) + ); + assert_eq!( + parser("[-a]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: union( + span(1..3), + vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] + ), + }))) + ); + assert_eq!( + parser(r"[\pL]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(item_unicode(ast::ClassUnicode { + span: span(1..4), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('L'), + })), + }))) + ); + assert_eq!( + parser(r"[\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + }))) + ); + assert_eq!( + parser(r"[a\wz]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: union( + span(1..5), + vec![ + lit(span(1..2), 'a'), + item_perl(ast::ClassPerl { + span: span(2..4), + kind: ast::ClassPerlKind::Word, + negated: false, + }), + lit(span(4..5), 'z'), + ] + ), + }))) + ); + + assert_eq!( + parser("[a-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: itemset(range(span(1..4), 'a', 'z')), + }))) + ); + assert_eq!( + parser("[a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..8), + negated: false, + kind: union( + span(1..7), + vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ] + ), + }))) + ); + assert_eq!( + parser(r"[\w&&a-cx-z]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + itemset(item_perl(ast::ClassPerl { + span: span(1..3), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + union( + span(5..11), + vec![ + range(span(5..8), 'a', 'c'), + range(span(8..11), 'x', 'z'), + ] + ), + ), + }))) + ); + assert_eq!( + parser(r"[a-cx-z&&\w]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..12), + negated: false, + kind: intersection( + span(1..11), + union( + span(1..7), + vec![ + range(span(1..4), 'a', 'c'), + range(span(4..7), 'x', 'z'), + ] + ), + itemset(item_perl(ast::ClassPerl { + span: span(9..11), + kind: ast::ClassPerlKind::Word, + negated: false, + })), + ), + }))) + ); + assert_eq!( + parser(r"[a--b--c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: difference( + span(1..8), + difference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + }))) + ); + assert_eq!( + parser(r"[a~~b~~c]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..9), + negated: false, + kind: symdifference( + span(1..8), + symdifference( + span(1..5), + itemset(lit(span(1..2), 'a')), + itemset(lit(span(4..5), 'b')), + ), + itemset(lit(span(7..8), 'c')), + ), + }))) + ); + assert_eq!( + parser(r"[\^&&^]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '^', + })), + itemset(lit(span(5..6), '^')), + ), + }))) + ); + assert_eq!( + parser(r"[\&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..7), + negated: false, + kind: intersection( + span(1..6), + itemset(ast::ClassSetItem::Literal(ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '&', + })), + itemset(lit(span(5..6), '&')), + ), + }))) + ); + assert_eq!( + parser(r"[&&&&]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..6), + negated: false, + kind: intersection( + span(1..5), + intersection( + span(1..3), + itemset(empty(span(1..1))), + itemset(empty(span(3..3))), + ), + itemset(empty(span(5..5))), + ), + }))) + ); + + let pat = "[☃-⛄]"; + assert_eq!( + parser(pat).parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span_range(pat, 0..9), + negated: false, + kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { + span: span_range(pat, 1..8), + start: ast::Literal { + span: span_range(pat, 1..4), + kind: ast::LiteralKind::Verbatim, + c: '☃', + }, + end: ast::Literal { + span: span_range(pat, 5..8), + kind: ast::LiteralKind::Verbatim, + c: '⛄', + }, + })), + }))) + ); + + assert_eq!( + parser(r"[]]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: itemset(lit(span(1..2), ']')), + }))) + ); + assert_eq!( + parser(r"[]\[]").parse(), + Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..5), + negated: false, + kind: union( + span(1..4), + vec![ + lit(span(1..2), ']'), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..4), + kind: ast::LiteralKind::Punctuation, + c: '[', + }), + ] + ), + }))) + ); + assert_eq!( + parser(r"[\[]]").parse(), + Ok(concat( + 0..5, + vec![ + Ast::Class(ast::Class::Bracketed(ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: itemset(ast::ClassSetItem::Literal( + ast::Literal { + span: span(1..3), + kind: ast::LiteralKind::Punctuation, + c: '[', + } + )), + })), + Ast::Literal(ast::Literal { + span: span(4..5), + kind: ast::LiteralKind::Verbatim, + c: ']', + }), + ] + )) + ); + + assert_eq!( + parser("[").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[-]").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[[[:alnum:]").parse().unwrap_err(), + TestError { + span: span(1..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser(r"[\b]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassEscapeInvalid, + } + ); + assert_eq!( + parser(r"[\w-a]").parse().unwrap_err(), + TestError { + span: span(1..3), + kind: ast::ErrorKind::ClassRangeLiteral, + } + ); + assert_eq!( + parser(r"[a-\w]").parse().unwrap_err(), + TestError { + span: span(3..5), + kind: ast::ErrorKind::ClassRangeLiteral, + } + ); + assert_eq!( + parser(r"[z-a]").parse().unwrap_err(), + TestError { + span: span(1..4), + kind: ast::ErrorKind::ClassRangeInvalid, + } + ); + + assert_eq!( + parser_ignore_whitespace("[a ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser_ignore_whitespace("[a- ").parse().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + } + + #[test] + fn parse_set_class_open() { + assert_eq!(parser("[a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..1), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = + ast::ClassSetUnion { span: span(4..4), items: vec![] }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(4..4), + items: vec![], + }), + }; + let union = + ast::ClassSetUnion { span: span(4..4), items: vec![] }; + Ok((set, union)) + } + ); + assert_eq!(parser("[-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ - a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^-a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + assert_eq!(parser("[--a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..3), + items: vec![ + ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: '-', + }), + ], + }; + Ok((set, union)) + }); + assert_eq!(parser("[]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + }); + assert_eq!( + parser_ignore_whitespace("[ ] a]").parse_set_class_open(), + { + let set = ast::ClassBracketed { + span: span(0..4), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + } + ); + assert_eq!(parser("[^]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..3), + negated: true, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(2..2), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(2..3), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: ']', + })], + }; + Ok((set, union)) + }); + assert_eq!(parser("[-]a]").parse_set_class_open(), { + let set = ast::ClassBracketed { + span: span(0..2), + negated: false, + kind: ast::ClassSet::union(ast::ClassSetUnion { + span: span(1..1), + items: vec![], + }), + }; + let union = ast::ClassSetUnion { + span: span(1..2), + items: vec![ast::ClassSetItem::Literal(ast::Literal { + span: span(1..2), + kind: ast::LiteralKind::Verbatim, + c: '-', + })], + }; + Ok((set, union)) + }); + + assert_eq!( + parser("[").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser_ignore_whitespace("[ ") + .parse_set_class_open() + .unwrap_err(), + TestError { + span: span(0..5), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[^").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[]").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[-").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + assert_eq!( + parser("[--").parse_set_class_open().unwrap_err(), + TestError { + span: span(0..0), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + + // See: https://github.com/rust-lang/regex/issues/792 + assert_eq!( + parser("(?x)[-#]").parse_with_comments().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::ClassUnclosed, + } + ); + } + + #[test] + fn maybe_parse_ascii_class() { + assert_eq!( + parser(r"[:alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + }) + ); + assert_eq!( + parser(r"[:alnum:]A").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..9), + kind: ast::ClassAsciiKind::Alnum, + negated: false, + }) + ); + assert_eq!( + parser(r"[:^alnum:]").maybe_parse_ascii_class(), + Some(ast::ClassAscii { + span: span(0..10), + kind: ast::ClassAsciiKind::Alnum, + negated: true, + }) + ); + + let p = parser(r"[:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:^"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[^:alnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnnum:]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum]"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + + let p = parser(r"[:alnum:"); + assert_eq!(p.maybe_parse_ascii_class(), None); + assert_eq!(p.offset(), 0); + } + + #[test] + fn parse_unicode_class() { + assert_eq!( + parser(r"\pN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })) + ); + assert_eq!( + parser(r"\PN").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: true, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })) + ); + assert_eq!( + parser(r"\p{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("N")), + })) + ); + assert_eq!( + parser(r"\P{N}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: true, + kind: ast::ClassUnicodeKind::Named(s("N")), + })) + ); + assert_eq!( + parser(r"\p{Greek}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })) + ); + + assert_eq!( + parser(r"\p{scx:Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + assert_eq!( + parser(r"\p{scx=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..16), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + assert_eq!( + parser(r"\p{scx!=Katakana}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..17), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s("scx"), + value: s("Katakana"), + }, + })) + ); + + assert_eq!( + parser(r"\p{:}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Colon, + name: s(""), + value: s(""), + }, + })) + ); + assert_eq!( + parser(r"\p{=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..5), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::Equal, + name: s(""), + value: s(""), + }, + })) + ); + assert_eq!( + parser(r"\p{!=}").parse_escape(), + Ok(Primitive::Unicode(ast::ClassUnicode { + span: span(0..6), + negated: false, + kind: ast::ClassUnicodeKind::NamedValue { + op: ast::ClassUnicodeOpKind::NotEqual, + name: s(""), + value: s(""), + }, + })) + ); + + assert_eq!( + parser(r"\p").parse_escape().unwrap_err(), + TestError { + span: span(2..2), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{").parse_escape().unwrap_err(), + TestError { + span: span(3..3), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{N").parse_escape().unwrap_err(), + TestError { + span: span(4..4), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + assert_eq!( + parser(r"\p{Greek").parse_escape().unwrap_err(), + TestError { + span: span(8..8), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); + + assert_eq!( + parser(r"\pNz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..4), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..3), + negated: false, + kind: ast::ClassUnicodeKind::OneLetter('N'), + })), + Ast::Literal(ast::Literal { + span: span(3..4), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + assert_eq!( + parser(r"\p{Greek}z").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..10), + asts: vec![ + Ast::Class(ast::Class::Unicode(ast::ClassUnicode { + span: span(0..9), + negated: false, + kind: ast::ClassUnicodeKind::Named(s("Greek")), + })), + Ast::Literal(ast::Literal { + span: span(9..10), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + assert_eq!( + parser(r"\p\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + assert_eq!( + parser(r"\P\{").parse().unwrap_err(), + TestError { + span: span(2..3), + kind: ast::ErrorKind::UnicodeClassInvalid, + } + ); + } + + #[test] + fn parse_perl_class() { + assert_eq!( + parser(r"\d").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })) + ); + assert_eq!( + parser(r"\D").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: true, + })) + ); + assert_eq!( + parser(r"\s").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: false, + })) + ); + assert_eq!( + parser(r"\S").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Space, + negated: true, + })) + ); + assert_eq!( + parser(r"\w").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: false, + })) + ); + assert_eq!( + parser(r"\W").parse_escape(), + Ok(Primitive::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Word, + negated: true, + })) + ); + + assert_eq!( + parser(r"\d").parse(), + Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + }))) + ); + assert_eq!( + parser(r"\dz").parse(), + Ok(Ast::Concat(ast::Concat { + span: span(0..3), + asts: vec![ + Ast::Class(ast::Class::Perl(ast::ClassPerl { + span: span(0..2), + kind: ast::ClassPerlKind::Digit, + negated: false, + })), + Ast::Literal(ast::Literal { + span: span(2..3), + kind: ast::LiteralKind::Verbatim, + c: 'z', + }), + ], + })) + ); + } + + // This tests a bug fix where the nest limit checker wasn't decrementing + // its depth during post-traversal, which causes long regexes to trip + // the default limit too aggressively. + #[test] + fn regression_454_nest_too_big() { + let pattern = r#" + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4} + "#; + assert!(parser_nest_limit(pattern, 50).parse().is_ok()); + } + + // This tests that we treat a trailing `-` in a character class as a + // literal `-` even when whitespace mode is enabled and there is whitespace + // after the trailing `-`. + #[test] + fn regression_455_trailing_dash_ignore_whitespace() { + assert!(parser("(?x)[ / - ]").parse().is_ok()); + assert!(parser("(?x)[ a - ]").parse().is_ok()); + assert!(parser( + "(?x)[ + a + - ] + " + ) + .parse() + .is_ok()); + assert!(parser( + "(?x)[ + a # wat + - ] + " + ) + .parse() + .is_ok()); + + assert!(parser("(?x)[ / -").parse().is_err()); + assert!(parser("(?x)[ / - ").parse().is_err()); + assert!(parser( + "(?x)[ + / - + " + ) + .parse() + .is_err()); + assert!(parser( + "(?x)[ + / - # wat + " + ) + .parse() + .is_err()); + } +} diff --git a/vendor/regex-syntax/src/ast/print.rs b/vendor/regex-syntax/src/ast/print.rs new file mode 100644 index 0000000000..045de2eaf2 --- /dev/null +++ b/vendor/regex-syntax/src/ast/print.rs @@ -0,0 +1,568 @@ +/*! +This module provides a regular expression printer for `Ast`. +*/ + +use std::fmt; + +use crate::ast::visitor::{self, Visitor}; +use crate::ast::{self, Ast}; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression abstract syntax tree. +/// +/// A printer converts an abstract syntax tree (AST) to a regular expression +/// pattern string. This particular printer uses constant stack space and heap +/// space proportional to the size of the AST. +/// +/// This printer will not necessarily preserve the original formatting of the +/// regular expression pattern string. For example, all whitespace and comments +/// are ignored. +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. + pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { + visitor::visit(ast, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer { + wtr: W, +} + +impl Visitor for Writer { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { + match *ast { + Ast::Group(ref x) => self.fmt_group_pre(x), + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_post(&mut self, ast: &Ast) -> fmt::Result { + use crate::ast::Class; + + match *ast { + Ast::Empty(_) => Ok(()), + Ast::Flags(ref x) => self.fmt_set_flags(x), + Ast::Literal(ref x) => self.fmt_literal(x), + Ast::Dot(_) => self.wtr.write_str("."), + Ast::Assertion(ref x) => self.fmt_assertion(x), + Ast::Class(Class::Perl(ref x)) => self.fmt_class_perl(x), + Ast::Class(Class::Unicode(ref x)) => self.fmt_class_unicode(x), + Ast::Class(Class::Bracketed(ref x)) => { + self.fmt_class_bracketed_post(x) + } + Ast::Repetition(ref x) => self.fmt_repetition(x), + Ast::Group(ref x) => self.fmt_group_post(x), + Ast::Alternation(_) => Ok(()), + Ast::Concat(_) => Ok(()), + } + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + match *ast { + ast::ClassSetItem::Bracketed(ref x) => { + self.fmt_class_bracketed_pre(x) + } + _ => Ok(()), + } + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + use crate::ast::ClassSetItem::*; + + match *ast { + Empty(_) => Ok(()), + Literal(ref x) => self.fmt_literal(x), + Range(ref x) => { + self.fmt_literal(&x.start)?; + self.wtr.write_str("-")?; + self.fmt_literal(&x.end)?; + Ok(()) + } + Ascii(ref x) => self.fmt_class_ascii(x), + Unicode(ref x) => self.fmt_class_unicode(x), + Perl(ref x) => self.fmt_class_perl(x), + Bracketed(ref x) => self.fmt_class_bracketed_post(x), + Union(_) => Ok(()), + } + } + + fn visit_class_set_binary_op_in( + &mut self, + ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + self.fmt_class_set_binary_op_kind(&ast.kind) + } +} + +impl Writer { + fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { + use crate::ast::GroupKind::*; + match ast.kind { + CaptureIndex(_) => self.wtr.write_str("("), + CaptureName(ref x) => { + self.wtr.write_str("(?P<")?; + self.wtr.write_str(&x.name)?; + self.wtr.write_str(">")?; + Ok(()) + } + NonCapturing(ref flags) => { + self.wtr.write_str("(?")?; + self.fmt_flags(flags)?; + self.wtr.write_str(":")?; + Ok(()) + } + } + } + + fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result { + self.wtr.write_str(")") + } + + fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { + use crate::ast::RepetitionKind::*; + match ast.op.kind { + ZeroOrOne if ast.greedy => self.wtr.write_str("?"), + ZeroOrOne => self.wtr.write_str("??"), + ZeroOrMore if ast.greedy => self.wtr.write_str("*"), + ZeroOrMore => self.wtr.write_str("*?"), + OneOrMore if ast.greedy => self.wtr.write_str("+"), + OneOrMore => self.wtr.write_str("+?"), + Range(ref x) => { + self.fmt_repetition_range(x)?; + if !ast.greedy { + self.wtr.write_str("?")?; + } + Ok(()) + } + } + } + + fn fmt_repetition_range( + &mut self, + ast: &ast::RepetitionRange, + ) -> fmt::Result { + use crate::ast::RepetitionRange::*; + match *ast { + Exactly(x) => write!(self.wtr, "{{{}}}", x), + AtLeast(x) => write!(self.wtr, "{{{},}}", x), + Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y), + } + } + + fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result { + use crate::ast::LiteralKind::*; + + match ast.kind { + Verbatim => self.wtr.write_char(ast.c), + Punctuation => write!(self.wtr, r"\{}", ast.c), + Octal => write!(self.wtr, r"\{:o}", ast.c as u32), + HexFixed(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{:02X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{:04X}", ast.c as u32) + } + HexFixed(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{:08X}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::X) => { + write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeShort) => { + write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + } + HexBrace(ast::HexLiteralKind::UnicodeLong) => { + write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + } + Special(ast::SpecialLiteralKind::Bell) => { + self.wtr.write_str(r"\a") + } + Special(ast::SpecialLiteralKind::FormFeed) => { + self.wtr.write_str(r"\f") + } + Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"), + Special(ast::SpecialLiteralKind::LineFeed) => { + self.wtr.write_str(r"\n") + } + Special(ast::SpecialLiteralKind::CarriageReturn) => { + self.wtr.write_str(r"\r") + } + Special(ast::SpecialLiteralKind::VerticalTab) => { + self.wtr.write_str(r"\v") + } + Special(ast::SpecialLiteralKind::Space) => { + self.wtr.write_str(r"\ ") + } + } + } + + fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result { + use crate::ast::AssertionKind::*; + match ast.kind { + StartLine => self.wtr.write_str("^"), + EndLine => self.wtr.write_str("$"), + StartText => self.wtr.write_str(r"\A"), + EndText => self.wtr.write_str(r"\z"), + WordBoundary => self.wtr.write_str(r"\b"), + NotWordBoundary => self.wtr.write_str(r"\B"), + } + } + + fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result { + self.wtr.write_str("(?")?; + self.fmt_flags(&ast.flags)?; + self.wtr.write_str(")")?; + Ok(()) + } + + fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result { + use crate::ast::{Flag, FlagsItemKind}; + + for item in &ast.items { + match item.kind { + FlagsItemKind::Negation => self.wtr.write_str("-"), + FlagsItemKind::Flag(ref flag) => match *flag { + Flag::CaseInsensitive => self.wtr.write_str("i"), + Flag::MultiLine => self.wtr.write_str("m"), + Flag::DotMatchesNewLine => self.wtr.write_str("s"), + Flag::SwapGreed => self.wtr.write_str("U"), + Flag::Unicode => self.wtr.write_str("u"), + Flag::IgnoreWhitespace => self.wtr.write_str("x"), + }, + }?; + } + Ok(()) + } + + fn fmt_class_bracketed_pre( + &mut self, + ast: &ast::ClassBracketed, + ) -> fmt::Result { + if ast.negated { + self.wtr.write_str("[^") + } else { + self.wtr.write_str("[") + } + } + + fn fmt_class_bracketed_post( + &mut self, + _ast: &ast::ClassBracketed, + ) -> fmt::Result { + self.wtr.write_str("]") + } + + fn fmt_class_set_binary_op_kind( + &mut self, + ast: &ast::ClassSetBinaryOpKind, + ) -> fmt::Result { + use crate::ast::ClassSetBinaryOpKind::*; + match *ast { + Intersection => self.wtr.write_str("&&"), + Difference => self.wtr.write_str("--"), + SymmetricDifference => self.wtr.write_str("~~"), + } + } + + fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result { + use crate::ast::ClassPerlKind::*; + match ast.kind { + Digit if ast.negated => self.wtr.write_str(r"\D"), + Digit => self.wtr.write_str(r"\d"), + Space if ast.negated => self.wtr.write_str(r"\S"), + Space => self.wtr.write_str(r"\s"), + Word if ast.negated => self.wtr.write_str(r"\W"), + Word => self.wtr.write_str(r"\w"), + } + } + + fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result { + use crate::ast::ClassAsciiKind::*; + match ast.kind { + Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"), + Alnum => self.wtr.write_str("[:alnum:]"), + Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"), + Alpha => self.wtr.write_str("[:alpha:]"), + Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"), + Ascii => self.wtr.write_str("[:ascii:]"), + Blank if ast.negated => self.wtr.write_str("[:^blank:]"), + Blank => self.wtr.write_str("[:blank:]"), + Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"), + Cntrl => self.wtr.write_str("[:cntrl:]"), + Digit if ast.negated => self.wtr.write_str("[:^digit:]"), + Digit => self.wtr.write_str("[:digit:]"), + Graph if ast.negated => self.wtr.write_str("[:^graph:]"), + Graph => self.wtr.write_str("[:graph:]"), + Lower if ast.negated => self.wtr.write_str("[:^lower:]"), + Lower => self.wtr.write_str("[:lower:]"), + Print if ast.negated => self.wtr.write_str("[:^print:]"), + Print => self.wtr.write_str("[:print:]"), + Punct if ast.negated => self.wtr.write_str("[:^punct:]"), + Punct => self.wtr.write_str("[:punct:]"), + Space if ast.negated => self.wtr.write_str("[:^space:]"), + Space => self.wtr.write_str("[:space:]"), + Upper if ast.negated => self.wtr.write_str("[:^upper:]"), + Upper => self.wtr.write_str("[:upper:]"), + Word if ast.negated => self.wtr.write_str("[:^word:]"), + Word => self.wtr.write_str("[:word:]"), + Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"), + Xdigit => self.wtr.write_str("[:xdigit:]"), + } + } + + fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result { + use crate::ast::ClassUnicodeKind::*; + use crate::ast::ClassUnicodeOpKind::*; + + if ast.negated { + self.wtr.write_str(r"\P")?; + } else { + self.wtr.write_str(r"\p")?; + } + match ast.kind { + OneLetter(c) => self.wtr.write_char(c), + Named(ref x) => write!(self.wtr, "{{{}}}", x), + NamedValue { op: Equal, ref name, ref value } => { + write!(self.wtr, "{{{}={}}}", name, value) + } + NamedValue { op: Colon, ref name, ref value } => { + write!(self.wtr, "{{{}:{}}}", name, value) + } + NamedValue { op: NotEqual, ref name, ref value } => { + write!(self.wtr, "{{{}!={}}}", name, value) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::Printer; + use crate::ast::parse::ParserBuilder; + + fn roundtrip(given: &str) { + roundtrip_with(|b| b, given); + } + + fn roundtrip_with(mut f: F, given: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let ast = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&ast, &mut dst).unwrap(); + assert_eq!(given, dst); + } + + #[test] + fn print_literal() { + roundtrip("a"); + roundtrip(r"\["); + roundtrip_with(|b| b.octal(true), r"\141"); + roundtrip(r"\x61"); + roundtrip(r"\x7F"); + roundtrip(r"\u0061"); + roundtrip(r"\U00000061"); + roundtrip(r"\x{61}"); + roundtrip(r"\x{7F}"); + roundtrip(r"\u{61}"); + roundtrip(r"\U{61}"); + + roundtrip(r"\a"); + roundtrip(r"\f"); + roundtrip(r"\t"); + roundtrip(r"\n"); + roundtrip(r"\r"); + roundtrip(r"\v"); + roundtrip(r"(?x)\ "); + } + + #[test] + fn print_dot() { + roundtrip("."); + } + + #[test] + fn print_concat() { + roundtrip("ab"); + roundtrip("abcde"); + roundtrip("a(bcd)ef"); + } + + #[test] + fn print_alternation() { + roundtrip("a|b"); + roundtrip("a|b|c|d|e"); + roundtrip("|a|b|c|d|e"); + roundtrip("|a|b|c|d|e|"); + roundtrip("a(b|c|d)|e|f"); + } + + #[test] + fn print_assertion() { + roundtrip(r"^"); + roundtrip(r"$"); + roundtrip(r"\A"); + roundtrip(r"\z"); + roundtrip(r"\b"); + roundtrip(r"\B"); + } + + #[test] + fn print_repetition() { + roundtrip("a?"); + roundtrip("a??"); + roundtrip("a*"); + roundtrip("a*?"); + roundtrip("a+"); + roundtrip("a+?"); + roundtrip("a{5}"); + roundtrip("a{5}?"); + roundtrip("a{5,}"); + roundtrip("a{5,}?"); + roundtrip("a{5,10}"); + roundtrip("a{5,10}?"); + } + + #[test] + fn print_flags() { + roundtrip("(?i)"); + roundtrip("(?-i)"); + roundtrip("(?s-i)"); + roundtrip("(?-si)"); + roundtrip("(?siUmux)"); + } + + #[test] + fn print_group() { + roundtrip("(?i:a)"); + roundtrip("(?Pa)"); + roundtrip("(a)"); + } + + #[test] + fn print_class() { + roundtrip(r"[abc]"); + roundtrip(r"[a-z]"); + roundtrip(r"[^a-z]"); + roundtrip(r"[a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[-a-z0-9]"); + roundtrip(r"[a-z0-9---]"); + roundtrip(r"[a-z&&m-n]"); + roundtrip(r"[[a-z&&m-n]]"); + roundtrip(r"[a-z--m-n]"); + roundtrip(r"[a-z~~m-n]"); + roundtrip(r"[a-z[0-9]]"); + roundtrip(r"[a-z[^0-9]]"); + + roundtrip(r"\d"); + roundtrip(r"\D"); + roundtrip(r"\s"); + roundtrip(r"\S"); + roundtrip(r"\w"); + roundtrip(r"\W"); + + roundtrip(r"[[:alnum:]]"); + roundtrip(r"[[:^alnum:]]"); + roundtrip(r"[[:alpha:]]"); + roundtrip(r"[[:^alpha:]]"); + roundtrip(r"[[:ascii:]]"); + roundtrip(r"[[:^ascii:]]"); + roundtrip(r"[[:blank:]]"); + roundtrip(r"[[:^blank:]]"); + roundtrip(r"[[:cntrl:]]"); + roundtrip(r"[[:^cntrl:]]"); + roundtrip(r"[[:digit:]]"); + roundtrip(r"[[:^digit:]]"); + roundtrip(r"[[:graph:]]"); + roundtrip(r"[[:^graph:]]"); + roundtrip(r"[[:lower:]]"); + roundtrip(r"[[:^lower:]]"); + roundtrip(r"[[:print:]]"); + roundtrip(r"[[:^print:]]"); + roundtrip(r"[[:punct:]]"); + roundtrip(r"[[:^punct:]]"); + roundtrip(r"[[:space:]]"); + roundtrip(r"[[:^space:]]"); + roundtrip(r"[[:upper:]]"); + roundtrip(r"[[:^upper:]]"); + roundtrip(r"[[:word:]]"); + roundtrip(r"[[:^word:]]"); + roundtrip(r"[[:xdigit:]]"); + roundtrip(r"[[:^xdigit:]]"); + + roundtrip(r"\pL"); + roundtrip(r"\PL"); + roundtrip(r"\p{L}"); + roundtrip(r"\P{L}"); + roundtrip(r"\p{X=Y}"); + roundtrip(r"\P{X=Y}"); + roundtrip(r"\p{X:Y}"); + roundtrip(r"\P{X:Y}"); + roundtrip(r"\p{X!=Y}"); + roundtrip(r"\P{X!=Y}"); + } +} diff --git a/vendor/regex-syntax/src/ast/visitor.rs b/vendor/regex-syntax/src/ast/visitor.rs new file mode 100644 index 0000000000..78ee487cff --- /dev/null +++ b/vendor/regex-syntax/src/ast/visitor.rs @@ -0,0 +1,517 @@ +use std::fmt; + +use crate::ast::{self, Ast}; + +/// A trait for visiting an abstract syntax tree (AST) in depth first order. +/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on an abstract syntax tree without necessarily using recursion. +/// In particular, this permits callers to do case analysis with constant stack +/// usage, which can be important since the size of an abstract syntax tree +/// may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +/// +/// Note that the abstract syntax tree for a regular expression is quite +/// complex. Unless you specifically need it, you might be able to use the +/// much simpler +/// [high-level intermediate representation](../hir/struct.Hir.html) +/// and its +/// [corresponding `Visitor` trait](../hir/trait.Visitor.html) +/// instead. +pub trait Visitor { + /// The result of visiting an AST. + type Output; + /// An error that visiting an AST might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the AST or an error. + fn finish(self) -> Result; + + /// This method is called before beginning traversal of the AST. + fn start(&mut self) {} + + /// This method is called on an `Ast` before descending into child `Ast` + /// nodes. + fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Ast` after descending all of its child + /// `Ast` nodes. + fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an + /// [`Alternation`](struct.Alternation.html). + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// before descending into child nodes. + fn visit_class_set_item_pre( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetItem`](enum.ClassSetItem.html) + /// after descending into child nodes. + fn visit_class_set_item_post( + &mut self, + _ast: &ast::ClassSetItem, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// before descending into child nodes. + fn visit_class_set_binary_op_pre( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on every + /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) + /// after descending into child nodes. + fn visit_class_set_binary_op_post( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between the left hand and right hand child nodes + /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html). + fn visit_class_set_binary_op_in( + &mut self, + _ast: &ast::ClassSetBinaryOp, + ) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Ast` while calling the +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Ast` without using a stack size proportional to the depth +/// of the `Ast`. Namely, this method will instead use constant stack size, but +/// will use heap space proportional to the size of the `Ast`. This may be +/// desirable in cases where the size of `Ast` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit(ast: &Ast, visitor: V) -> Result { + HeapVisitor::new().visit(ast, visitor) +} + +/// HeapVisitor visits every item in an `Ast` recursively using constant stack +/// size and a heap size proportional to the size of the `Ast`. +struct HeapVisitor<'a> { + /// A stack of `Ast` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Ast, Frame<'a>)>, + /// Similar to the `Ast` stack above, but is used only for character + /// classes. In particular, character classes embed their own mini + /// recursive syntax. + stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Ast`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a ast::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a ast::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Ast, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Ast], + }, +} + +/// Represents a single stack frame while performing structural induction over +/// a character class. +enum ClassFrame<'a> { + /// The stack frame used while visiting every child node of a union of + /// character class items. + Union { + /// The child node we are currently visiting. + head: &'a ast::ClassSetItem, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [ast::ClassSetItem], + }, + /// The stack frame used while a binary class operation. + Binary { op: &'a ast::ClassSetBinaryOp }, + /// A stack frame allocated just before descending into a binary operator's + /// left hand child node. + BinaryLHS { + op: &'a ast::ClassSetBinaryOp, + lhs: &'a ast::ClassSet, + rhs: &'a ast::ClassSet, + }, + /// A stack frame allocated just before descending into a binary operator's + /// right hand child node. + BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet }, +} + +/// A representation of the inductive step when performing structural induction +/// over a character class. +/// +/// Note that there is no analogous explicit type for the inductive step for +/// `Ast` nodes because the inductive step is just an `Ast`. For character +/// classes, the inductive step can produce one of two possible child nodes: +/// an item or a binary operation. (An item cannot be a binary operation +/// because that would imply binary operations can be unioned in the concrete +/// syntax, which is not possible.) +enum ClassInduct<'a> { + Item(&'a ast::ClassSetItem), + BinaryOp(&'a ast::ClassSetBinaryOp), +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![], stack_class: vec![] } + } + + fn visit( + &mut self, + mut ast: &'a Ast, + mut visitor: V, + ) -> Result { + self.stack.clear(); + self.stack_class.clear(); + + visitor.start(); + loop { + visitor.visit_pre(ast)?; + if let Some(x) = self.induct(ast, &mut visitor)? { + let child = x.child(); + self.stack.push((ast, x)); + ast = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(ast)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation { .. } = x { + visitor.visit_alternation_in()?; + } + ast = x.child(); + self.stack.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this AST, so we can post visit it now. + visitor.visit_post(post_ast)?; + } + } + } + + /// Build a stack frame for the given AST if one is needed (which occurs if + /// and only if there are child nodes in the AST). Otherwise, return None. + /// + /// If this visits a class, then the underlying visitor implementation may + /// return an error which will be passed on here. + fn induct( + &mut self, + ast: &'a Ast, + visitor: &mut V, + ) -> Result>, V::Err> { + Ok(match *ast { + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.visit_class(x, visitor)?; + None + } + Ast::Repetition(ref x) => Some(Frame::Repetition(x)), + Ast::Group(ref x) => Some(Frame::Group(x)), + Ast::Concat(ref x) if x.asts.is_empty() => None, + Ast::Concat(ref x) => { + Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) + } + Ast::Alternation(ref x) if x.asts.is_empty() => None, + Ast::Alternation(ref x) => Some(Frame::Alternation { + head: &x.asts[0], + tail: &x.asts[1..], + }), + _ => None, + }) + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } + + fn visit_class( + &mut self, + ast: &'a ast::ClassBracketed, + visitor: &mut V, + ) -> Result<(), V::Err> { + let mut ast = ClassInduct::from_bracketed(ast); + loop { + self.visit_class_pre(&ast, visitor)?; + if let Some(x) = self.induct_class(&ast) { + let child = x.child(); + self.stack_class.push((ast, x)); + ast = child; + continue; + } + self.visit_class_post(&ast, visitor)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_ast, frame) = match self.stack_class.pop() { + None => return Ok(()), + Some((post_ast, frame)) => (post_ast, frame), + }; + // If this is a union or a binary op, then we might have + // additional inductive steps to process. + if let Some(x) = self.pop_class(frame) { + if let ClassFrame::BinaryRHS { ref op, .. } = x { + visitor.visit_class_set_binary_op_in(op)?; + } + ast = x.child(); + self.stack_class.push((post_ast, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this class node, so we can post visit it now. + self.visit_class_post(&post_ast, visitor)?; + } + } + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_pre( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + visitor.visit_class_set_item_pre(item)?; + } + ClassInduct::BinaryOp(op) => { + visitor.visit_class_set_binary_op_pre(op)?; + } + } + Ok(()) + } + + /// Call the appropriate `Visitor` methods given an inductive step. + fn visit_class_post( + &self, + ast: &ClassInduct<'a>, + visitor: &mut V, + ) -> Result<(), V::Err> { + match *ast { + ClassInduct::Item(item) => { + visitor.visit_class_set_item_post(item)?; + } + ClassInduct::BinaryOp(op) => { + visitor.visit_class_set_binary_op_post(op)?; + } + } + Ok(()) + } + + /// Build a stack frame for the given class node if one is needed (which + /// occurs if and only if there are child nodes). Otherwise, return None. + fn induct_class(&self, ast: &ClassInduct<'a>) -> Option> { + match *ast { + ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => { + match x.kind { + ast::ClassSet::Item(ref item) => { + Some(ClassFrame::Union { head: item, tail: &[] }) + } + ast::ClassSet::BinaryOp(ref op) => { + Some(ClassFrame::Binary { op }) + } + } + } + ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => { + if x.items.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &x.items[0], + tail: &x.items[1..], + }) + } + } + ClassInduct::BinaryOp(op) => { + Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop_class(&self, induct: ClassFrame<'a>) -> Option> { + match induct { + ClassFrame::Union { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(ClassFrame::Union { + head: &tail[0], + tail: &tail[1..], + }) + } + } + ClassFrame::Binary { .. } => None, + ClassFrame::BinaryLHS { op, rhs, .. } => { + Some(ClassFrame::BinaryRHS { op, rhs }) + } + ClassFrame::BinaryRHS { .. } => None, + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child AST node to visit. + fn child(&self) -> &'a Ast { + match *self { + Frame::Repetition(rep) => &rep.ast, + Frame::Group(group) => &group.ast, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} + +impl<'a> ClassFrame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child class node to visit. + fn child(&self) -> ClassInduct<'a> { + match *self { + ClassFrame::Union { head, .. } => ClassInduct::Item(head), + ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op), + ClassFrame::BinaryLHS { ref lhs, .. } => { + ClassInduct::from_set(lhs) + } + ClassFrame::BinaryRHS { ref rhs, .. } => { + ClassInduct::from_set(rhs) + } + } + } +} + +impl<'a> ClassInduct<'a> { + fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> { + ClassInduct::from_set(&ast.kind) + } + + fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> { + match *ast { + ast::ClassSet::Item(ref item) => ClassInduct::Item(item), + ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op), + } + } +} + +impl<'a> fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let x = match *self { + ClassFrame::Union { .. } => "Union", + ClassFrame::Binary { .. } => "Binary", + ClassFrame::BinaryLHS { .. } => "BinaryLHS", + ClassFrame::BinaryRHS { .. } => "BinaryRHS", + }; + write!(f, "{}", x) + } +} + +impl<'a> fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let x = match *self { + ClassInduct::Item(it) => match *it { + ast::ClassSetItem::Empty(_) => "Item(Empty)", + ast::ClassSetItem::Literal(_) => "Item(Literal)", + ast::ClassSetItem::Range(_) => "Item(Range)", + ast::ClassSetItem::Ascii(_) => "Item(Ascii)", + ast::ClassSetItem::Perl(_) => "Item(Perl)", + ast::ClassSetItem::Unicode(_) => "Item(Unicode)", + ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)", + ast::ClassSetItem::Union(_) => "Item(Union)", + }, + ClassInduct::BinaryOp(it) => match it.kind { + ast::ClassSetBinaryOpKind::Intersection => { + "BinaryOp(Intersection)" + } + ast::ClassSetBinaryOpKind::Difference => { + "BinaryOp(Difference)" + } + ast::ClassSetBinaryOpKind::SymmetricDifference => { + "BinaryOp(SymmetricDifference)" + } + }, + }; + write!(f, "{}", x) + } +} diff --git a/vendor/regex-syntax/src/either.rs b/vendor/regex-syntax/src/either.rs new file mode 100644 index 0000000000..7ae41e4ced --- /dev/null +++ b/vendor/regex-syntax/src/either.rs @@ -0,0 +1,8 @@ +/// A simple binary sum type. +/// +/// This is occasionally useful in an ad hoc fashion. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Either { + Left(Left), + Right(Right), +} diff --git a/vendor/regex-syntax/src/error.rs b/vendor/regex-syntax/src/error.rs new file mode 100644 index 0000000000..1230d2fc5d --- /dev/null +++ b/vendor/regex-syntax/src/error.rs @@ -0,0 +1,324 @@ +use std::cmp; +use std::error; +use std::fmt; +use std::result; + +use crate::ast; +use crate::hir; + +/// A type alias for dealing with errors returned by this crate. +pub type Result = result::Result; + +/// This error type encompasses any error that can be returned by this crate. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Error { + /// An error that occurred while translating concrete syntax into abstract + /// syntax (AST). + Parse(ast::Error), + /// An error that occurred while translating abstract syntax into a high + /// level intermediate representation (HIR). + Translate(hir::Error), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl From for Error { + fn from(err: ast::Error) -> Error { + Error::Parse(err) + } +} + +impl From for Error { + fn from(err: hir::Error) -> Error { + Error::Translate(err) + } +} + +impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + match *self { + Error::Parse(ref x) => x.description(), + Error::Translate(ref x) => x.description(), + _ => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Parse(ref x) => x.fmt(f), + Error::Translate(ref x) => x.fmt(f), + _ => unreachable!(), + } + } +} + +/// A helper type for formatting nice error messages. +/// +/// This type is responsible for reporting regex parse errors in a nice human +/// readable format. Most of its complexity is from interspersing notational +/// markers pointing out the position where an error occurred. +#[derive(Debug)] +pub struct Formatter<'e, E> { + /// The original regex pattern in which the error occurred. + pattern: &'e str, + /// The error kind. It must impl fmt::Display. + err: &'e E, + /// The primary span of the error. + span: &'e ast::Span, + /// An auxiliary and optional span, in case the error needs to point to + /// two locations (e.g., when reporting a duplicate capture group name). + aux_span: Option<&'e ast::Span>, +} + +impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> { + fn from(err: &'e ast::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: err.auxiliary_span(), + } + } +} + +impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { + fn from(err: &'e hir::Error) -> Self { + Formatter { + pattern: err.pattern(), + err: err.kind(), + span: err.span(), + aux_span: None, + } + } +} + +impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let spans = Spans::from_formatter(self); + if self.pattern.contains('\n') { + let divider = repeat_char('~', 79); + + writeln!(f, "regex parse error:")?; + writeln!(f, "{}", divider)?; + let notated = spans.notate(); + write!(f, "{}", notated)?; + writeln!(f, "{}", divider)?; + // If we have error spans that cover multiple lines, then we just + // note the line numbers. + if !spans.multi_line.is_empty() { + let mut notes = vec![]; + for span in &spans.multi_line { + notes.push(format!( + "on line {} (column {}) through line {} (column {})", + span.start.line, + span.start.column, + span.end.line, + span.end.column - 1 + )); + } + writeln!(f, "{}", notes.join("\n"))?; + } + write!(f, "error: {}", self.err)?; + } else { + writeln!(f, "regex parse error:")?; + let notated = Spans::from_formatter(self).notate(); + write!(f, "{}", notated)?; + write!(f, "error: {}", self.err)?; + } + Ok(()) + } +} + +/// This type represents an arbitrary number of error spans in a way that makes +/// it convenient to notate the regex pattern. ("Notate" means "point out +/// exactly where the error occurred in the regex pattern.") +/// +/// Technically, we can only ever have two spans given our current error +/// structure. However, after toiling with a specific algorithm for handling +/// two spans, it became obvious that an algorithm to handle an arbitrary +/// number of spans was actually much simpler. +struct Spans<'p> { + /// The original regex pattern string. + pattern: &'p str, + /// The total width that should be used for line numbers. The width is + /// used for left padding the line numbers for alignment. + /// + /// A value of `0` means line numbers should not be displayed. That is, + /// the pattern is itself only one line. + line_number_width: usize, + /// All error spans that occur on a single line. This sequence always has + /// length equivalent to the number of lines in `pattern`, where the index + /// of the sequence represents a line number, starting at `0`. The spans + /// in each line are sorted in ascending order. + by_line: Vec>, + /// All error spans that occur over one or more lines. That is, the start + /// and end position of the span have different line numbers. The spans are + /// sorted in ascending order. + multi_line: Vec, +} + +impl<'p> Spans<'p> { + /// Build a sequence of spans from a formatter. + fn from_formatter<'e, E: fmt::Display>( + fmter: &'p Formatter<'e, E>, + ) -> Spans<'p> { + let mut line_count = fmter.pattern.lines().count(); + // If the pattern ends with a `\n` literal, then our line count is + // off by one, since a span can occur immediately after the last `\n`, + // which is consider to be an additional line. + if fmter.pattern.ends_with('\n') { + line_count += 1; + } + let line_number_width = + if line_count <= 1 { 0 } else { line_count.to_string().len() }; + let mut spans = Spans { + pattern: &fmter.pattern, + line_number_width, + by_line: vec![vec![]; line_count], + multi_line: vec![], + }; + spans.add(fmter.span.clone()); + if let Some(span) = fmter.aux_span { + spans.add(span.clone()); + } + spans + } + + /// Add the given span to this sequence, putting it in the right place. + fn add(&mut self, span: ast::Span) { + // This is grossly inefficient since we sort after each add, but right + // now, we only ever add two spans at most. + if span.is_one_line() { + let i = span.start.line - 1; // because lines are 1-indexed + self.by_line[i].push(span); + self.by_line[i].sort(); + } else { + self.multi_line.push(span); + self.multi_line.sort(); + } + } + + /// Notate the pattern string with carents (`^`) pointing at each span + /// location. This only applies to spans that occur within a single line. + fn notate(&self) -> String { + let mut notated = String::new(); + for (i, line) in self.pattern.lines().enumerate() { + if self.line_number_width > 0 { + notated.push_str(&self.left_pad_line_number(i + 1)); + notated.push_str(": "); + } else { + notated.push_str(" "); + } + notated.push_str(line); + notated.push('\n'); + if let Some(notes) = self.notate_line(i) { + notated.push_str(¬es); + notated.push('\n'); + } + } + notated + } + + /// Return notes for the line indexed at `i` (zero-based). If there are no + /// spans for the given line, then `None` is returned. Otherwise, an + /// appropriately space padded string with correctly positioned `^` is + /// returned, accounting for line numbers. + fn notate_line(&self, i: usize) -> Option { + let spans = &self.by_line[i]; + if spans.is_empty() { + return None; + } + let mut notes = String::new(); + for _ in 0..self.line_number_padding() { + notes.push(' '); + } + let mut pos = 0; + for span in spans { + for _ in pos..(span.start.column - 1) { + notes.push(' '); + pos += 1; + } + let note_len = span.end.column.saturating_sub(span.start.column); + for _ in 0..cmp::max(1, note_len) { + notes.push('^'); + pos += 1; + } + } + Some(notes) + } + + /// Left pad the given line number with spaces such that it is aligned with + /// other line numbers. + fn left_pad_line_number(&self, n: usize) -> String { + let n = n.to_string(); + let pad = self.line_number_width.checked_sub(n.len()).unwrap(); + let mut result = repeat_char(' ', pad); + result.push_str(&n); + result + } + + /// Return the line number padding beginning at the start of each line of + /// the pattern. + /// + /// If the pattern is only one line, then this returns a fixed padding + /// for visual indentation. + fn line_number_padding(&self) -> usize { + if self.line_number_width == 0 { + 4 + } else { + 2 + self.line_number_width + } + } +} + +fn repeat_char(c: char, count: usize) -> String { + ::std::iter::repeat(c).take(count).collect() +} + +#[cfg(test)] +mod tests { + use crate::ast::parse::Parser; + + fn assert_panic_message(pattern: &str, expected_msg: &str) { + let result = Parser::new().parse(pattern); + match result { + Ok(_) => { + panic!("regex should not have parsed"); + } + Err(err) => { + assert_eq!(err.to_string(), expected_msg.trim()); + } + } + } + + // See: https://github.com/rust-lang/regex/issues/464 + #[test] + fn regression_464() { + let err = Parser::new().parse("a{\n").unwrap_err(); + // This test checks that the error formatter doesn't panic. + assert!(!err.to_string().is_empty()); + } + + // See: https://github.com/rust-lang/regex/issues/545 + #[test] + fn repetition_quantifier_expects_a_valid_decimal() { + assert_panic_message( + r"\\u{[^}]*}", + r#" +regex parse error: + \\u{[^}]*} + ^ +error: repetition quantifier expects a valid decimal +"#, + ); + } +} diff --git a/vendor/regex-syntax/src/hir/interval.rs b/vendor/regex-syntax/src/hir/interval.rs new file mode 100644 index 0000000000..56698c53af --- /dev/null +++ b/vendor/regex-syntax/src/hir/interval.rs @@ -0,0 +1,520 @@ +use std::char; +use std::cmp; +use std::fmt::Debug; +use std::slice; +use std::u8; + +use crate::unicode; + +// This module contains an *internal* implementation of interval sets. +// +// The primary invariant that interval sets guards is canonical ordering. That +// is, every interval set contains an ordered sequence of intervals where +// no two intervals are overlapping or adjacent. While this invariant is +// occasionally broken within the implementation, it should be impossible for +// callers to observe it. +// +// Since case folding (as implemented below) breaks that invariant, we roll +// that into this API even though it is a little out of place in an otherwise +// generic interval set. (Hence the reason why the `unicode` module is imported +// here.) +// +// Some of the implementation complexity here is a result of me wanting to +// preserve the sequential representation without using additional memory. +// In many cases, we do use linear extra memory, but it is at most 2x and it +// is amortized. If we relaxed the memory requirements, this implementation +// could become much simpler. The extra memory is honestly probably OK, but +// character classes (especially of the Unicode variety) can become quite +// large, and it would be nice to keep regex compilation snappy even in debug +// builds. (In the past, I have been careless with this area of code and it has +// caused slow regex compilations in debug mode, so this isn't entirely +// unwarranted.) +// +// Tests on this are relegated to the public API of HIR in src/hir.rs. + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct IntervalSet { + ranges: Vec, +} + +impl IntervalSet { + /// Create a new set from a sequence of intervals. Each interval is + /// specified as a pair of bounds, where both bounds are inclusive. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new>(intervals: T) -> IntervalSet { + let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + set.canonicalize(); + set + } + + /// Add a new interval to this set. + pub fn push(&mut self, interval: I) { + // TODO: This could be faster. e.g., Push the interval such that + // it preserves canonicalization. + self.ranges.push(interval); + self.canonicalize(); + } + + /// Return an iterator over all intervals in this set. + /// + /// The iterator yields intervals in ascending order. + pub fn iter(&self) -> IntervalSetIter<'_, I> { + IntervalSetIter(self.ranges.iter()) + } + + /// Return an immutable slice of intervals in this set. + /// + /// The sequence returned is in canonical ordering. + pub fn intervals(&self) -> &[I] { + &self.ranges + } + + /// Expand this interval set such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// This returns an error if the necessary case mapping data is not + /// available. + pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + let len = self.ranges.len(); + for i in 0..len { + let range = self.ranges[i]; + if let Err(err) = range.case_fold_simple(&mut self.ranges) { + self.canonicalize(); + return Err(err); + } + } + self.canonicalize(); + Ok(()) + } + + /// Union this set with the given set, in place. + pub fn union(&mut self, other: &IntervalSet) { + // This could almost certainly be done more efficiently. + self.ranges.extend(&other.ranges); + self.canonicalize(); + } + + /// Intersect this set with the given set, in place. + pub fn intersect(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() { + return; + } + if other.ranges.is_empty() { + self.ranges.clear(); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the intersection to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + let mut ita = 0..drain_end; + let mut itb = 0..other.ranges.len(); + let mut a = ita.next().unwrap(); + let mut b = itb.next().unwrap(); + loop { + if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { + self.ranges.push(ab); + } + let (it, aorb) = + if self.ranges[a].upper() < other.ranges[b].upper() { + (&mut ita, &mut a) + } else { + (&mut itb, &mut b) + }; + match it.next() { + Some(v) => *aorb = v, + None => break, + } + } + self.ranges.drain(..drain_end); + } + + /// Subtract the given set from this set, in place. + pub fn difference(&mut self, other: &IntervalSet) { + if self.ranges.is_empty() || other.ranges.is_empty() { + return; + } + + // This algorithm is (to me) surprisingly complex. A search of the + // interwebs indicate that this is a potentially interesting problem. + // Folks seem to suggest interval or segment trees, but I'd like to + // avoid the overhead (both runtime and conceptual) of that. + // + // The following is basically my Shitty First Draft. Therefore, in + // order to grok it, you probably need to read each line carefully. + // Simplifications are most welcome! + // + // Remember, we can assume the canonical format invariant here, which + // says that all ranges are sorted, not overlapping and not adjacent in + // each class. + let drain_end = self.ranges.len(); + let (mut a, mut b) = (0, 0); + 'LOOP: while a < drain_end && b < other.ranges.len() { + // Basically, the easy cases are when neither range overlaps with + // each other. If the `b` range is less than our current `a` + // range, then we can skip it and move on. + if other.ranges[b].upper() < self.ranges[a].lower() { + b += 1; + continue; + } + // ... similarly for the `a` range. If it's less than the smallest + // `b` range, then we can add it as-is. + if self.ranges[a].upper() < other.ranges[b].lower() { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + continue; + } + // Otherwise, we have overlapping ranges. + assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); + + // This part is tricky and was non-obvious to me without looking + // at explicit examples (see the tests). The trickiness stems from + // two things: 1) subtracting a range from another range could + // yield two ranges and 2) after subtracting a range, it's possible + // that future ranges can have an impact. The loop below advances + // the `b` ranges until they can't possible impact the current + // range. + // + // For example, if our `a` range is `a-t` and our next three `b` + // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply + // subtraction three times before moving on to the next `a` range. + let mut range = self.ranges[a]; + while b < other.ranges.len() + && !range.is_intersection_empty(&other.ranges[b]) + { + let old_range = range; + range = match range.difference(&other.ranges[b]) { + (None, None) => { + // We lost the entire range, so move on to the next + // without adding this one. + a += 1; + continue 'LOOP; + } + (Some(range1), None) | (None, Some(range1)) => range1, + (Some(range1), Some(range2)) => { + self.ranges.push(range1); + range2 + } + }; + // It's possible that the `b` range has more to contribute + // here. In particular, if it is greater than the original + // range, then it might impact the next `a` range *and* it + // has impacted the current `a` range as much as possible, + // so we can quit. We don't bump `b` so that the next `a` + // range can apply it. + if other.ranges[b].upper() > old_range.upper() { + break; + } + // Otherwise, the next `b` range might apply to the current + // `a` range. + b += 1; + } + self.ranges.push(range); + a += 1; + } + while a < drain_end { + let range = self.ranges[a]; + self.ranges.push(range); + a += 1; + } + self.ranges.drain(..drain_end); + } + + /// Compute the symmetric difference of the two sets, in place. + /// + /// This computes the symmetric difference of two interval sets. This + /// removes all elements in this set that are also in the given set, + /// but also adds all elements from the given set that aren't in this + /// set. That is, the set will contain all elements in either set, + /// but will not contain any elements that are in both sets. + pub fn symmetric_difference(&mut self, other: &IntervalSet) { + // TODO(burntsushi): Fix this so that it amortizes allocation. + let mut intersection = self.clone(); + intersection.intersect(other); + self.union(other); + self.difference(&intersection); + } + + /// Negate this interval set. + /// + /// For all `x` where `x` is any element, if `x` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + if self.ranges.is_empty() { + let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); + self.ranges.push(I::create(min, max)); + return; + } + + // There should be a way to do this in-place with constant memory, + // but I couldn't figure out a simple way to do it. So just append + // the negation to the end of this range, and then drain it before + // we're done. + let drain_end = self.ranges.len(); + + // We do checked arithmetic below because of the canonical ordering + // invariant. + if self.ranges[0].lower() > I::Bound::min_value() { + let upper = self.ranges[0].lower().decrement(); + self.ranges.push(I::create(I::Bound::min_value(), upper)); + } + for i in 1..drain_end { + let lower = self.ranges[i - 1].upper().increment(); + let upper = self.ranges[i].lower().decrement(); + self.ranges.push(I::create(lower, upper)); + } + if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { + let lower = self.ranges[drain_end - 1].upper().increment(); + self.ranges.push(I::create(lower, I::Bound::max_value())); + } + self.ranges.drain(..drain_end); + } + + /// Converts this set into a canonical ordering. + fn canonicalize(&mut self) { + if self.is_canonical() { + return; + } + self.ranges.sort(); + assert!(!self.ranges.is_empty()); + + // Is there a way to do this in-place with constant memory? I couldn't + // figure out a way to do it. So just append the canonicalization to + // the end of this range, and then drain it before we're done. + let drain_end = self.ranges.len(); + for oldi in 0..drain_end { + // If we've added at least one new range, then check if we can + // merge this range in the previously added range. + if self.ranges.len() > drain_end { + let (last, rest) = self.ranges.split_last_mut().unwrap(); + if let Some(union) = last.union(&rest[oldi]) { + *last = union; + continue; + } + } + let range = self.ranges[oldi]; + self.ranges.push(range); + } + self.ranges.drain(..drain_end); + } + + /// Returns true if and only if this class is in a canonical ordering. + fn is_canonical(&self) -> bool { + for pair in self.ranges.windows(2) { + if pair[0] >= pair[1] { + return false; + } + if pair[0].is_contiguous(&pair[1]) { + return false; + } + } + true + } +} + +/// An iterator over intervals. +#[derive(Debug)] +pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>); + +impl<'a, I> Iterator for IntervalSetIter<'a, I> { + type Item = &'a I; + + fn next(&mut self) -> Option<&'a I> { + self.0.next() + } +} + +pub trait Interval: + Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord +{ + type Bound: Bound; + + fn lower(&self) -> Self::Bound; + fn upper(&self) -> Self::Bound; + fn set_lower(&mut self, bound: Self::Bound); + fn set_upper(&mut self, bound: Self::Bound); + fn case_fold_simple( + &self, + intervals: &mut Vec, + ) -> Result<(), unicode::CaseFoldError>; + + /// Create a new interval. + fn create(lower: Self::Bound, upper: Self::Bound) -> Self { + let mut int = Self::default(); + if lower <= upper { + int.set_lower(lower); + int.set_upper(upper); + } else { + int.set_lower(upper); + int.set_upper(lower); + } + int + } + + /// Union the given overlapping range into this range. + /// + /// If the two ranges aren't contiguous, then this returns `None`. + fn union(&self, other: &Self) -> Option { + if !self.is_contiguous(other) { + return None; + } + let lower = cmp::min(self.lower(), other.lower()); + let upper = cmp::max(self.upper(), other.upper()); + Some(Self::create(lower, upper)) + } + + /// Intersect this range with the given range and return the result. + /// + /// If the intersection is empty, then this returns `None`. + fn intersect(&self, other: &Self) -> Option { + let lower = cmp::max(self.lower(), other.lower()); + let upper = cmp::min(self.upper(), other.upper()); + if lower <= upper { + Some(Self::create(lower, upper)) + } else { + None + } + } + + /// Subtract the given range from this range and return the resulting + /// ranges. + /// + /// If subtraction would result in an empty range, then no ranges are + /// returned. + fn difference(&self, other: &Self) -> (Option, Option) { + if self.is_subset(other) { + return (None, None); + } + if self.is_intersection_empty(other) { + return (Some(self.clone()), None); + } + let add_lower = other.lower() > self.lower(); + let add_upper = other.upper() < self.upper(); + // We know this because !self.is_subset(other) and the ranges have + // a non-empty intersection. + assert!(add_lower || add_upper); + let mut ret = (None, None); + if add_lower { + let upper = other.lower().decrement(); + ret.0 = Some(Self::create(self.lower(), upper)); + } + if add_upper { + let lower = other.upper().increment(); + let range = Self::create(lower, self.upper()); + if ret.0.is_none() { + ret.0 = Some(range); + } else { + ret.1 = Some(range); + } + } + ret + } + + /// Compute the symmetric difference the given range from this range. This + /// returns the union of the two ranges minus its intersection. + fn symmetric_difference( + &self, + other: &Self, + ) -> (Option, Option) { + let union = match self.union(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(union) => union, + }; + let intersection = match self.intersect(other) { + None => return (Some(self.clone()), Some(other.clone())), + Some(intersection) => intersection, + }; + union.difference(&intersection) + } + + /// Returns true if and only if the two ranges are contiguous. Two ranges + /// are contiguous if and only if the ranges are either overlapping or + /// adjacent. + fn is_contiguous(&self, other: &Self) -> bool { + let lower1 = self.lower().as_u32(); + let upper1 = self.upper().as_u32(); + let lower2 = other.lower().as_u32(); + let upper2 = other.upper().as_u32(); + cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) + } + + /// Returns true if and only if the intersection of this range and the + /// other range is empty. + fn is_intersection_empty(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + cmp::max(lower1, lower2) > cmp::min(upper1, upper2) + } + + /// Returns true if and only if this range is a subset of the other range. + fn is_subset(&self, other: &Self) -> bool { + let (lower1, upper1) = (self.lower(), self.upper()); + let (lower2, upper2) = (other.lower(), other.upper()); + (lower2 <= lower1 && lower1 <= upper2) + && (lower2 <= upper1 && upper1 <= upper2) + } +} + +pub trait Bound: + Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord +{ + fn min_value() -> Self; + fn max_value() -> Self; + fn as_u32(self) -> u32; + fn increment(self) -> Self; + fn decrement(self) -> Self; +} + +impl Bound for u8 { + fn min_value() -> Self { + u8::MIN + } + fn max_value() -> Self { + u8::MAX + } + fn as_u32(self) -> u32 { + self as u32 + } + fn increment(self) -> Self { + self.checked_add(1).unwrap() + } + fn decrement(self) -> Self { + self.checked_sub(1).unwrap() + } +} + +impl Bound for char { + fn min_value() -> Self { + '\x00' + } + fn max_value() -> Self { + '\u{10FFFF}' + } + fn as_u32(self) -> u32 { + self as u32 + } + + fn increment(self) -> Self { + match self { + '\u{D7FF}' => '\u{E000}', + c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + } + } + + fn decrement(self) -> Self { + match self { + '\u{E000}' => '\u{D7FF}', + c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + } + } +} + +// Tests for interval sets are written in src/hir.rs against the public API. diff --git a/vendor/regex-syntax/src/hir/literal/mod.rs b/vendor/regex-syntax/src/hir/literal/mod.rs new file mode 100644 index 0000000000..fbc5d3c975 --- /dev/null +++ b/vendor/regex-syntax/src/hir/literal/mod.rs @@ -0,0 +1,1686 @@ +/*! +Provides routines for extracting literal prefixes and suffixes from an `Hir`. +*/ + +use std::cmp; +use std::fmt; +use std::iter; +use std::mem; +use std::ops; + +use crate::hir::{self, Hir, HirKind}; + +/// A set of literal byte strings extracted from a regular expression. +/// +/// Every member of the set is a `Literal`, which is represented by a +/// `Vec`. (Notably, it may contain invalid UTF-8.) Every member is +/// said to be either *complete* or *cut*. A complete literal means that +/// it extends until the beginning (or end) of the regular expression. In +/// some circumstances, this can be used to indicate a match in the regular +/// expression. +/// +/// A key aspect of literal extraction is knowing when to stop. It is not +/// feasible to blindly extract all literals from a regular expression, even if +/// there are finitely many. For example, the regular expression `[0-9]{10}` +/// has `10^10` distinct literals. For this reason, literal extraction is +/// bounded to some low number by default using heuristics, but the limits can +/// be tweaked. +/// +/// **WARNING**: Literal extraction uses stack space proportional to the size +/// of the `Hir` expression. At some point, this drawback will be eliminated. +/// To protect yourself, set a reasonable +/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). +/// This is done for you by default. +#[derive(Clone, Eq, PartialEq)] +pub struct Literals { + lits: Vec, + limit_size: usize, + limit_class: usize, +} + +/// A single member of a set of literals extracted from a regular expression. +/// +/// This type has `Deref` and `DerefMut` impls to `Vec` so that all slice +/// and `Vec` operations are available. +#[derive(Clone, Eq, Ord)] +pub struct Literal { + v: Vec, + cut: bool, +} + +impl Literals { + /// Returns a new empty set of literals using default limits. + pub fn empty() -> Literals { + Literals { lits: vec![], limit_size: 250, limit_class: 10 } + } + + /// Returns a set of literal prefixes extracted from the given `Hir`. + pub fn prefixes(expr: &Hir) -> Literals { + let mut lits = Literals::empty(); + lits.union_prefixes(expr); + lits + } + + /// Returns a set of literal suffixes extracted from the given `Hir`. + pub fn suffixes(expr: &Hir) -> Literals { + let mut lits = Literals::empty(); + lits.union_suffixes(expr); + lits + } + + /// Get the approximate size limit (in bytes) of this set. + pub fn limit_size(&self) -> usize { + self.limit_size + } + + /// Set the approximate size limit (in bytes) of this set. + /// + /// If extracting a literal would put the set over this limit, then + /// extraction stops. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { + self.limit_size = size; + self + } + + /// Get the character class size limit for this set. + pub fn limit_class(&self) -> usize { + self.limit_class + } + + /// Limits the size of character(or byte) classes considered. + /// + /// A value of `0` prevents all character classes from being considered. + /// + /// This limit also applies to case insensitive literals, since each + /// character in the case insensitive literal is converted to a class, and + /// then case folded. + /// + /// The new limits will only apply to additions to this set. Existing + /// members remain unchanged, even if the set exceeds the new limit. + pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { + self.limit_class = size; + self + } + + /// Returns the set of literals as a slice. Its order is unspecified. + pub fn literals(&self) -> &[Literal] { + &self.lits + } + + /// Returns the length of the smallest literal. + /// + /// Returns None is there are no literals in the set. + pub fn min_len(&self) -> Option { + let mut min = None; + for lit in &self.lits { + match min { + None => min = Some(lit.len()), + Some(m) if lit.len() < m => min = Some(lit.len()), + _ => {} + } + } + min + } + + /// Returns true if all members in this set are complete. + pub fn all_complete(&self) -> bool { + !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) + } + + /// Returns true if any member in this set is complete. + pub fn any_complete(&self) -> bool { + self.lits.iter().any(|lit| !lit.is_cut()) + } + + /// Returns true if this set contains an empty literal. + pub fn contains_empty(&self) -> bool { + self.lits.iter().any(|lit| lit.is_empty()) + } + + /// Returns true if this set is empty or if all of its members is empty. + pub fn is_empty(&self) -> bool { + self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) + } + + /// Returns a new empty set of literals using this set's limits. + pub fn to_empty(&self) -> Literals { + let mut lits = Literals::empty(); + lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); + lits + } + + /// Returns the longest common prefix of all members in this set. + pub fn longest_common_prefix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), + ); + } + &self.lits[0][..len] + } + + /// Returns the longest common suffix of all members in this set. + pub fn longest_common_suffix(&self) -> &[u8] { + if self.is_empty() { + return &[]; + } + let lit0 = &*self.lits[0]; + let mut len = lit0.len(); + for lit in &self.lits[1..] { + len = cmp::min( + len, + lit.iter() + .rev() + .zip(lit0.iter().rev()) + .take_while(|&(a, b)| a == b) + .count(), + ); + } + &self.lits[0][self.lits[0].len() - len..] + } + + /// Returns a new set of literals with the given number of bytes trimmed + /// from the suffix of each literal. + /// + /// If any literal would be cut out completely by trimming, then None is + /// returned. + /// + /// Any duplicates that are created as a result of this transformation are + /// removed. + pub fn trim_suffix(&self, num_bytes: usize) -> Option { + if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { + return None; + } + let mut new = self.to_empty(); + for mut lit in self.lits.iter().cloned() { + let new_len = lit.len() - num_bytes; + lit.truncate(new_len); + lit.cut(); + new.lits.push(lit); + } + new.lits.sort(); + new.lits.dedup(); + Some(new) + } + + /// Returns a new set of prefixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same starting position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_prefixes(&self) -> Literals { + if self.lits.is_empty() { + return self.to_empty(); + } + let mut old = self.lits.to_vec(); + let mut new = self.to_empty(); + 'OUTER: while let Some(mut candidate) = old.pop() { + if candidate.is_empty() { + continue; + } + if new.lits.is_empty() { + new.lits.push(candidate); + continue; + } + for lit2 in &mut new.lits { + if lit2.is_empty() { + continue; + } + if &candidate == lit2 { + // If the literal is already in the set, then we can + // just drop it. But make sure that cut literals are + // infectious! + candidate.cut = candidate.cut || lit2.cut; + lit2.cut = candidate.cut; + continue 'OUTER; + } + if candidate.len() < lit2.len() { + if let Some(i) = position(&candidate, &lit2) { + candidate.cut(); + let mut lit3 = lit2.clone(); + lit3.truncate(i); + lit3.cut(); + old.push(lit3); + lit2.clear(); + } + } else if let Some(i) = position(&lit2, &candidate) { + lit2.cut(); + let mut new_candidate = candidate.clone(); + new_candidate.truncate(i); + new_candidate.cut(); + old.push(new_candidate); + candidate.clear(); + } + // Oops, the candidate is already represented in the set. + if candidate.is_empty() { + continue 'OUTER; + } + } + new.lits.push(candidate); + } + new.lits.retain(|lit| !lit.is_empty()); + new.lits.sort(); + new.lits.dedup(); + new + } + + /// Returns a new set of suffixes of this set of literals that are + /// guaranteed to be unambiguous. + /// + /// Any substring match with a member of the set is returned is guaranteed + /// to never overlap with a substring match of another member of the set + /// at the same ending position. + /// + /// Given any two members of the returned set, neither is a substring of + /// the other. + pub fn unambiguous_suffixes(&self) -> Literals { + // This is a touch wasteful... + let mut lits = self.clone(); + lits.reverse(); + let mut unamb = lits.unambiguous_prefixes(); + unamb.reverse(); + unamb + } + + /// Unions the prefixes from the given expression to this set. + /// + /// If prefixes could not be added (for example, this set would exceed its + /// size limits or the set of prefixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the beginning of `expr` to the + /// end of `expr`. + pub fn union_prefixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + prefixes(expr, &mut lits); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions the suffixes from the given expression to this set. + /// + /// If suffixes could not be added (for example, this set would exceed its + /// size limits or the set of suffixes from `expr` includes the empty + /// string), then false is returned. + /// + /// Note that prefix literals extracted from `expr` are said to be complete + /// if and only if the literal extends from the end of `expr` to the + /// beginning of `expr`. + pub fn union_suffixes(&mut self, expr: &Hir) -> bool { + let mut lits = self.to_empty(); + suffixes(expr, &mut lits); + lits.reverse(); + !lits.is_empty() && !lits.contains_empty() && self.union(lits) + } + + /// Unions this set with another set. + /// + /// If the union would cause the set to exceed its limits, then the union + /// is skipped and it returns false. Otherwise, if the union succeeds, it + /// returns true. + pub fn union(&mut self, lits: Literals) -> bool { + if self.num_bytes() + lits.num_bytes() > self.limit_size { + return false; + } + if lits.is_empty() { + self.lits.push(Literal::empty()); + } else { + self.lits.extend(lits.lits); + } + true + } + + /// Extends this set with another set. + /// + /// The set of literals is extended via a cross product. + /// + /// If a cross product would cause this set to exceed its limits, then the + /// cross product is skipped and it returns false. Otherwise, if the cross + /// product succeeds, it returns true. + pub fn cross_product(&mut self, lits: &Literals) -> bool { + if lits.is_empty() { + return true; + } + // Check that we make sure we stay in our limits. + let mut size_after; + if self.is_empty() || !self.any_complete() { + size_after = self.num_bytes(); + for lits_lit in lits.literals() { + size_after += lits_lit.len(); + } + } else { + size_after = self.lits.iter().fold(0, |accum, lit| { + accum + if lit.is_cut() { lit.len() } else { 0 } + }); + for lits_lit in lits.literals() { + for self_lit in self.literals() { + if !self_lit.is_cut() { + size_after += self_lit.len() + lits_lit.len(); + } + } + } + } + if size_after > self.limit_size { + return false; + } + + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for lits_lit in lits.literals() { + for mut self_lit in base.clone() { + self_lit.extend(&**lits_lit); + self_lit.cut = lits_lit.cut; + self.lits.push(self_lit); + } + } + true + } + + /// Extends each literal in this set with the bytes given. + /// + /// If the set is empty, then the given literal is added to the set. + /// + /// If adding any number of bytes to all members of this set causes a limit + /// to be exceeded, then no bytes are added and false is returned. If a + /// prefix of `bytes` can be fit into this set, then it is used and all + /// resulting literals are cut. + pub fn cross_add(&mut self, bytes: &[u8]) -> bool { + // N.B. This could be implemented by simply calling cross_product with + // a literal set containing just `bytes`, but we can be smarter about + // taking shorter prefixes of `bytes` if they'll fit. + if bytes.is_empty() { + return true; + } + if self.lits.is_empty() { + let i = cmp::min(self.limit_size, bytes.len()); + self.lits.push(Literal::new(bytes[..i].to_owned())); + self.lits[0].cut = i < bytes.len(); + return !self.lits[0].is_cut(); + } + let size = self.num_bytes(); + if size + self.lits.len() >= self.limit_size { + return false; + } + let mut i = 1; + while size + (i * self.lits.len()) <= self.limit_size + && i < bytes.len() + { + i += 1; + } + for lit in &mut self.lits { + if !lit.is_cut() { + lit.extend(&bytes[..i]); + if i < bytes.len() { + lit.cut(); + } + } + } + true + } + + /// Adds the given literal to this set. + /// + /// Returns false if adding this literal would cause the class to be too + /// big. + pub fn add(&mut self, lit: Literal) -> bool { + if self.num_bytes() + lit.len() > self.limit_size { + return false; + } + self.lits.push(lit); + true + } + + /// Extends each literal in this set with the character class given. + /// + /// Returns false if the character class was too big to add. + pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, false) + } + + /// Extends each literal in this set with the character class given, + /// writing the bytes of each character in reverse. + /// + /// Returns false if the character class was too big to add. + fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { + self._add_char_class(cls, true) + } + + fn _add_char_class( + &mut self, + cls: &hir::ClassUnicode, + reverse: bool, + ) -> bool { + use std::char; + + if self.class_exceeds_limits(cls_char_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for c in (s..e).filter_map(char::from_u32) { + for mut lit in base.clone() { + let mut bytes = c.to_string().into_bytes(); + if reverse { + bytes.reverse(); + } + lit.extend(&bytes); + self.lits.push(lit); + } + } + } + true + } + + /// Extends each literal in this set with the byte class given. + /// + /// Returns false if the byte class was too big to add. + pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { + if self.class_exceeds_limits(cls_byte_count(cls)) { + return false; + } + let mut base = self.remove_complete(); + if base.is_empty() { + base = vec![Literal::empty()]; + } + for r in cls.iter() { + let (s, e) = (r.start as u32, r.end as u32 + 1); + for b in (s..e).map(|b| b as u8) { + for mut lit in base.clone() { + lit.push(b); + self.lits.push(lit); + } + } + } + true + } + + /// Cuts every member of this set. When a member is cut, it can never + /// be extended. + pub fn cut(&mut self) { + for lit in &mut self.lits { + lit.cut(); + } + } + + /// Reverses all members in place. + pub fn reverse(&mut self) { + for lit in &mut self.lits { + lit.reverse(); + } + } + + /// Clears this set of all members. + pub fn clear(&mut self) { + self.lits.clear(); + } + + /// Pops all complete literals out of this set. + fn remove_complete(&mut self) -> Vec { + let mut base = vec![]; + for lit in mem::replace(&mut self.lits, vec![]) { + if lit.is_cut() { + self.lits.push(lit); + } else { + base.push(lit); + } + } + base + } + + /// Returns the total number of bytes in this set. + fn num_bytes(&self) -> usize { + self.lits.iter().fold(0, |accum, lit| accum + lit.len()) + } + + /// Returns true if a character class with the given size would cause this + /// set to exceed its limits. + /// + /// The size given should correspond to the number of items in the class. + fn class_exceeds_limits(&self, size: usize) -> bool { + if size > self.limit_class { + return true; + } + // This is an approximation since codepoints in a char class can encode + // to 1-4 bytes. + let new_byte_count = if self.lits.is_empty() { + size + } else { + self.lits.iter().fold(0, |accum, lit| { + accum + + if lit.is_cut() { + // If the literal is cut, then we'll never add + // anything to it, so don't count it. + 0 + } else { + (lit.len() + 1) * size + } + }) + }; + new_byte_count > self.limit_size + } +} + +fn prefixes(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0; 4]; + lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. }) => { + prefixes(&**hir, lits); + } + HirKind::Repetition(ref x) => match x.kind { + hir::RepetitionKind::ZeroOrOne => { + repeat_zero_or_one_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::ZeroOrMore => { + repeat_zero_or_more_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::OneOrMore => { + repeat_one_or_more_literals(&x.hir, lits, prefixes); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => (m, Some(m)), + hir::RepetitionRange::AtLeast(m) => (m, None), + hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, prefixes, + ) + } + }, + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), + HirKind::Concat(ref es) => { + for e in es { + if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Literal::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + prefixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, prefixes); + } + _ => lits.cut(), + } +} + +fn suffixes(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + let i = c.encode_utf8(&mut buf).len(); + let buf = &mut buf[..i]; + buf.reverse(); + lits.cross_add(buf); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if !lits.add_char_class_reverse(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. }) => { + suffixes(&**hir, lits); + } + HirKind::Repetition(ref x) => match x.kind { + hir::RepetitionKind::ZeroOrOne => { + repeat_zero_or_one_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::ZeroOrMore => { + repeat_zero_or_more_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::OneOrMore => { + repeat_one_or_more_literals(&x.hir, lits, suffixes); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => (m, Some(m)), + hir::RepetitionRange::AtLeast(m) => (m, None), + hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, suffixes, + ) + } + }, + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), + HirKind::Concat(ref es) => { + for e in es.iter().rev() { + if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { + if !lits.is_empty() { + lits.cut(); + break; + } + lits.add(Literal::empty()); + continue; + } + let mut lits2 = lits.to_empty(); + suffixes(e, &mut lits2); + if !lits.cross_product(&lits2) || !lits2.any_complete() { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, suffixes); + } + _ => lits.cut(), + } +} + +fn repeat_zero_or_one_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + f( + &Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + // FIXME: Our literal extraction doesn't care about greediness. + // Which is partially why we're treating 'e?' as 'e*'. Namely, + // 'ab??' yields [Complete(ab), Complete(a)], but it should yield + // [Complete(a), Complete(ab)] because of the non-greediness. + greedy: true, + hir: Box::new(e.clone()), + }), + lits, + ); +} + +fn repeat_zero_or_more_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); + lits3.set_limit_size(lits.limit_size() / 2); + f(e, &mut lits3); + + if lits3.is_empty() || !lits2.cross_product(&lits3) { + lits.cut(); + return; + } + lits2.cut(); + lits2.add(Literal::empty()); + if !lits.union(lits2) { + lits.cut(); + } +} + +fn repeat_one_or_more_literals( + e: &Hir, + lits: &mut Literals, + mut f: F, +) { + f(e, lits); + lits.cut(); +} + +fn repeat_range_literals( + e: &Hir, + min: u32, + max: Option, + greedy: bool, + lits: &mut Literals, + mut f: F, +) { + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. + f( + &Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy, + hir: Box::new(e.clone()), + }), + lits, + ); + } else { + if min > 0 { + let n = cmp::min(lits.limit_size, min as usize); + let es = iter::repeat(e.clone()).take(n).collect(); + f(&Hir::concat(es), lits); + if n < min as usize || lits.contains_empty() { + lits.cut(); + } + } + if max.map_or(true, |max| min < max) { + lits.cut(); + } + } +} + +fn alternate_literals( + es: &[Hir], + lits: &mut Literals, + mut f: F, +) { + let mut lits2 = lits.to_empty(); + for e in es { + let mut lits3 = lits.to_empty(); + lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() || !lits2.union(lits3) { + // If we couldn't find suffixes for *any* of the + // alternates, then the entire alternation has to be thrown + // away and any existing members must be frozen. Similarly, + // if the union couldn't complete, stop and freeze. + lits.cut(); + return; + } + } + if !lits.cross_product(&lits2) { + lits.cut(); + } +} + +impl fmt::Debug for Literals { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Literals") + .field("lits", &self.lits) + .field("limit_size", &self.limit_size) + .field("limit_class", &self.limit_class) + .finish() + } +} + +impl Literal { + /// Returns a new complete literal with the bytes given. + pub fn new(bytes: Vec) -> Literal { + Literal { v: bytes, cut: false } + } + + /// Returns a new complete empty literal. + pub fn empty() -> Literal { + Literal { v: vec![], cut: false } + } + + /// Returns true if this literal was "cut." + pub fn is_cut(&self) -> bool { + self.cut + } + + /// Cuts this literal. + pub fn cut(&mut self) { + self.cut = true; + } +} + +impl PartialEq for Literal { + fn eq(&self, other: &Literal) -> bool { + self.v == other.v + } +} + +impl PartialOrd for Literal { + fn partial_cmp(&self, other: &Literal) -> Option { + self.v.partial_cmp(&other.v) + } +} + +impl fmt::Debug for Literal { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", escape_unicode(&self.v)) + } else { + write!(f, "Complete({})", escape_unicode(&self.v)) + } + } +} + +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { + &self.v + } +} + +impl ops::Deref for Literal { + type Target = Vec; + fn deref(&self) -> &Vec { + &self.v + } +} + +impl ops::DerefMut for Literal { + fn deref_mut(&mut self) -> &mut Vec { + &mut self.v + } +} + +fn position(needle: &[u8], mut haystack: &[u8]) -> Option { + let mut i = 0; + while haystack.len() >= needle.len() { + if needle == &haystack[..needle.len()] { + return Some(i); + } + i += 1; + haystack = &haystack[1..]; + } + None +} + +fn escape_unicode(bytes: &[u8]) -> String { + let show = match ::std::str::from_utf8(bytes) { + Ok(v) => v.to_string(), + Err(_) => escape_bytes(bytes), + }; + let mut space_escaped = String::new(); + for c in show.chars() { + if c.is_whitespace() { + let escaped = if c as u32 <= 0x7F { + escape_byte(c as u8) + } else if c as u32 <= 0xFFFF { + format!(r"\u{{{:04x}}}", c as u32) + } else { + format!(r"\U{{{:08x}}}", c as u32) + }; + space_escaped.push_str(&escaped); + } else { + space_escaped.push(c); + } + } + space_escaped +} + +fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s +} + +fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() +} + +fn cls_char_count(cls: &hir::ClassUnicode) -> usize { + cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::() + as usize +} + +fn cls_byte_count(cls: &hir::ClassBytes) -> usize { + cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::() + as usize +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use super::{escape_bytes, Literal, Literals}; + use crate::hir::Hir; + use crate::ParserBuilder; + + // To make test failures easier to read. + #[derive(Debug, Eq, PartialEq)] + struct Bytes(Vec); + #[derive(Debug, Eq, PartialEq)] + struct Unicode(Vec); + + fn escape_lits(blits: &[Literal]) -> Vec { + let mut ulits = vec![]; + for blit in blits { + ulits + .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); + } + ulits + } + + fn create_lits>(it: I) -> Literals { + Literals { + lits: it.into_iter().collect(), + limit_size: 0, + limit_class: 0, + } + } + + // Needs to be pub for 1.3? + #[derive(Clone, Eq, PartialEq)] + pub struct ULiteral { + v: String, + cut: bool, + } + + impl ULiteral { + fn is_cut(&self) -> bool { + self.cut + } + } + + impl fmt::Debug for ULiteral { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_cut() { + write!(f, "Cut({})", self.v) + } else { + write!(f, "Complete({})", self.v) + } + } + } + + impl PartialEq for ULiteral { + fn eq(&self, other: &Literal) -> bool { + self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() + } + } + + impl PartialEq for Literal { + fn eq(&self, other: &ULiteral) -> bool { + &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() + } + } + + #[allow(non_snake_case)] + fn C(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: true } + } + #[allow(non_snake_case)] + fn M(s: &'static str) -> ULiteral { + ULiteral { v: s.to_owned(), cut: false } + } + + fn prefixes(lits: &mut Literals, expr: &Hir) { + lits.union_prefixes(expr); + } + + fn suffixes(lits: &mut Literals, expr: &Hir) { + lits.union_suffixes(expr); + } + + macro_rules! assert_lit_eq { + ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ + let expected: Vec = vec![$($expected_lit),*]; + let lits = $got_lits; + assert_eq!( + $which(expected.clone()), + $which(escape_lits(lits.literals()))); + assert_eq!( + !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), + lits.all_complete()); + assert_eq!( + expected.iter().any(|l| !l.is_cut()), + lits.any_complete()); + }}; + } + + macro_rules! test_lit { + ($name:ident, $which:ident, $re:expr) => { + test_lit!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let lits = Literals::$which(&expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // ************************************************************************ + // Tests for prefix literal extraction. + // ************************************************************************ + + // Elementary tests. + test_lit!(pfx_one_lit1, prefixes, "a", M("a")); + test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); + test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] + test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); + test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); + test_lit!( + pfx_class2, + prefixes, + "(?u)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x98\\x83") + ); + #[cfg(feature = "unicode-case")] + test_lit!( + pfx_class3, + prefixes, + "(?ui)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83") + ); + test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); + test_lit!( + pfx_one_lit_casei2, + prefixes, + "(?i-u)abc", + M("ABC"), + M("aBC"), + M("AbC"), + M("abC"), + M("ABc"), + M("aBc"), + M("Abc"), + M("abc") + ); + test_lit!(pfx_group1, prefixes, "(a)", M("a")); + test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); + test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); + // FIXME: This should return [M("a"), M("ab")] because of the non-greedy + // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get + // a cut literal. + test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); + test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); + test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); + test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); + test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); + test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); + test_lit!(pfx_rep_range1, prefixes, "a{0}"); + test_lit!(pfx_rep_range2, prefixes, "a{0,}"); + test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); + test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); + test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); + test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); + test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. + test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); + test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); + test_lit!( + pfx_cat3, + prefixes, + "(?i-u)[ab]z", + M("AZ"), + M("BZ"), + M("aZ"), + M("bZ"), + M("Az"), + M("Bz"), + M("az"), + M("bz") + ); + test_lit!( + pfx_cat4, + prefixes, + "[ab][yz]", + M("ay"), + M("by"), + M("az"), + M("bz") + ); + test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); + test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); + test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); + test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); + test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); + test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); + test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); + test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); + test_lit!(pfx_cat14, prefixes, "a^", C("a")); + test_lit!(pfx_cat15, prefixes, "$a"); + test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); + test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); + test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); + test_lit!(pfx_cat19, prefixes, "a.z", C("a")); + + // Test regexes with alternations. + test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); + test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(pfx_alt4, prefixes, "a|b*"); + test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); + test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); + test_lit!( + pfx_alt7, + prefixes, + "(a|b)*c|(a|ab)*c", + C("a"), + C("b"), + M("c"), + C("a"), + C("ab"), + M("c") + ); + test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(pfx_empty1, prefixes, "^a", M("a")); + test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); + test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); + test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); + + // Make sure some curious regexes have no prefixes. + test_lit!(pfx_nothing1, prefixes, "."); + test_lit!(pfx_nothing2, prefixes, "(?s)."); + test_lit!(pfx_nothing3, prefixes, "^"); + test_lit!(pfx_nothing4, prefixes, "$"); + test_lit!(pfx_nothing6, prefixes, "(?m)$"); + test_lit!(pfx_nothing7, prefixes, r"\b"); + test_lit!(pfx_nothing8, prefixes, r"\B"); + + // Test a few regexes that defeat any prefix literal detection. + test_lit!(pfx_defeated1, prefixes, ".a"); + test_lit!(pfx_defeated2, prefixes, "(?s).a"); + test_lit!(pfx_defeated3, prefixes, "a*b*c*"); + test_lit!(pfx_defeated4, prefixes, "a|."); + test_lit!(pfx_defeated5, prefixes, ".|a"); + test_lit!(pfx_defeated6, prefixes, "a|^"); + test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); + test_lit!(pfx_defeated8, prefixes, "$a"); + test_lit!(pfx_defeated9, prefixes, "(?m)$a"); + test_lit!(pfx_defeated10, prefixes, r"\ba"); + test_lit!(pfx_defeated11, prefixes, r"\Ba"); + test_lit!(pfx_defeated12, prefixes, "^*a"); + test_lit!(pfx_defeated13, prefixes, "^+a"); + + test_lit!( + pfx_crazy1, + prefixes, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + C("Mo\\'"), + C("Mu\\'"), + C("Moam"), + C("Muam") + ); + + // ************************************************************************ + // Tests for quiting prefix literal search. + // ************************************************************************ + + macro_rules! test_exhausted { + ($name:ident, $which:ident, $re:expr) => { + test_exhausted!($name, $which, $re,); + }; + ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { + #[test] + fn $name() { + let expr = ParserBuilder::new() + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Unicode, lits, $($lit),*); + + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .unicode(false) + .build() + .parse($re) + .unwrap(); + let mut lits = Literals::empty(); + lits.set_limit_size(20).set_limit_class(10); + $which(&mut lits, &expr); + assert_lit_eq!(Bytes, lits, $($lit),*); + } + }; + } + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); + test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); + test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); + test_exhausted!( + pfx_exhausted4, + prefixes, + "(?i-u)foobar", + C("FO"), + C("fO"), + C("Fo"), + C("fo") + ); + test_exhausted!( + pfx_exhausted5, + prefixes, + "(?:ab){100}", + C("abababababababababab") + ); + test_exhausted!( + pfx_exhausted6, + prefixes, + "(?:(?:ab){100})*cd", + C("ababababab"), + M("cd") + ); + test_exhausted!( + pfx_exhausted7, + prefixes, + "z(?:(?:ab){100})*cd", + C("zababababab"), + M("zcd") + ); + test_exhausted!( + pfx_exhausted8, + prefixes, + "aaaaaaaaaaaaaaaaaaaaz", + C("aaaaaaaaaaaaaaaaaaaa") + ); + + // ************************************************************************ + // Tests for suffix literal extraction. + // ************************************************************************ + + // Elementary tests. + test_lit!(sfx_one_lit1, suffixes, "a", M("a")); + test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); + test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); + #[cfg(feature = "unicode-case")] + test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); + test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); + test_lit!( + sfx_class2, + suffixes, + "(?u)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x98\\x83") + ); + #[cfg(feature = "unicode-case")] + test_lit!( + sfx_class3, + suffixes, + "(?ui)[☃Ⅰ]", + M("\\xe2\\x85\\xa0"), + M("\\xe2\\x85\\xb0"), + M("\\xe2\\x98\\x83") + ); + test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); + test_lit!( + sfx_one_lit_casei2, + suffixes, + "(?i-u)abc", + M("ABC"), + M("ABc"), + M("AbC"), + M("Abc"), + M("aBC"), + M("aBc"), + M("abC"), + M("abc") + ); + test_lit!(sfx_group1, suffixes, "(a)", M("a")); + test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); + test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); + test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); + test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); + test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); + test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); + test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); + test_lit!(sfx_rep_range1, suffixes, "a{0}"); + test_lit!(sfx_rep_range2, suffixes, "a{0,}"); + test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); + test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); + test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); + test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); + test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); + + // Test regexes with concatenations. + test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); + test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); + test_lit!( + sfx_cat3, + suffixes, + "(?i-u)[ab]z", + M("AZ"), + M("Az"), + M("BZ"), + M("Bz"), + M("aZ"), + M("az"), + M("bZ"), + M("bz") + ); + test_lit!( + sfx_cat4, + suffixes, + "[ab][yz]", + M("ay"), + M("az"), + M("by"), + M("bz") + ); + test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); + test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); + test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); + test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); + test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); + test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); + test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat12, suffixes, "ab+", C("b")); + test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); + test_lit!(sfx_cat14, suffixes, "a^"); + test_lit!(sfx_cat15, suffixes, "$a", C("a")); + test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); + test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); + test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); + test_lit!(sfx_cat19, suffixes, "a.z", C("z")); + + // Test regexes with alternations. + test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); + test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); + test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); + test_lit!(sfx_alt4, suffixes, "a|b*"); + test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); + test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); + test_lit!( + sfx_alt7, + suffixes, + "(a|b)*c|(a|ab)*c", + C("ac"), + C("bc"), + M("c"), + C("ac"), + C("abc"), + M("c") + ); + test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); + + // Test regexes with empty assertions. + test_lit!(sfx_empty1, suffixes, "a$", M("a")); + test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); + + // Make sure some curious regexes have no suffixes. + test_lit!(sfx_nothing1, suffixes, "."); + test_lit!(sfx_nothing2, suffixes, "(?s)."); + test_lit!(sfx_nothing3, suffixes, "^"); + test_lit!(sfx_nothing4, suffixes, "$"); + test_lit!(sfx_nothing6, suffixes, "(?m)$"); + test_lit!(sfx_nothing7, suffixes, r"\b"); + test_lit!(sfx_nothing8, suffixes, r"\B"); + + // Test a few regexes that defeat any suffix literal detection. + test_lit!(sfx_defeated1, suffixes, "a."); + test_lit!(sfx_defeated2, suffixes, "(?s)a."); + test_lit!(sfx_defeated3, suffixes, "a*b*c*"); + test_lit!(sfx_defeated4, suffixes, "a|."); + test_lit!(sfx_defeated5, suffixes, ".|a"); + test_lit!(sfx_defeated6, suffixes, "a|^"); + test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); + test_lit!(sfx_defeated8, suffixes, "a^"); + test_lit!(sfx_defeated9, suffixes, "(?m)a$"); + test_lit!(sfx_defeated10, suffixes, r"a\b"); + test_lit!(sfx_defeated11, suffixes, r"a\B"); + test_lit!(sfx_defeated12, suffixes, "a^*"); + test_lit!(sfx_defeated13, suffixes, "a^+"); + + // These test use a much lower limit than the default so that we can + // write test cases of reasonable size. + test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); + test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); + test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); + test_exhausted!( + sfx_exhausted4, + suffixes, + "(?i-u)foobar", + C("AR"), + C("Ar"), + C("aR"), + C("ar") + ); + test_exhausted!( + sfx_exhausted5, + suffixes, + "(?:ab){100}", + C("abababababababababab") + ); + test_exhausted!( + sfx_exhausted6, + suffixes, + "cd(?:(?:ab){100})*", + C("ababababab"), + M("cd") + ); + test_exhausted!( + sfx_exhausted7, + suffixes, + "cd(?:(?:ab){100})*z", + C("abababababz"), + M("cdz") + ); + test_exhausted!( + sfx_exhausted8, + suffixes, + "zaaaaaaaaaaaaaaaaaaaa", + C("aaaaaaaaaaaaaaaaaaaa") + ); + + // ************************************************************************ + // Tests for generating unambiguous literal sets. + // ************************************************************************ + + macro_rules! test_unamb { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.unambiguous_prefixes(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); + test_unamb!( + unambiguous2, + vec![M("zaaaaaa"), M("aa")], + vec![C("aa"), C("z")] + ); + test_unamb!( + unambiguous3, + vec![M("Sherlock"), M("Watson")], + vec![M("Sherlock"), M("Watson")] + ); + test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); + test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); + test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); + test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); + test_unamb!( + unambiguous9, + vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], + vec![C("a"), C("b"), C("c")] + ); + test_unamb!( + unambiguous10, + vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], + vec![C("Mo"), C("Mu")] + ); + test_unamb!( + unambiguous11, + vec![M("zazb"), M("azb")], + vec![C("a"), C("z")] + ); + test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); + test_unamb!( + unambiguous13, + vec![M("ABCX"), M("CDAX"), M("BCX")], + vec![C("A"), C("BCX"), C("CD")] + ); + test_unamb!( + unambiguous14, + vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], + vec![M("DSX"), C("I"), C("MGX"), C("MV")] + ); + test_unamb!( + unambiguous15, + vec![M("IMG_"), M("MG_"), M("CIMG")], + vec![C("C"), C("I"), C("MG_")] + ); + + // ************************************************************************ + // Tests for suffix trimming. + // ************************************************************************ + macro_rules! test_trim { + ($name:ident, $trim:expr, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = $given + .into_iter() + .map(|ul| { + let cut = ul.is_cut(); + Literal { v: ul.v.into_bytes(), cut: cut } + }) + .collect(); + let lits = create_lits(given); + let got = lits.trim_suffix($trim).unwrap(); + assert_eq!($expected, escape_lits(got.literals())); + } + }; + } + + test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); + test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); + test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); + test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); + + // ************************************************************************ + // Tests for longest common prefix. + // ************************************************************************ + + macro_rules! test_lcp { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_prefix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcp!(lcp1, vec!["a"], "a"); + test_lcp!(lcp2, vec![], ""); + test_lcp!(lcp3, vec!["a", "b"], ""); + test_lcp!(lcp4, vec!["ab", "ab"], "ab"); + test_lcp!(lcp5, vec!["ab", "a"], "a"); + test_lcp!(lcp6, vec!["a", "ab"], "a"); + test_lcp!(lcp7, vec!["ab", "b"], ""); + test_lcp!(lcp8, vec!["b", "ab"], ""); + test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); + test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); + test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); + test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); + + // ************************************************************************ + // Tests for longest common suffix. + // ************************************************************************ + + macro_rules! test_lcs { + ($name:ident, $given:expr, $expected:expr) => { + #[test] + fn $name() { + let given: Vec = $given + .into_iter() + .map(|s: &str| Literal { + v: s.to_owned().into_bytes(), + cut: false, + }) + .collect(); + let lits = create_lits(given); + let got = lits.longest_common_suffix(); + assert_eq!($expected, escape_bytes(got)); + } + }; + } + + test_lcs!(lcs1, vec!["a"], "a"); + test_lcs!(lcs2, vec![], ""); + test_lcs!(lcs3, vec!["a", "b"], ""); + test_lcs!(lcs4, vec!["ab", "ab"], "ab"); + test_lcs!(lcs5, vec!["ab", "a"], ""); + test_lcs!(lcs6, vec!["a", "ab"], ""); + test_lcs!(lcs7, vec!["ab", "b"], "b"); + test_lcs!(lcs8, vec!["b", "ab"], "b"); + test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); + test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); + test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); + test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); +} diff --git a/vendor/regex-syntax/src/hir/mod.rs b/vendor/regex-syntax/src/hir/mod.rs new file mode 100644 index 0000000000..156bcc2844 --- /dev/null +++ b/vendor/regex-syntax/src/hir/mod.rs @@ -0,0 +1,2299 @@ +/*! +Defines a high-level intermediate representation for regular expressions. +*/ +use std::char; +use std::cmp; +use std::error; +use std::fmt; +use std::result; +use std::u8; + +use crate::ast::Span; +use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; +use crate::unicode; + +pub use crate::hir::visitor::{visit, Visitor}; +pub use crate::unicode::CaseFoldError; + +mod interval; +pub mod literal; +pub mod print; +pub mod translate; +mod visitor; + +/// An error that can occur while translating an `Ast` to a `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Error { + /// The kind of error. + kind: ErrorKind, + /// The original pattern that the translator's Ast was parsed from. Every + /// span in an error is a valid range into this string. + pattern: String, + /// The span of this error, derived from the Ast given to the translator. + span: Span, +} + +impl Error { + /// Return the type of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// The original pattern string in which this error occurred. + /// + /// Every span reported by this error is reported in terms of this string. + pub fn pattern(&self) -> &str { + &self.pattern + } + + /// Return the span at which this error occurred. + pub fn span(&self) -> &Span { + &self.span + } +} + +/// The type of an error that occurred while building an `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ErrorKind { + /// This error occurs when a Unicode feature is used when Unicode + /// support is disabled. For example `(?-u:\pL)` would trigger this error. + UnicodeNotAllowed, + /// This error occurs when translating a pattern that could match a byte + /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + InvalidUtf8, + /// This occurs when an unrecognized Unicode property name could not + /// be found. + UnicodePropertyNotFound, + /// This occurs when an unrecognized Unicode property value could not + /// be found. + UnicodePropertyValueNotFound, + /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or + /// `\d`) could not be found. This can occur when the `unicode-perl` + /// crate feature is not enabled. + UnicodePerlClassNotFound, + /// This occurs when the Unicode simple case mapping tables are not + /// available, and the regular expression required Unicode aware case + /// insensitivity. + UnicodeCaseUnavailable, + /// This occurs when the translator attempts to construct a character class + /// that is empty. + /// + /// Note that this restriction in the translator may be removed in the + /// future. + EmptyClassNotAllowed, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ErrorKind { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + use self::ErrorKind::*; + match *self { + UnicodeNotAllowed => "Unicode not allowed here", + InvalidUtf8 => "pattern can match invalid UTF-8", + UnicodePropertyNotFound => "Unicode property not found", + UnicodePropertyValueNotFound => "Unicode property value not found", + UnicodePerlClassNotFound => { + "Unicode-aware Perl class not found \ + (make sure the unicode-perl feature is enabled)" + } + UnicodeCaseUnavailable => { + "Unicode-aware case insensitivity matching is not available \ + (make sure the unicode-case feature is enabled)" + } + EmptyClassNotAllowed => "empty character classes are not allowed", + __Nonexhaustive => unreachable!(), + } + } +} + +impl error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + self.kind.description() + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // TODO: Remove this on the next breaking semver release. + #[allow(deprecated)] + f.write_str(self.description()) + } +} + +/// A high-level intermediate representation (HIR) for a regular expression. +/// +/// The HIR of a regular expression represents an intermediate step between its +/// abstract syntax (a structured description of the concrete syntax) and +/// compiled byte codes. The purpose of HIR is to make regular expressions +/// easier to analyze. In particular, the AST is much more complex than the +/// HIR. For example, while an AST supports arbitrarily nested character +/// classes, the HIR will flatten all nested classes into a single set. The HIR +/// will also "compile away" every flag present in the concrete syntax. For +/// example, users of HIR expressions never need to worry about case folding; +/// it is handled automatically by the translator (e.g., by translating `(?i)A` +/// to `[aA]`). +/// +/// If the HIR was produced by a translator that disallows invalid UTF-8, then +/// the HIR is guaranteed to match UTF-8 exclusively. +/// +/// This type defines its own destructor that uses constant stack space and +/// heap space proportional to the size of the HIR. +/// +/// The specific type of an HIR expression can be accessed via its `kind` +/// or `into_kind` methods. This extra level of indirection exists for two +/// reasons: +/// +/// 1. Construction of an HIR expression *must* use the constructor methods +/// on this `Hir` type instead of building the `HirKind` values directly. +/// This permits construction to enforce invariants like "concatenations +/// always consist of two or more sub-expressions." +/// 2. Every HIR expression contains attributes that are defined inductively, +/// and can be computed cheaply during the construction process. For +/// example, one such attribute is whether the expression must match at the +/// beginning of the text. +/// +/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular +/// expression pattern string, and uses constant stack space and heap space +/// proportional to the size of the `Hir`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Hir { + /// The underlying HIR kind. + kind: HirKind, + /// Analysis info about this HIR, computed during construction. + info: HirInfo, +} + +/// The kind of an arbitrary `Hir` expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum HirKind { + /// The empty regular expression, which matches everything, including the + /// empty string. + Empty, + /// A single literal character that matches exactly this character. + Literal(Literal), + /// A single character class that matches any of the characters in the + /// class. A class can either consist of Unicode scalar values as + /// characters, or it can use bytes. + Class(Class), + /// An anchor assertion. An anchor assertion match always has zero length. + Anchor(Anchor), + /// A word boundary assertion, which may or may not be Unicode aware. A + /// word boundary assertion match always has zero length. + WordBoundary(WordBoundary), + /// A repetition operation applied to a child expression. + Repetition(Repetition), + /// A possibly capturing group, which contains a child expression. + Group(Group), + /// A concatenation of expressions. A concatenation always has at least two + /// child expressions. + /// + /// A concatenation matches only if each of its child expression matches + /// one after the other. + Concat(Vec), + /// An alternation of expressions. An alternation always has at least two + /// child expressions. + /// + /// An alternation matches only if at least one of its child expression + /// matches. If multiple expressions match, then the leftmost is preferred. + Alternation(Vec), +} + +impl Hir { + /// Returns a reference to the underlying HIR kind. + pub fn kind(&self) -> &HirKind { + &self.kind + } + + /// Consumes ownership of this HIR expression and returns its underlying + /// `HirKind`. + pub fn into_kind(mut self) -> HirKind { + use std::mem; + mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns an empty HIR expression. + /// + /// An empty HIR expression always matches, including the empty string. + pub fn empty() -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Empty, info } + } + + /// Creates a literal HIR expression. + /// + /// If the given literal has a `Byte` variant with an ASCII byte, then this + /// method panics. This enforces the invariant that `Byte` variants are + /// only used to express matching of invalid UTF-8. + pub fn literal(lit: Literal) -> Hir { + if let Literal::Byte(b) = lit { + assert!(b > 0x7F); + } + + let mut info = HirInfo::new(); + info.set_always_utf8(lit.is_unicode()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + info.set_literal(true); + info.set_alternation_literal(true); + Hir { kind: HirKind::Literal(lit), info } + } + + /// Creates a class HIR expression. + pub fn class(class: Class) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(class.is_always_utf8()); + info.set_all_assertions(false); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Class(class), info } + } + + /// Creates an anchor assertion HIR expression. + pub fn anchor(anchor: Anchor) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + info.set_literal(false); + info.set_alternation_literal(false); + if let Anchor::StartText = anchor { + info.set_anchored_start(true); + info.set_line_anchored_start(true); + info.set_any_anchored_start(true); + } + if let Anchor::EndText = anchor { + info.set_anchored_end(true); + info.set_line_anchored_end(true); + info.set_any_anchored_end(true); + } + if let Anchor::StartLine = anchor { + info.set_line_anchored_start(true); + } + if let Anchor::EndLine = anchor { + info.set_line_anchored_end(true); + } + Hir { kind: HirKind::Anchor(anchor), info } + } + + /// Creates a word boundary assertion HIR expression. + pub fn word_boundary(word_boundary: WordBoundary) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(false); + info.set_anchored_end(false); + info.set_line_anchored_start(false); + info.set_line_anchored_end(false); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_literal(false); + info.set_alternation_literal(false); + // A negated word boundary matches '', so that's fine. But \b does not + // match \b, so why do we say it can match the empty string? Well, + // because, if you search for \b against 'a', it will report [0, 0) and + // [1, 1) as matches, and both of those matches correspond to the empty + // string. Thus, only *certain* empty strings match \b, which similarly + // applies to \B. + info.set_match_empty(true); + // Negated ASCII word boundaries can match invalid UTF-8. + if let WordBoundary::AsciiNegate = word_boundary { + info.set_always_utf8(false); + } + Hir { kind: HirKind::WordBoundary(word_boundary), info } + } + + /// Creates a repetition HIR expression. + pub fn repetition(rep: Repetition) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(rep.hir.is_always_utf8()); + info.set_all_assertions(rep.hir.is_all_assertions()); + // If this operator can match the empty string, then it can never + // be anchored. + info.set_anchored_start( + !rep.is_match_empty() && rep.hir.is_anchored_start(), + ); + info.set_anchored_end( + !rep.is_match_empty() && rep.hir.is_anchored_end(), + ); + info.set_line_anchored_start( + !rep.is_match_empty() && rep.hir.is_anchored_start(), + ); + info.set_line_anchored_end( + !rep.is_match_empty() && rep.hir.is_anchored_end(), + ); + info.set_any_anchored_start(rep.hir.is_any_anchored_start()); + info.set_any_anchored_end(rep.hir.is_any_anchored_end()); + info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Repetition(rep), info } + } + + /// Creates a group HIR expression. + pub fn group(group: Group) -> Hir { + let mut info = HirInfo::new(); + info.set_always_utf8(group.hir.is_always_utf8()); + info.set_all_assertions(group.hir.is_all_assertions()); + info.set_anchored_start(group.hir.is_anchored_start()); + info.set_anchored_end(group.hir.is_anchored_end()); + info.set_line_anchored_start(group.hir.is_line_anchored_start()); + info.set_line_anchored_end(group.hir.is_line_anchored_end()); + info.set_any_anchored_start(group.hir.is_any_anchored_start()); + info.set_any_anchored_end(group.hir.is_any_anchored_end()); + info.set_match_empty(group.hir.is_match_empty()); + info.set_literal(false); + info.set_alternation_literal(false); + Hir { kind: HirKind::Group(group), info } + } + + /// Returns the concatenation of the given expressions. + /// + /// This flattens the concatenation as appropriate. + pub fn concat(mut exprs: Vec) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(true); + info.set_literal(true); + info.set_alternation_literal(true); + + // Some attributes require analyzing all sub-expressions. + for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() && e.is_match_empty(); + info.set_match_empty(x); + + let x = info.is_literal() && e.is_literal(); + info.set_literal(x); + + let x = info.is_alternation_literal() + && e.is_alternation_literal(); + info.set_alternation_literal(x); + } + // Anchored attributes require something slightly more + // sophisticated. Normally, WLOG, to determine whether an + // expression is anchored to the start, we'd only need to check + // the first expression of a concatenation. However, + // expressions like `$\b^` are still anchored to the start, + // but the first expression in the concatenation *isn't* + // anchored to the start. So the "first" expression to look at + // is actually one that is either not an assertion or is + // specifically the StartText assertion. + info.set_anchored_start( + exprs + .iter() + .take_while(|e| { + e.is_anchored_start() || e.is_all_assertions() + }) + .any(|e| e.is_anchored_start()), + ); + // Similarly for the end anchor, but in reverse. + info.set_anchored_end( + exprs + .iter() + .rev() + .take_while(|e| { + e.is_anchored_end() || e.is_all_assertions() + }) + .any(|e| e.is_anchored_end()), + ); + // Repeat the process for line anchors. + info.set_line_anchored_start( + exprs + .iter() + .take_while(|e| { + e.is_line_anchored_start() || e.is_all_assertions() + }) + .any(|e| e.is_line_anchored_start()), + ); + info.set_line_anchored_end( + exprs + .iter() + .rev() + .take_while(|e| { + e.is_line_anchored_end() || e.is_all_assertions() + }) + .any(|e| e.is_line_anchored_end()), + ); + Hir { kind: HirKind::Concat(exprs), info } + } + } + } + + /// Returns the alternation of the given expressions. + /// + /// This flattens the alternation as appropriate. + pub fn alternation(mut exprs: Vec) -> Hir { + match exprs.len() { + 0 => Hir::empty(), + 1 => exprs.pop().unwrap(), + _ => { + let mut info = HirInfo::new(); + info.set_always_utf8(true); + info.set_all_assertions(true); + info.set_anchored_start(true); + info.set_anchored_end(true); + info.set_line_anchored_start(true); + info.set_line_anchored_end(true); + info.set_any_anchored_start(false); + info.set_any_anchored_end(false); + info.set_match_empty(false); + info.set_literal(false); + info.set_alternation_literal(true); + + // Some attributes require analyzing all sub-expressions. + for e in &exprs { + let x = info.is_always_utf8() && e.is_always_utf8(); + info.set_always_utf8(x); + + let x = info.is_all_assertions() && e.is_all_assertions(); + info.set_all_assertions(x); + + let x = info.is_anchored_start() && e.is_anchored_start(); + info.set_anchored_start(x); + + let x = info.is_anchored_end() && e.is_anchored_end(); + info.set_anchored_end(x); + + let x = info.is_line_anchored_start() + && e.is_line_anchored_start(); + info.set_line_anchored_start(x); + + let x = info.is_line_anchored_end() + && e.is_line_anchored_end(); + info.set_line_anchored_end(x); + + let x = info.is_any_anchored_start() + || e.is_any_anchored_start(); + info.set_any_anchored_start(x); + + let x = + info.is_any_anchored_end() || e.is_any_anchored_end(); + info.set_any_anchored_end(x); + + let x = info.is_match_empty() || e.is_match_empty(); + info.set_match_empty(x); + + let x = info.is_alternation_literal() && e.is_literal(); + info.set_alternation_literal(x); + } + Hir { kind: HirKind::Alternation(exprs), info } + } + } + } + + /// Build an HIR expression for `.`. + /// + /// A `.` expression matches any character except for `\n`. To build an + /// expression that matches any character, including `\n`, use the `any` + /// method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn dot(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Build an HIR expression for `(?s).`. + /// + /// A `(?s).` expression matches any character, including `\n`. To build an + /// expression that matches any character except for `\n`, then use the + /// `dot` method. + /// + /// If `bytes` is `true`, then this assumes characters are limited to a + /// single byte. + pub fn any(bytes: bool) -> Hir { + if bytes { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } else { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + } + + /// Return true if and only if this HIR will always match valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression + /// to match invalid UTF-8. + pub fn is_always_utf8(&self) -> bool { + self.info.is_always_utf8() + } + + /// Returns true if and only if this entire HIR expression is made up of + /// zero-width assertions. + /// + /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but + /// not `^a`. + pub fn is_all_assertions(&self) -> bool { + self.info.is_all_assertions() + } + + /// Return true if and only if this HIR is required to match from the + /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, + /// `^foo|^bar` but not `^foo|bar`. + pub fn is_anchored_start(&self) -> bool { + self.info.is_anchored_start() + } + + /// Return true if and only if this HIR is required to match at the end + /// of text. This includes expressions like `foo$`, `(foo|bar)$`, + /// `foo$|bar$` but not `foo$|bar`. + pub fn is_anchored_end(&self) -> bool { + self.info.is_anchored_end() + } + + /// Return true if and only if this HIR is required to match from the + /// beginning of text or the beginning of a line. This includes expressions + /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` + /// but not `^foo|bar` or `(?m)^foo|bar`. + /// + /// Note that if `is_anchored_start` is `true`, then + /// `is_line_anchored_start` will also be `true`. The reverse implication + /// is not true. For example, `(?m)^foo` is line anchored, but not + /// `is_anchored_start`. + pub fn is_line_anchored_start(&self) -> bool { + self.info.is_line_anchored_start() + } + + /// Return true if and only if this HIR is required to match at the + /// end of text or the end of a line. This includes expressions like + /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, + /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. + /// + /// Note that if `is_anchored_end` is `true`, then + /// `is_line_anchored_end` will also be `true`. The reverse implication + /// is not true. For example, `(?m)foo$` is line anchored, but not + /// `is_anchored_end`. + pub fn is_line_anchored_end(&self) -> bool { + self.info.is_line_anchored_end() + } + + /// Return true if and only if this HIR contains any sub-expression that + /// is required to match at the beginning of text. Specifically, this + /// returns true if the `^` symbol (when multiline mode is disabled) or the + /// `\A` escape appear anywhere in the regex. + pub fn is_any_anchored_start(&self) -> bool { + self.info.is_any_anchored_start() + } + + /// Return true if and only if this HIR contains any sub-expression that is + /// required to match at the end of text. Specifically, this returns true + /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape + /// appear anywhere in the regex. + pub fn is_any_anchored_end(&self) -> bool { + self.info.is_any_anchored_end() + } + + /// Return true if and only if the empty string is part of the language + /// matched by this regular expression. + /// + /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` + /// and `\B`, but not `a` or `a+`. + pub fn is_match_empty(&self) -> bool { + self.info.is_match_empty() + } + + /// Return true if and only if this HIR is a simple literal. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. + /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, + /// `` are not (even though that contain sub-expressions that are literals). + pub fn is_literal(&self) -> bool { + self.info.is_literal() + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, `` + /// are not (even though that contain sub-expressions that are literals). + pub fn is_alternation_literal(&self) -> bool { + self.info.is_alternation_literal() + } +} + +impl HirKind { + /// Return true if and only if this HIR is the empty regular expression. + /// + /// Note that this is not defined inductively. That is, it only tests if + /// this kind is the `Empty` variant. To get the inductive definition, + /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). + pub fn is_empty(&self) -> bool { + match *self { + HirKind::Empty => true, + _ => false, + } + } + + /// Returns true if and only if this kind has any (including possibly + /// empty) subexpressions. + pub fn has_subexprs(&self) -> bool { + match *self { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => false, + HirKind::Group(_) + | HirKind::Repetition(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => true, + } + } +} + +/// Print a display representation of this Hir. +/// +/// The result of this is a valid regular expression pattern string. +/// +/// This implementation uses constant stack space and heap space proportional +/// to the size of the `Hir`. +impl fmt::Display for Hir { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::hir::print::Printer; + Printer::new().print(self, f) + } +} + +/// The high-level intermediate representation of a literal. +/// +/// A literal corresponds to a single character, where a character is either +/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters +/// are preferred whenever possible. In particular, a `Byte` variant is only +/// ever produced when it could match invalid UTF-8. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Literal { + /// A single character represented by a Unicode scalar value. + Unicode(char), + /// A single character represented by an arbitrary byte. + Byte(u8), +} + +impl Literal { + /// Returns true if and only if this literal corresponds to a Unicode + /// scalar value. + pub fn is_unicode(&self) -> bool { + match *self { + Literal::Unicode(_) => true, + Literal::Byte(b) if b <= 0x7F => true, + Literal::Byte(_) => false, + } + } +} + +/// The high-level intermediate representation of a character class. +/// +/// A character class corresponds to a set of characters. A character is either +/// defined by a Unicode scalar value or a byte. Unicode characters are used +/// by default, while bytes are used when Unicode mode (via the `u` flag) is +/// disabled. +/// +/// A character class, regardless of its character type, is represented by a +/// sequence of non-overlapping non-adjacent ranges of characters. +/// +/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may +/// be produced even when it exclusively matches valid UTF-8. This is because +/// a `Bytes` variant represents an intention by the author of the regular +/// expression to disable Unicode mode, which in turn impacts the semantics of +/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not +/// match the same set of strings. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Class { + /// A set of characters represented by Unicode scalar values. + Unicode(ClassUnicode), + /// A set of characters represented by arbitrary bytes (one byte per + /// character). + Bytes(ClassBytes), +} + +impl Class { + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + pub fn case_fold_simple(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.case_fold_simple(), + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + } + + /// Negate this character class in place. + /// + /// After completion, this character class will contain precisely the + /// characters that weren't previously in the class. + pub fn negate(&mut self) { + match *self { + Class::Unicode(ref mut x) => x.negate(), + Class::Bytes(ref mut x) => x.negate(), + } + } + + /// Returns true if and only if this character class will only ever match + /// valid UTF-8. + /// + /// A character class can match invalid UTF-8 only when the following + /// conditions are met: + /// + /// 1. The translator was configured to permit generating an expression + /// that can match invalid UTF-8. (By default, this is disabled.) + /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete + /// syntax or in the parser builder. By default, Unicode mode is + /// enabled. + pub fn is_always_utf8(&self) -> bool { + match *self { + Class::Unicode(_) => true, + Class::Bytes(ref x) => x.is_all_ascii(), + } + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new(ranges: I) -> ClassUnicode + where + I: IntoIterator, + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassUnicodeRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassUnicodeIter<'_> { + ClassUnicodeIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassUnicodeRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. + pub fn case_fold_simple(&mut self) { + self.set + .case_fold_simple() + .expect("unicode-case feature must be enabled"); + } + + /// Expand this character class such that it contains all case folded + /// characters, according to Unicode's "simple" mapping. For example, if + /// this class consists of the range `a-z`, then applying case folding will + /// result in the class containing both the ranges `a-z` and `A-Z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled. + pub fn try_case_fold_simple( + &mut self, + ) -> result::Result<(), CaseFoldError> { + self.set.case_fold_simple() + } + + /// Negate this character class. + /// + /// For all `c` where `c` is a Unicode scalar value, if `c` was in this + /// set, then it will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this character class with the given character class, in place. + pub fn union(&mut self, other: &ClassUnicode) { + self.set.union(&other.set); + } + + /// Intersect this character class with the given character class, in + /// place. + pub fn intersect(&mut self, other: &ClassUnicode) { + self.set.intersect(&other.set); + } + + /// Subtract the given character class from this character class, in place. + pub fn difference(&mut self, other: &ClassUnicode) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given character classes, in + /// place. + /// + /// This computes the symmetric difference of two character classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassUnicode) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII codepoint. + pub fn is_all_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') + } +} + +/// An iterator over all ranges in a Unicode character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); + +impl<'a> Iterator for ClassUnicodeIter<'a> { + type Item = &'a ClassUnicodeRange; + + fn next(&mut self) -> Option<&'a ClassUnicodeRange> { + self.0.next() + } +} + +/// A single range of characters represented by Unicode scalar values. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassUnicodeRange { + start: char, + end: char, +} + +impl fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let start = if !self.start.is_whitespace() && !self.start.is_control() + { + self.start.to_string() + } else { + format!("0x{:X}", self.start as u32) + }; + let end = if !self.end.is_whitespace() && !self.end.is_control() { + self.end.to_string() + } else { + format!("0x{:X}", self.end as u32) + }; + f.debug_struct("ClassUnicodeRange") + .field("start", &start) + .field("end", &end) + .finish() + } +} + +impl Interval for ClassUnicodeRange { + type Bound = char; + + #[inline] + fn lower(&self) -> char { + self.start + } + #[inline] + fn upper(&self) -> char { + self.end + } + #[inline] + fn set_lower(&mut self, bound: char) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: char) { + self.end = bound; + } + + /// Apply simple case folding to this Unicode scalar value range. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + if !unicode::contains_simple_case_mapping(self.start, self.end)? { + return Ok(()); + } + let start = self.start as u32; + let end = (self.end as u32).saturating_add(1); + let mut next_simple_cp = None; + for cp in (start..end).filter_map(char::from_u32) { + if next_simple_cp.map_or(false, |next| cp < next) { + continue; + } + let it = match unicode::simple_fold(cp)? { + Ok(it) => it, + Err(next) => { + next_simple_cp = next; + continue; + } + }; + for cp_folded in it { + ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); + } + } + Ok(()) + } +} + +impl ClassUnicodeRange { + /// Create a new Unicode scalar value range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: char, end: char) -> ClassUnicodeRange { + ClassUnicodeRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> char { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> char { + self.end + } +} + +/// A set of characters represented by arbitrary bytes (where one byte +/// corresponds to one character). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassBytes { + set: IntervalSet, +} + +impl ClassBytes { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. + pub fn new(ranges: I) -> ClassBytes + where + I: IntoIterator, + { + ClassBytes { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + pub fn empty() -> ClassBytes { + ClassBytes::new(vec![]) + } + + /// Add a new range to this set. + pub fn push(&mut self, range: ClassBytesRange) { + self.set.push(range); + } + + /// Return an iterator over all ranges in this class. + /// + /// The iterator yields ranges in ascending order. + pub fn iter(&self) -> ClassBytesIter<'_> { + ClassBytesIter(self.set.iter()) + } + + /// Return the underlying ranges as a slice. + pub fn ranges(&self) -> &[ClassBytesRange] { + self.set.intervals() + } + + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple().expect("ASCII case folding never fails"); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is a any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but all adds all elements from the given class that aren't in this + /// class. That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_all_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassBytesRange; + + fn next(&mut self) -> Option<&'a ClassBytesRange> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassBytesRange { + start: u8, + end: u8, +} + +impl Interval for ClassBytesRange { + type Bound = u8; + + #[inline] + fn lower(&self) -> u8 { + self.start + } + #[inline] + fn upper(&self) -> u8 { + self.end + } + #[inline] + fn set_lower(&mut self, bound: u8) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: u8) { + self.end = bound; + } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'a'); + let upper = cmp::min(self.end, b'z'); + ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); + } + if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'A'); + let upper = cmp::min(self.end, b'Z'); + ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); + } + Ok(()) + } +} + +impl ClassBytesRange { + /// Create a new byte range for a character class. + /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: u8, end: u8) -> ClassBytesRange { + ClassBytesRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> u8 { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> u8 { + self.end + } +} + +impl fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut debug = f.debug_struct("ClassBytesRange"); + if self.start <= 0x7F { + debug.field("start", &(self.start as char)); + } else { + debug.field("start", &self.start); + } + if self.end <= 0x7F { + debug.field("end", &(self.end as char)); + } else { + debug.field("end", &self.end); + } + debug.finish() + } +} + +/// The high-level intermediate representation for an anchor assertion. +/// +/// A matching anchor assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Anchor { + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLine, + /// Match the end of a line or the end of text. Specifically, + /// this matches at the end position of the input, or at the position + /// immediately preceding a `\n` character. + EndLine, + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + StartText, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + EndText, +} + +/// The high-level intermediate representation for a word-boundary assertion. +/// +/// A matching word boundary assertion is always zero-length. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum WordBoundary { + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Unicode, + /// Match a Unicode-aware negation of a word boundary. + UnicodeNegate, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + Ascii, + /// Match an ASCII-only negation of a word boundary. + AsciiNegate, +} + +impl WordBoundary { + /// Returns true if and only if this word boundary assertion is negated. + pub fn is_negated(&self) -> bool { + match *self { + WordBoundary::Unicode | WordBoundary::Ascii => false, + WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, + } + } +} + +/// The high-level intermediate representation for a group. +/// +/// This represents one of three possible group types: +/// +/// 1. A non-capturing group (e.g., `(?:expr)`). +/// 2. A capturing group (e.g., `(expr)`). +/// 3. A named capturing group (e.g., `(?Pexpr)`). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Group { + /// The kind of this group. If it is a capturing group, then the kind + /// contains the capture group index (and the name, if it is a named + /// group). + pub kind: GroupKind, + /// The expression inside the capturing group, which may be empty. + pub hir: Box, +} + +/// The kind of group. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum GroupKind { + /// A normal unnamed capturing group. + /// + /// The value is the capture index of the group. + CaptureIndex(u32), + /// A named capturing group. + CaptureName { + /// The name of the group. + name: String, + /// The capture index of the group. + index: u32, + }, + /// A non-capturing group. + NonCapturing, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The kind of this repetition operator. + pub kind: RepetitionKind, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub hir: Box, +} + +impl Repetition { + /// Returns true if and only if this repetition operator makes it possible + /// to match the empty string. + /// + /// Note that this is not defined inductively. For example, while `a*` + /// will report `true`, `()+` will not, even though `()` matches the empty + /// string and one or more occurrences of something that matches the empty + /// string will always match the empty string. In order to get the + /// inductive definition, see the corresponding method on + /// [`Hir`](struct.Hir.html). + pub fn is_match_empty(&self) -> bool { + match self.kind { + RepetitionKind::ZeroOrOne => true, + RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => false, + RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, + RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, + RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, + } + } +} + +/// The kind of a repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionKind { + /// Matches a sub-expression zero or one times. + ZeroOrOne, + /// Matches a sub-expression zero or more times. + ZeroOrMore, + /// Matches a sub-expression one or more times. + OneOrMore, + /// Matches a sub-expression within a bounded range of times. + Range(RepetitionRange), +} + +/// The kind of a counted repetition operator. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum RepetitionRange { + /// Matches a sub-expression exactly this many times. + Exactly(u32), + /// Matches a sub-expression at least this many times. + AtLeast(u32), + /// Matches a sub-expression at least `m` times and at most `n` times. + Bounded(u32, u32), +} + +/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack +/// space but heap space proportional to the depth of the total `Hir`. +impl Drop for Hir { + fn drop(&mut self) { + use std::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => return, + HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, + HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => {} + HirKind::Group(ref mut x) => { + stack.push(mem::replace(&mut x.hir, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.hir, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} + +/// A type that documents various attributes of an HIR expression. +/// +/// These attributes are typically defined inductively on the HIR. +#[derive(Clone, Debug, Eq, PartialEq)] +struct HirInfo { + /// Represent yes/no questions by a bitfield to conserve space, since + /// this is included in every HIR expression. + /// + /// If more attributes need to be added, it is OK to increase the size of + /// this as appropriate. + bools: u16, +} + +// A simple macro for defining bitfield accessors/mutators. +macro_rules! define_bool { + ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { + fn $is_fn_name(&self) -> bool { + self.bools & (0b1 << $bit) > 0 + } + + fn $set_fn_name(&mut self, yes: bool) { + if yes { + self.bools |= 1 << $bit; + } else { + self.bools &= !(1 << $bit); + } + } + }; +} + +impl HirInfo { + fn new() -> HirInfo { + HirInfo { bools: 0 } + } + + define_bool!(0, is_always_utf8, set_always_utf8); + define_bool!(1, is_all_assertions, set_all_assertions); + define_bool!(2, is_anchored_start, set_anchored_start); + define_bool!(3, is_anchored_end, set_anchored_end); + define_bool!(4, is_line_anchored_start, set_line_anchored_start); + define_bool!(5, is_line_anchored_end, set_line_anchored_end); + define_bool!(6, is_any_anchored_start, set_any_anchored_start); + define_bool!(7, is_any_anchored_end, set_any_anchored_end); + define_bool!(8, is_match_empty, set_match_empty); + define_bool!(9, is_literal, set_literal); + define_bool!(10, is_alternation_literal, set_alternation_literal); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn uclass(ranges: &[(char, char)]) -> ClassUnicode { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| ClassUnicodeRange::new(s, e)) + .collect(); + ClassUnicode::new(ranges) + } + + fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { + let ranges: Vec = + ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); + ClassBytes::new(ranges) + } + + fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + #[cfg(feature = "unicode-case")] + fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn usymdifference( + cls1: &ClassUnicode, + cls2: &ClassUnicode, + ) -> ClassUnicode { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn unegate(cls: &ClassUnicode) -> ClassUnicode { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { + cls.iter().map(|x| (x.start(), x.end())).collect() + } + + fn bcasefold(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.case_fold_simple(); + cls_ + } + + fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.union(cls2); + cls_ + } + + fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.intersect(cls2); + cls_ + } + + fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.difference(cls2); + cls_ + } + + fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { + let mut cls_ = cls1.clone(); + cls_.symmetric_difference(cls2); + cls_ + } + + fn bnegate(cls: &ClassBytes) -> ClassBytes { + let mut cls_ = cls.clone(); + cls_.negate(); + cls_ + } + + #[test] + fn class_range_canonical_unicode() { + let range = ClassUnicodeRange::new('\u{00FF}', '\0'); + assert_eq!('\0', range.start()); + assert_eq!('\u{00FF}', range.end()); + } + + #[test] + fn class_range_canonical_bytes() { + let range = ClassBytesRange::new(b'\xFF', b'\0'); + assert_eq!(b'\0', range.start()); + assert_eq!(b'\xFF', range.end()); + } + + #[test] + fn class_canonicalize_unicode() { + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('a', 'c')]); + let expected = vec![('a', 'c'), ('x', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('w', 'y')]); + let expected = vec![('w', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[ + ('c', 'f'), + ('a', 'g'), + ('d', 'j'), + ('a', 'c'), + ('m', 'p'), + ('l', 's'), + ]); + let expected = vec![('a', 'j'), ('l', 's')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('x', 'z'), ('u', 'w')]); + let expected = vec![('u', 'z')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); + let expected = vec![('\x00', '\u{10FFFF}')]; + assert_eq!(expected, uranges(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = vec![('a', 'b')]; + assert_eq!(expected, uranges(&cls)); + } + + #[test] + fn class_canonicalize_bytes() { + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); + let expected = vec![(b'a', b'c'), (b'x', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); + let expected = vec![(b'w', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[ + (b'c', b'f'), + (b'a', b'g'), + (b'd', b'j'), + (b'a', b'c'), + (b'm', b'p'), + (b'l', b's'), + ]); + let expected = vec![(b'a', b'j'), (b'l', b's')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); + let expected = vec![(b'u', b'z')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); + let expected = vec![(b'\x00', b'\xFF')]; + assert_eq!(expected, branges(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = vec![(b'a', b'b')]; + assert_eq!(expected, branges(&cls)); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn class_case_fold_unicode() { + let cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + let expected = uclass(&[ + ('A', 'J'), + ('L', 'S'), + ('a', 'j'), + ('l', 's'), + ('\u{17F}', '\u{17F}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'Z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('a', 'z')]); + let expected = uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('_', '_')]); + let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('A', 'A'), ('=', '=')]); + let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('\x00', '\x10')]); + assert_eq!(cls, ucasefold(&cls)); + + let cls = uclass(&[('k', 'k')]); + let expected = + uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); + assert_eq!(expected, ucasefold(&cls)); + + let cls = uclass(&[('@', '@')]); + assert_eq!(cls, ucasefold(&cls)); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + assert!(cls.try_case_fold_simple().is_err()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-case"))] + fn class_case_fold_unicode_disabled_panics() { + let mut cls = uclass(&[ + ('C', 'F'), + ('A', 'G'), + ('D', 'J'), + ('A', 'C'), + ('M', 'P'), + ('L', 'S'), + ('c', 'f'), + ]); + cls.case_fold_simple(); + } + + #[test] + fn class_case_fold_bytes() { + let cls = bclass(&[ + (b'C', b'F'), + (b'A', b'G'), + (b'D', b'J'), + (b'A', b'C'), + (b'M', b'P'), + (b'L', b'S'), + (b'c', b'f'), + ]); + let expected = + bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'Z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); + let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); + let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'\x00', b'\x10')]); + assert_eq!(cls, bcasefold(&cls)); + + let cls = bclass(&[(b'k', b'k')]); + let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); + assert_eq!(expected, bcasefold(&cls)); + + let cls = bclass(&[(b'@', b'@')]); + assert_eq!(cls, bcasefold(&cls)); + } + + #[test] + fn class_negate_unicode() { + let cls = uclass(&[('a', 'a')]); + let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'a'), ('b', 'b')]); + let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', 'c'), ('x', 'z')]); + let expected = uclass(&[ + ('\x00', '\x60'), + ('\x64', '\x77'), + ('\x7B', '\u{10FFFF}'), + ]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', 'a')]); + let expected = uclass(&[('\x62', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('a', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\x60')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{10FFFF}')]); + let expected = uclass(&[]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[]); + let expected = uclass(&[('\x00', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = + uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); + let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FF}')]); + let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\x00', '\u{D7FE}')]); + let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{D7FF}')]); + assert_eq!(expected, unegate(&cls)); + + let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); + let expected = uclass(&[('\x00', '\u{E000}')]); + assert_eq!(expected, unegate(&cls)); + } + + #[test] + fn class_negate_bytes() { + let cls = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); + let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); + let expected = bclass(&[ + (b'\x00', b'\x60'), + (b'\x64', b'\x77'), + (b'\x7B', b'\xFF'), + ]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'a')]); + let expected = bclass(&[(b'\x62', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'a', b'\xFF')]); + let expected = bclass(&[(b'\x00', b'\x60')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFF')]); + let expected = bclass(&[]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[]); + let expected = bclass(&[(b'\x00', b'\xFF')]); + assert_eq!(expected, bnegate(&cls)); + + let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); + let expected = bclass(&[(b'\xFE', b'\xFE')]); + assert_eq!(expected, bnegate(&cls)); + } + + #[test] + fn class_union_unicode() { + let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[('a', 'z'), ('A', 'C')]); + assert_eq!(expected, uunion(&cls1, &cls2)); + } + + #[test] + fn class_union_bytes() { + let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); + assert_eq!(expected, bunion(&cls1, &cls2)); + } + + #[test] + fn class_intersect_unicode() { + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('b', 'b')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('a', 'c')]); + let expected = uclass(&[('a', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('b', 'c')]); + let expected = uclass(&[('b', 'b')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b')]); + let cls2 = uclass(&[('c', 'd')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('b', 'c')]); + let cls2 = uclass(&[('a', 'd')]); + let expected = uclass(&[('b', 'c')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); + let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); + let cls2 = uclass(&[('h', 'h')]); + let expected = uclass(&[('h', 'h')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); + let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); + let expected = uclass(&[]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); + let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); + let expected = uclass(&[('b', 'f')]); + assert_eq!(expected, uintersect(&cls1, &cls2)); + } + + #[test] + fn class_intersect_bytes() { + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'b', b'b')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'a', b'c')]); + let expected = bclass(&[(b'a', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'b', b'c')]); + let expected = bclass(&[(b'b', b'b')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b')]); + let cls2 = bclass(&[(b'c', b'd')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'b', b'c')]); + let cls2 = bclass(&[(b'a', b'd')]); + let expected = bclass(&[(b'b', b'c')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); + let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); + let cls2 = bclass(&[(b'h', b'h')]); + let expected = bclass(&[(b'h', b'h')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); + let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); + let expected = bclass(&[]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); + let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); + let expected = bclass(&[(b'b', b'f')]); + assert_eq!(expected, bintersect(&cls1, &cls2)); + } + + #[test] + fn class_difference_unicode() { + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'a')]); + let cls2 = uclass(&[]); + let expected = uclass(&[('a', 'a')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'a')]); + let expected = uclass(&[('b', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('z', 'z')]); + let expected = uclass(&[('a', 'y')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('m', 'm')]); + let expected = uclass(&[('a', 'l'), ('n', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('a', 'z')]); + let expected = uclass(&[]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('d', 'v')]); + let expected = uclass(&[('a', 'c')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); + let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('x', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('x', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + + let cls1 = uclass(&[('a', 'z')]); + let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); + let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); + assert_eq!(expected, udifference(&cls1, &cls2)); + } + + #[test] + fn class_difference_bytes() { + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'a')]); + let cls2 = bclass(&[]); + let expected = bclass(&[(b'a', b'a')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'a')]); + let expected = bclass(&[(b'b', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'z', b'z')]); + let expected = bclass(&[(b'a', b'y')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'm', b'm')]); + let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'a', b'z')]); + let expected = bclass(&[]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'd', b'v')]); + let expected = bclass(&[(b'a', b'c')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); + let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'x', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'x', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + + let cls1 = bclass(&[(b'a', b'z')]); + let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); + let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); + assert_eq!(expected, bdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_unicode() { + let cls1 = uclass(&[('a', 'm')]); + let cls2 = uclass(&[('g', 't')]); + let expected = uclass(&[('a', 'f'), ('n', 't')]); + assert_eq!(expected, usymdifference(&cls1, &cls2)); + } + + #[test] + fn class_symmetric_difference_bytes() { + let cls1 = bclass(&[(b'a', b'm')]); + let cls2 = bclass(&[(b'g', b't')]); + let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); + assert_eq!(expected, bsymdifference(&cls1, &cls2)); + } + + #[test] + #[should_panic] + fn hir_byte_literal_non_ascii() { + Hir::literal(Literal::Byte(b'a')); + } + + // We use a thread with an explicit stack size to test that our destructor + // for Hir can handle arbitrarily sized expressions in constant stack + // space. In case we run on a platform without threads (WASM?), we limit + // this test to Windows/Unix. + #[test] + #[cfg(any(unix, windows))] + fn no_stack_overflow_on_drop() { + use std::thread; + + let run = || { + let mut expr = Hir::empty(); + for _ in 0..100 { + expr = Hir::group(Group { + kind: GroupKind::NonCapturing, + hir: Box::new(expr), + }); + expr = Hir::repetition(Repetition { + kind: RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(expr), + }); + + expr = Hir { + kind: HirKind::Concat(vec![expr]), + info: HirInfo::new(), + }; + expr = Hir { + kind: HirKind::Alternation(vec![expr]), + info: HirInfo::new(), + }; + } + assert!(!expr.kind.is_empty()); + }; + + // We run our test on a thread with a small stack size so we can + // force the issue more easily. + // + // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' + // for context on the specific stack size chosen here. + thread::Builder::new() + .stack_size(16 << 10) + .spawn(run) + .unwrap() + .join() + .unwrap(); + } +} diff --git a/vendor/regex-syntax/src/hir/print.rs b/vendor/regex-syntax/src/hir/print.rs new file mode 100644 index 0000000000..b71f3897cf --- /dev/null +++ b/vendor/regex-syntax/src/hir/print.rs @@ -0,0 +1,367 @@ +/*! +This module provides a regular expression printer for `Hir`. +*/ + +use std::fmt; + +use crate::hir::visitor::{self, Visitor}; +use crate::hir::{self, Hir, HirKind}; +use crate::is_meta_character; + +/// A builder for constructing a printer. +/// +/// Note that since a printer doesn't have any configuration knobs, this type +/// remains unexported. +#[derive(Clone, Debug)] +struct PrinterBuilder { + _priv: (), +} + +impl Default for PrinterBuilder { + fn default() -> PrinterBuilder { + PrinterBuilder::new() + } +} + +impl PrinterBuilder { + fn new() -> PrinterBuilder { + PrinterBuilder { _priv: () } + } + + fn build(&self) -> Printer { + Printer { _priv: () } + } +} + +/// A printer for a regular expression's high-level intermediate +/// representation. +/// +/// A printer converts a high-level intermediate representation (HIR) to a +/// regular expression pattern string. This particular printer uses constant +/// stack space and heap space proportional to the size of the HIR. +/// +/// Since this printer is only using the HIR, the pattern it prints will likely +/// not resemble the original pattern at all. For example, a pattern like +/// `\pL` will have its entire class written out. +/// +/// The purpose of this printer is to provide a means to mutate an HIR and then +/// build a regular expression from the result of that mutation. (A regex +/// library could provide a constructor from this HIR explicitly, but that +/// creates an unnecessary public coupling between the regex library and this +/// specific HIR representation.) +#[derive(Debug)] +pub struct Printer { + _priv: (), +} + +impl Printer { + /// Create a new printer. + pub fn new() -> Printer { + PrinterBuilder::new().build() + } + + /// Print the given `Ast` to the given writer. The writer must implement + /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used + /// here are a `fmt::Formatter` (which is available in `fmt::Display` + /// implementations) or a `&mut String`. + pub fn print(&mut self, hir: &Hir, wtr: W) -> fmt::Result { + visitor::visit(hir, Writer { wtr }) + } +} + +#[derive(Debug)] +struct Writer { + wtr: W, +} + +impl Visitor for Writer { + type Output = (); + type Err = fmt::Error; + + fn finish(self) -> fmt::Result { + Ok(()) + } + + fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + HirKind::Empty + | HirKind::Repetition(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => {} + HirKind::Literal(hir::Literal::Unicode(c)) => { + self.write_literal_char(c)?; + } + HirKind::Literal(hir::Literal::Byte(b)) => { + self.write_literal_byte(b)?; + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + self.wtr.write_str("[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_char(range.start())?; + } else { + self.write_literal_char(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_char(range.end())?; + } + } + self.wtr.write_str("]")?; + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + self.wtr.write_str("(?-u:[")?; + for range in cls.iter() { + if range.start() == range.end() { + self.write_literal_class_byte(range.start())?; + } else { + self.write_literal_class_byte(range.start())?; + self.wtr.write_str("-")?; + self.write_literal_class_byte(range.end())?; + } + } + self.wtr.write_str("])")?; + } + HirKind::Anchor(hir::Anchor::StartLine) => { + self.wtr.write_str("(?m:^)")?; + } + HirKind::Anchor(hir::Anchor::EndLine) => { + self.wtr.write_str("(?m:$)")?; + } + HirKind::Anchor(hir::Anchor::StartText) => { + self.wtr.write_str(r"\A")?; + } + HirKind::Anchor(hir::Anchor::EndText) => { + self.wtr.write_str(r"\z")?; + } + HirKind::WordBoundary(hir::WordBoundary::Unicode) => { + self.wtr.write_str(r"\b")?; + } + HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { + self.wtr.write_str(r"\B")?; + } + HirKind::WordBoundary(hir::WordBoundary::Ascii) => { + self.wtr.write_str(r"(?-u:\b)")?; + } + HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { + self.wtr.write_str(r"(?-u:\B)")?; + } + HirKind::Group(ref x) => match x.kind { + hir::GroupKind::CaptureIndex(_) => { + self.wtr.write_str("(")?; + } + hir::GroupKind::CaptureName { ref name, .. } => { + write!(self.wtr, "(?P<{}>", name)?; + } + hir::GroupKind::NonCapturing => { + self.wtr.write_str("(?:")?; + } + }, + } + Ok(()) + } + + fn visit_post(&mut self, hir: &Hir) -> fmt::Result { + match *hir.kind() { + // Handled during visit_pre + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => {} + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => { + self.wtr.write_str("?")?; + } + hir::RepetitionKind::ZeroOrMore => { + self.wtr.write_str("*")?; + } + hir::RepetitionKind::OneOrMore => { + self.wtr.write_str("+")?; + } + hir::RepetitionKind::Range(ref x) => match *x { + hir::RepetitionRange::Exactly(m) => { + write!(self.wtr, "{{{}}}", m)?; + } + hir::RepetitionRange::AtLeast(m) => { + write!(self.wtr, "{{{},}}", m)?; + } + hir::RepetitionRange::Bounded(m, n) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } + }, + } + if !x.greedy { + self.wtr.write_str("?")?; + } + } + HirKind::Group(_) => { + self.wtr.write_str(")")?; + } + } + Ok(()) + } + + fn visit_alternation_in(&mut self) -> fmt::Result { + self.wtr.write_str("|") + } +} + +impl Writer { + fn write_literal_char(&mut self, c: char) -> fmt::Result { + if is_meta_character(c) { + self.wtr.write_str("\\")?; + } + self.wtr.write_char(c) + } + + fn write_literal_byte(&mut self, b: u8) -> fmt::Result { + let c = b as char; + if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { + self.write_literal_char(c) + } else { + write!(self.wtr, "(?-u:\\x{:02X})", b) + } + } + + fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { + let c = b as char; + if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { + self.write_literal_char(c) + } else { + write!(self.wtr, "\\x{:02X}", b) + } + } +} + +#[cfg(test)] +mod tests { + use super::Printer; + use crate::ParserBuilder; + + fn roundtrip(given: &str, expected: &str) { + roundtrip_with(|b| b, given, expected); + } + + fn roundtrip_bytes(given: &str, expected: &str) { + roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected); + } + + fn roundtrip_with(mut f: F, given: &str, expected: &str) + where + F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, + { + let mut builder = ParserBuilder::new(); + f(&mut builder); + let hir = builder.build().parse(given).unwrap(); + + let mut printer = Printer::new(); + let mut dst = String::new(); + printer.print(&hir, &mut dst).unwrap(); + + // Check that the result is actually valid. + builder.build().parse(&dst).unwrap(); + + assert_eq!(expected, dst); + } + + #[test] + fn print_literal() { + roundtrip("a", "a"); + roundtrip(r"\xff", "\u{FF}"); + roundtrip_bytes(r"\xff", "\u{FF}"); + roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)"); + roundtrip("☃", "☃"); + } + + #[test] + fn print_class() { + roundtrip(r"[a]", r"[a]"); + roundtrip(r"[a-z]", r"[a-z]"); + roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]"); + roundtrip(r"[-]", r"[\-]"); + roundtrip(r"[☃-⛄]", r"[☃-⛄]"); + + roundtrip(r"(?-u)[a]", r"(?-u:[a])"); + roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); + roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); + + // The following test that the printer escapes meta characters + // in character classes. + roundtrip(r"[\[]", r"[\[]"); + roundtrip(r"[Z-_]", r"[Z-_]"); + roundtrip(r"[Z-_--Z]", r"[\[-_]"); + + // The following test that the printer escapes meta characters + // in byte oriented character classes. + roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])"); + roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); + roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + } + + #[test] + fn print_anchor() { + roundtrip(r"^", r"\A"); + roundtrip(r"$", r"\z"); + roundtrip(r"(?m)^", r"(?m:^)"); + roundtrip(r"(?m)$", r"(?m:$)"); + } + + #[test] + fn print_word_boundary() { + roundtrip(r"\b", r"\b"); + roundtrip(r"\B", r"\B"); + roundtrip(r"(?-u)\b", r"(?-u:\b)"); + roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)"); + } + + #[test] + fn print_repetition() { + roundtrip("a?", "a?"); + roundtrip("a??", "a??"); + roundtrip("(?U)a?", "a??"); + + roundtrip("a*", "a*"); + roundtrip("a*?", "a*?"); + roundtrip("(?U)a*", "a*?"); + + roundtrip("a+", "a+"); + roundtrip("a+?", "a+?"); + roundtrip("(?U)a+", "a+?"); + + roundtrip("a{1}", "a{1}"); + roundtrip("a{1,}", "a{1,}"); + roundtrip("a{1,5}", "a{1,5}"); + roundtrip("a{1}?", "a{1}?"); + roundtrip("a{1,}?", "a{1,}?"); + roundtrip("a{1,5}?", "a{1,5}?"); + roundtrip("(?U)a{1}", "a{1}?"); + roundtrip("(?U)a{1,}", "a{1,}?"); + roundtrip("(?U)a{1,5}", "a{1,5}?"); + } + + #[test] + fn print_group() { + roundtrip("()", "()"); + roundtrip("(?P)", "(?P)"); + roundtrip("(?:)", "(?:)"); + + roundtrip("(a)", "(a)"); + roundtrip("(?Pa)", "(?Pa)"); + roundtrip("(?:a)", "(?:a)"); + + roundtrip("((((a))))", "((((a))))"); + } + + #[test] + fn print_alternation() { + roundtrip("|", "|"); + roundtrip("||", "||"); + + roundtrip("a|b", "a|b"); + roundtrip("a|b|c", "a|b|c"); + roundtrip("foo|bar|quux", "foo|bar|quux"); + } +} diff --git a/vendor/regex-syntax/src/hir/translate.rs b/vendor/regex-syntax/src/hir/translate.rs new file mode 100644 index 0000000000..890e1608b3 --- /dev/null +++ b/vendor/regex-syntax/src/hir/translate.rs @@ -0,0 +1,3207 @@ +/*! +Defines a translator that converts an `Ast` to an `Hir`. +*/ + +use std::cell::{Cell, RefCell}; +use std::result; + +use crate::ast::{self, Ast, Span, Visitor}; +use crate::hir::{self, Error, ErrorKind, Hir}; +use crate::unicode::{self, ClassQuery}; + +type Result = result::Result; + +/// A builder for constructing an AST->HIR translator. +#[derive(Clone, Debug)] +pub struct TranslatorBuilder { + allow_invalid_utf8: bool, + flags: Flags, +} + +impl Default for TranslatorBuilder { + fn default() -> TranslatorBuilder { + TranslatorBuilder::new() + } +} + +impl TranslatorBuilder { + /// Create a new translator builder with a default c onfiguration. + pub fn new() -> TranslatorBuilder { + TranslatorBuilder { + allow_invalid_utf8: false, + flags: Flags::default(), + } + } + + /// Build a translator using the current configuration. + pub fn build(&self) -> Translator { + Translator { + stack: RefCell::new(vec![]), + flags: Cell::new(self.flags), + allow_invalid_utf8: self.allow_invalid_utf8, + } + } + + /// When enabled, translation will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the translator is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// translator will return an error). + /// + /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII + /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause + /// the parser to return an error. Namely, a negated ASCII word boundary + /// can result in matching positions that aren't valid UTF-8 boundaries. + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.allow_invalid_utf8 = yes; + self + } + + /// Enable or disable the case insensitive flag (`i`) by default. + pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.case_insensitive = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the multi-line matching flag (`m`) by default. + pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.multi_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "dot matches any character" flag (`s`) by + /// default. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut TranslatorBuilder { + self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the "swap greed" flag (`U`) by default. + pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.swap_greed = if yes { Some(true) } else { None }; + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.unicode = if yes { None } else { Some(false) }; + self + } +} + +/// A translator maps abstract syntax to a high level intermediate +/// representation. +/// +/// A translator may be benefit from reuse. That is, a translator can translate +/// many abstract syntax trees. +/// +/// A `Translator` can be configured in more detail via a +/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +#[derive(Clone, Debug)] +pub struct Translator { + /// Our call stack, but on the heap. + stack: RefCell>, + /// The current flag settings. + flags: Cell, + /// Whether we're allowed to produce HIR that can match arbitrary bytes. + allow_invalid_utf8: bool, +} + +impl Translator { + /// Create a new translator using the default configuration. + pub fn new() -> Translator { + TranslatorBuilder::new().build() + } + + /// Translate the given abstract syntax tree (AST) into a high level + /// intermediate representation (HIR). + /// + /// If there was a problem doing the translation, then an HIR-specific + /// error is returned. + /// + /// The original pattern string used to produce the `Ast` *must* also be + /// provided. The translator does not use the pattern string during any + /// correct translation, but is used for error reporting. + pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result { + ast::visit(ast, TranslatorI::new(self, pattern)) + } +} + +/// An HirFrame is a single stack frame, represented explicitly, which is +/// created for each item in the Ast that we traverse. +/// +/// Note that technically, this type doesn't represent our entire stack +/// frame. In particular, the Ast visitor represents any state associated with +/// traversing the Ast itself. +#[derive(Clone, Debug)] +enum HirFrame { + /// An arbitrary HIR expression. These get pushed whenever we hit a base + /// case in the Ast. They get popped after an inductive (i.e., recursive) + /// step is complete. + Expr(Hir), + /// A Unicode character class. This frame is mutated as we descend into + /// the Ast of a character class (which is itself its own mini recursive + /// structure). + ClassUnicode(hir::ClassUnicode), + /// A byte-oriented character class. This frame is mutated as we descend + /// into the Ast of a character class (which is itself its own mini + /// recursive structure). + /// + /// Byte character classes are created when Unicode mode (`u`) is disabled. + /// If `allow_invalid_utf8` is disabled (the default), then a byte + /// character is only permitted to match ASCII text. + ClassBytes(hir::ClassBytes), + /// This is pushed on to the stack upon first seeing any kind of group, + /// indicated by parentheses (including non-capturing groups). It is popped + /// upon leaving a group. + Group { + /// The old active flags when this group was opened. + /// + /// If this group sets flags, then the new active flags are set to the + /// result of merging the old flags with the flags introduced by this + /// group. If the group doesn't set any flags, then this is simply + /// equivalent to whatever flags were set when the group was opened. + /// + /// When this group is popped, the active flags should be restored to + /// the flags set here. + /// + /// The "active" flags correspond to whatever flags are set in the + /// Translator. + old_flags: Flags, + }, + /// This is pushed whenever a concatenation is observed. After visiting + /// every sub-expression in the concatenation, the translator's stack is + /// popped until it sees a Concat frame. + Concat, + /// This is pushed whenever an alternation is observed. After visiting + /// every sub-expression in the alternation, the translator's stack is + /// popped until it sees an Alternation frame. + Alternation, +} + +impl HirFrame { + /// Assert that the current stack frame is an Hir expression and return it. + fn unwrap_expr(self) -> Hir { + match self { + HirFrame::Expr(expr) => expr, + _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), + } + } + + /// Assert that the current stack frame is a Unicode class expression and + /// return it. + fn unwrap_class_unicode(self) -> hir::ClassUnicode { + match self { + HirFrame::ClassUnicode(cls) => cls, + _ => panic!( + "tried to unwrap Unicode class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a byte class expression and + /// return it. + fn unwrap_class_bytes(self) -> hir::ClassBytes { + match self { + HirFrame::ClassBytes(cls) => cls, + _ => panic!( + "tried to unwrap byte class \ + from HirFrame, got: {:?}", + self + ), + } + } + + /// Assert that the current stack frame is a group indicator and return + /// its corresponding flags (the flags that were active at the time the + /// group was entered). + fn unwrap_group(self) -> Flags { + match self { + HirFrame::Group { old_flags } => old_flags, + _ => { + panic!("tried to unwrap group from HirFrame, got: {:?}", self) + } + } + } +} + +impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { + type Output = Hir; + type Err = Error; + + fn finish(self) -> Result { + // ... otherwise, we should have exactly one HIR on the stack. + assert_eq!(self.trans().stack.borrow().len(), 1); + Ok(self.pop().unwrap().unwrap_expr()) + } + + fn visit_pre(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Class(ast::Class::Bracketed(_)) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + Ast::Group(ref x) => { + let old_flags = x + .flags() + .map(|ast| self.set_flags(ast)) + .unwrap_or_else(|| self.flags()); + self.push(HirFrame::Group { old_flags }); + } + Ast::Concat(ref x) if x.asts.is_empty() => {} + Ast::Concat(_) => { + self.push(HirFrame::Concat); + } + Ast::Alternation(ref x) if x.asts.is_empty() => {} + Ast::Alternation(_) => { + self.push(HirFrame::Alternation); + } + _ => {} + } + Ok(()) + } + + fn visit_post(&mut self, ast: &Ast) -> Result<()> { + match *ast { + Ast::Empty(_) => { + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Flags(ref x) => { + self.set_flags(&x.flags); + // Flags in the AST are generally considered directives and + // not actual sub-expressions. However, they can be used in + // the concrete syntax like `((?i))`, and we need some kind of + // indication of an expression there, and Empty is the correct + // choice. + // + // There can also be things like `(?i)+`, but we rule those out + // in the parser. In the future, we might allow them for + // consistency sake. + self.push(HirFrame::Expr(Hir::empty())); + } + Ast::Literal(ref x) => { + self.push(HirFrame::Expr(self.hir_literal(x)?)); + } + Ast::Dot(span) => { + self.push(HirFrame::Expr(self.hir_dot(span)?)); + } + Ast::Assertion(ref x) => { + self.push(HirFrame::Expr(self.hir_assertion(x)?)); + } + Ast::Class(ast::Class::Perl(ref x)) => { + if self.flags().unicode() { + let cls = self.hir_perl_unicode_class(x)?; + let hcls = hir::Class::Unicode(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } else { + let cls = self.hir_perl_byte_class(x); + let hcls = hir::Class::Bytes(cls); + self.push(HirFrame::Expr(Hir::class(hcls))); + } + } + Ast::Class(ast::Class::Unicode(ref x)) => { + let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); + self.push(HirFrame::Expr(Hir::class(cls))); + } + Ast::Class(ast::Class::Bracketed(ref ast)) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + if cls.ranges().is_empty() { + return Err(self.error( + ast.span, + ErrorKind::EmptyClassNotAllowed, + )); + } + let expr = Hir::class(hir::Class::Unicode(cls)); + self.push(HirFrame::Expr(expr)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls, + )?; + if cls.ranges().is_empty() { + return Err(self.error( + ast.span, + ErrorKind::EmptyClassNotAllowed, + )); + } + + let expr = Hir::class(hir::Class::Bytes(cls)); + self.push(HirFrame::Expr(expr)); + } + } + Ast::Repetition(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + self.push(HirFrame::Expr(self.hir_repetition(x, expr))); + } + Ast::Group(ref x) => { + let expr = self.pop().unwrap().unwrap_expr(); + let old_flags = self.pop().unwrap().unwrap_group(); + self.trans().flags.set(old_flags); + self.push(HirFrame::Expr(self.hir_group(x, expr))); + } + Ast::Concat(_) => { + let mut exprs = vec![]; + while let Some(HirFrame::Expr(expr)) = self.pop() { + if !expr.kind().is_empty() { + exprs.push(expr); + } + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::concat(exprs))); + } + Ast::Alternation(_) => { + let mut exprs = vec![]; + while let Some(HirFrame::Expr(expr)) = self.pop() { + exprs.push(expr); + } + exprs.reverse(); + self.push(HirFrame::Expr(Hir::alternation(exprs))); + } + } + Ok(()) + } + + fn visit_class_set_item_pre( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Bracketed(_) => { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + } + // We needn't handle the Union case here since the visitor will + // do it for us. + _ => {} + } + Ok(()) + } + + fn visit_class_set_item_post( + &mut self, + ast: &ast::ClassSetItem, + ) -> Result<()> { + match *ast { + ast::ClassSetItem::Empty(_) => {} + ast::ClassSetItem::Literal(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let byte = self.class_literal_byte(x)?; + cls.push(hir::ClassBytesRange::new(byte, byte)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Range(ref x) => { + if self.flags().unicode() { + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + let start = self.class_literal_byte(&x.start)?; + let end = self.class_literal_byte(&x.end)?; + cls.push(hir::ClassBytesRange::new(start, end)); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Ascii(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_ascii_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_ascii_byte_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Unicode(ref x) => { + let xcls = self.hir_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } + ast::ClassSetItem::Perl(ref x) => { + if self.flags().unicode() { + let xcls = self.hir_perl_unicode_class(x)?; + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + cls.union(&xcls); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let xcls = self.hir_perl_byte_class(x); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + cls.union(&xcls); + self.push(HirFrame::ClassBytes(cls)); + } + } + ast::ClassSetItem::Bracketed(ref ast) => { + if self.flags().unicode() { + let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); + self.unicode_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); + cls2.union(&cls1); + self.push(HirFrame::ClassUnicode(cls2)); + } else { + let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); + self.bytes_fold_and_negate( + &ast.span, + ast.negated, + &mut cls1, + )?; + + let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); + cls2.union(&cls1); + self.push(HirFrame::ClassBytes(cls2)); + } + } + // This is handled automatically by the visitor. + ast::ClassSetItem::Union(_) => {} + } + Ok(()) + } + + fn visit_class_set_binary_op_pre( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_in( + &mut self, + _op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + if self.flags().unicode() { + let cls = hir::ClassUnicode::empty(); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let cls = hir::ClassBytes::empty(); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } + + fn visit_class_set_binary_op_post( + &mut self, + op: &ast::ClassSetBinaryOp, + ) -> Result<()> { + use crate::ast::ClassSetBinaryOpKind::*; + + if self.flags().unicode() { + let mut rhs = self.pop().unwrap().unwrap_class_unicode(); + let mut lhs = self.pop().unwrap().unwrap_class_unicode(); + let mut cls = self.pop().unwrap().unwrap_class_unicode(); + if self.flags().case_insensitive() { + rhs.try_case_fold_simple().map_err(|_| { + self.error( + op.rhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + lhs.try_case_fold_simple().map_err(|_| { + self.error( + op.lhs.span().clone(), + ErrorKind::UnicodeCaseUnavailable, + ) + })?; + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassUnicode(cls)); + } else { + let mut rhs = self.pop().unwrap().unwrap_class_bytes(); + let mut lhs = self.pop().unwrap().unwrap_class_bytes(); + let mut cls = self.pop().unwrap().unwrap_class_bytes(); + if self.flags().case_insensitive() { + rhs.case_fold_simple(); + lhs.case_fold_simple(); + } + match op.kind { + Intersection => lhs.intersect(&rhs), + Difference => lhs.difference(&rhs), + SymmetricDifference => lhs.symmetric_difference(&rhs), + } + cls.union(&lhs); + self.push(HirFrame::ClassBytes(cls)); + } + Ok(()) + } +} + +/// The internal implementation of a translator. +/// +/// This type is responsible for carrying around the original pattern string, +/// which is not tied to the internal state of a translator. +/// +/// A TranslatorI exists for the time it takes to translate a single Ast. +#[derive(Clone, Debug)] +struct TranslatorI<'t, 'p> { + trans: &'t Translator, + pattern: &'p str, +} + +impl<'t, 'p> TranslatorI<'t, 'p> { + /// Build a new internal translator. + fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { + TranslatorI { trans, pattern } + } + + /// Return a reference to the underlying translator. + fn trans(&self) -> &Translator { + &self.trans + } + + /// Push the given frame on to the call stack. + fn push(&self, frame: HirFrame) { + self.trans().stack.borrow_mut().push(frame); + } + + /// Pop the top of the call stack. If the call stack is empty, return None. + fn pop(&self) -> Option { + self.trans().stack.borrow_mut().pop() + } + + /// Create a new error with the given span and error type. + fn error(&self, span: Span, kind: ErrorKind) -> Error { + Error { kind, pattern: self.pattern.to_string(), span } + } + + /// Return a copy of the active flags. + fn flags(&self) -> Flags { + self.trans().flags.get() + } + + /// Set the flags of this translator from the flags set in the given AST. + /// Then, return the old flags. + fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { + let old_flags = self.flags(); + let mut new_flags = Flags::from_ast(ast_flags); + new_flags.merge(&old_flags); + self.trans().flags.set(new_flags); + old_flags + } + + fn hir_literal(&self, lit: &ast::Literal) -> Result { + let ch = match self.literal_to_char(lit)? { + byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), + hir::Literal::Unicode(ch) => ch, + }; + if self.flags().case_insensitive() { + self.hir_from_char_case_insensitive(lit.span, ch) + } else { + self.hir_from_char(lit.span, ch) + } + } + + /// Convert an Ast literal to its scalar representation. + /// + /// When Unicode mode is enabled, then this always succeeds and returns a + /// `char` (Unicode scalar value). + /// + /// When Unicode mode is disabled, then a raw byte is returned. If that + /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns + /// an error. + fn literal_to_char(&self, lit: &ast::Literal) -> Result { + if self.flags().unicode() { + return Ok(hir::Literal::Unicode(lit.c)); + } + let byte = match lit.byte() { + None => return Ok(hir::Literal::Unicode(lit.c)), + Some(byte) => byte, + }; + if byte <= 0x7F { + return Ok(hir::Literal::Unicode(byte as char)); + } + if !self.trans().allow_invalid_utf8 { + return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); + } + Ok(hir::Literal::Byte(byte)) + } + + fn hir_from_char(&self, span: Span, c: char) -> Result { + if !self.flags().unicode() && c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + Ok(Hir::literal(hir::Literal::Unicode(c))) + } + + fn hir_from_char_case_insensitive( + &self, + span: Span, + c: char, + ) -> Result { + if self.flags().unicode() { + // If case folding won't do anything, then don't bother trying. + let map = + unicode::contains_simple_case_mapping(c, c).map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + if !map { + return self.hir_from_char(span, c); + } + let mut cls = + hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( + c, c, + )]); + cls.try_case_fold_simple().map_err(|_| { + self.error(span, ErrorKind::UnicodeCaseUnavailable) + })?; + Ok(Hir::class(hir::Class::Unicode(cls))) + } else { + if c.len_utf8() > 1 { + return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + } + // If case folding won't do anything, then don't bother trying. + match c { + 'A'..='Z' | 'a'..='z' => {} + _ => return self.hir_from_char(span, c), + } + let mut cls = + hir::ClassBytes::new(vec![hir::ClassBytesRange::new( + c as u8, c as u8, + )]); + cls.case_fold_simple(); + Ok(Hir::class(hir::Class::Bytes(cls))) + } + } + + fn hir_dot(&self, span: Span) -> Result { + let unicode = self.flags().unicode(); + if !unicode && !self.trans().allow_invalid_utf8 { + return Err(self.error(span, ErrorKind::InvalidUtf8)); + } + Ok(if self.flags().dot_matches_new_line() { + Hir::any(!unicode) + } else { + Hir::dot(!unicode) + }) + } + + fn hir_assertion(&self, asst: &ast::Assertion) -> Result { + let unicode = self.flags().unicode(); + let multi_line = self.flags().multi_line(); + Ok(match asst.kind { + ast::AssertionKind::StartLine => Hir::anchor(if multi_line { + hir::Anchor::StartLine + } else { + hir::Anchor::StartText + }), + ast::AssertionKind::EndLine => Hir::anchor(if multi_line { + hir::Anchor::EndLine + } else { + hir::Anchor::EndText + }), + ast::AssertionKind::StartText => { + Hir::anchor(hir::Anchor::StartText) + } + ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText), + ast::AssertionKind::WordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::Unicode + } else { + hir::WordBoundary::Ascii + }) + } + ast::AssertionKind::NotWordBoundary => { + Hir::word_boundary(if unicode { + hir::WordBoundary::UnicodeNegate + } else { + // It is possible for negated ASCII word boundaries to + // match at invalid UTF-8 boundaries, even when searching + // valid UTF-8. + if !self.trans().allow_invalid_utf8 { + return Err( + self.error(asst.span, ErrorKind::InvalidUtf8) + ); + } + hir::WordBoundary::AsciiNegate + }) + } + }) + } + + fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { + let kind = match group.kind { + ast::GroupKind::CaptureIndex(idx) => { + hir::GroupKind::CaptureIndex(idx) + } + ast::GroupKind::CaptureName(ref capname) => { + hir::GroupKind::CaptureName { + name: capname.name.clone(), + index: capname.index, + } + } + ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + }; + Hir::group(hir::Group { kind, hir: Box::new(expr) }) + } + + fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { + let kind = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, + ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, + ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { + hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + } + ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( + m, + n, + )) => { + hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) + } + }; + let greedy = + if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; + Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) }) + } + + fn hir_unicode_class( + &self, + ast_class: &ast::ClassUnicode, + ) -> Result { + use crate::ast::ClassUnicodeKind::*; + + if !self.flags().unicode() { + return Err( + self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) + ); + } + let query = match ast_class.kind { + OneLetter(name) => ClassQuery::OneLetter(name), + Named(ref name) => ClassQuery::Binary(name), + NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { + property_name: name, + property_value: value, + }, + }; + let mut result = self.convert_unicode_class_error( + &ast_class.span, + unicode::class(query), + ); + if let Ok(ref mut class) = result { + self.unicode_fold_and_negate( + &ast_class.span, + ast_class.negated, + class, + )?; + if class.ranges().is_empty() { + let err = self + .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); + return Err(err); + } + } + result + } + + fn hir_ascii_unicode_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassUnicode::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ); + self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_ascii_byte_class( + &self, + ast: &ast::ClassAscii, + ) -> Result { + let mut cls = hir::ClassBytes::new( + ascii_class(&ast.kind) + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + ); + self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; + Ok(cls) + } + + fn hir_perl_unicode_class( + &self, + ast_class: &ast::ClassPerl, + ) -> Result { + use crate::ast::ClassPerlKind::*; + + assert!(self.flags().unicode()); + let result = match ast_class.kind { + Digit => unicode::perl_digit(), + Space => unicode::perl_space(), + Word => unicode::perl_word(), + }; + let mut class = + self.convert_unicode_class_error(&ast_class.span, result)?; + // We needn't apply case folding here because the Perl Unicode classes + // are already closed under Unicode simple case folding. + if ast_class.negated { + class.negate(); + } + Ok(class) + } + + fn hir_perl_byte_class( + &self, + ast_class: &ast::ClassPerl, + ) -> hir::ClassBytes { + use crate::ast::ClassPerlKind::*; + + assert!(!self.flags().unicode()); + let mut class = match ast_class.kind { + Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), + Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), + Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), + }; + // We needn't apply case folding here because the Perl ASCII classes + // are already closed (under ASCII case folding). + if ast_class.negated { + class.negate(); + } + class + } + + /// Converts the given Unicode specific error to an HIR translation error. + /// + /// The span given should approximate the position at which an error would + /// occur. + fn convert_unicode_class_error( + &self, + span: &Span, + result: unicode::Result, + ) -> Result { + result.map_err(|err| { + let sp = span.clone(); + match err { + unicode::Error::PropertyNotFound => { + self.error(sp, ErrorKind::UnicodePropertyNotFound) + } + unicode::Error::PropertyValueNotFound => { + self.error(sp, ErrorKind::UnicodePropertyValueNotFound) + } + unicode::Error::PerlClassNotFound => { + self.error(sp, ErrorKind::UnicodePerlClassNotFound) + } + } + }) + } + + fn unicode_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassUnicode, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation field, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.try_case_fold_simple().map_err(|_| { + self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) + })?; + } + if negated { + class.negate(); + } + Ok(()) + } + + fn bytes_fold_and_negate( + &self, + span: &Span, + negated: bool, + class: &mut hir::ClassBytes, + ) -> Result<()> { + // Note that we must apply case folding before negation! + // Consider `(?i)[^x]`. If we applied negation first, then + // the result would be the character class that matched any + // Unicode scalar value. + if self.flags().case_insensitive() { + class.case_fold_simple(); + } + if negated { + class.negate(); + } + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); + } + Ok(()) + } + + /// Return a scalar byte value suitable for use as a literal in a byte + /// character class. + fn class_literal_byte(&self, ast: &ast::Literal) -> Result { + match self.literal_to_char(ast)? { + hir::Literal::Byte(byte) => Ok(byte), + hir::Literal::Unicode(ch) => { + if ch <= 0x7F as char { + Ok(ch as u8) + } else { + // We can't feasibly support Unicode in + // byte oriented classes. Byte classes don't + // do Unicode case folding. + Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) + } + } + } + } +} + +/// A translator's representation of a regular expression's flags at any given +/// moment in time. +/// +/// Each flag can be in one of three states: absent, present but disabled or +/// present but enabled. +#[derive(Clone, Copy, Debug, Default)] +struct Flags { + case_insensitive: Option, + multi_line: Option, + dot_matches_new_line: Option, + swap_greed: Option, + unicode: Option, + // Note that `ignore_whitespace` is omitted here because it is handled + // entirely in the parser. +} + +impl Flags { + fn from_ast(ast: &ast::Flags) -> Flags { + let mut flags = Flags::default(); + let mut enable = true; + for item in &ast.items { + match item.kind { + ast::FlagsItemKind::Negation => { + enable = false; + } + ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { + flags.case_insensitive = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { + flags.multi_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { + flags.dot_matches_new_line = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { + flags.swap_greed = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { + flags.unicode = Some(enable); + } + ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} + } + } + flags + } + + fn merge(&mut self, previous: &Flags) { + if self.case_insensitive.is_none() { + self.case_insensitive = previous.case_insensitive; + } + if self.multi_line.is_none() { + self.multi_line = previous.multi_line; + } + if self.dot_matches_new_line.is_none() { + self.dot_matches_new_line = previous.dot_matches_new_line; + } + if self.swap_greed.is_none() { + self.swap_greed = previous.swap_greed; + } + if self.unicode.is_none() { + self.unicode = previous.unicode; + } + } + + fn case_insensitive(&self) -> bool { + self.case_insensitive.unwrap_or(false) + } + + fn multi_line(&self) -> bool { + self.multi_line.unwrap_or(false) + } + + fn dot_matches_new_line(&self) -> bool { + self.dot_matches_new_line.unwrap_or(false) + } + + fn swap_greed(&self) -> bool { + self.swap_greed.unwrap_or(false) + } + + fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) + } +} + +fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { + let ranges: Vec<_> = ascii_class(kind) + .iter() + .cloned() + .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) + .collect(); + hir::ClassBytes::new(ranges) +} + +fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { + use crate::ast::ClassAsciiKind::*; + match *kind { + Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], + Alpha => &[('A', 'Z'), ('a', 'z')], + Ascii => &[('\x00', '\x7F')], + Blank => &[('\t', '\t'), (' ', ' ')], + Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], + Digit => &[('0', '9')], + Graph => &[('!', '~')], + Lower => &[('a', 'z')], + Print => &[(' ', '~')], + Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + Space => &[ + ('\t', '\t'), + ('\n', '\n'), + ('\x0B', '\x0B'), + ('\x0C', '\x0C'), + ('\r', '\r'), + (' ', ' '), + ], + Upper => &[('A', 'Z')], + Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], + Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], + } +} + +#[cfg(test)] +mod tests { + use crate::ast::parse::ParserBuilder; + use crate::ast::{self, Ast, Position, Span}; + use crate::hir::{self, Hir, HirKind}; + use crate::unicode::{self, ClassQuery}; + + use super::{ascii_class, TranslatorBuilder}; + + // We create these errors to compare with real hir::Errors in the tests. + // We define equality between TestError and hir::Error to disregard the + // pattern string in hir::Error, which is annoying to provide in tests. + #[derive(Clone, Debug)] + struct TestError { + span: Span, + kind: hir::ErrorKind, + } + + impl PartialEq for TestError { + fn eq(&self, other: &hir::Error) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + impl PartialEq for hir::Error { + fn eq(&self, other: &TestError) -> bool { + self.span == other.span && self.kind == other.kind + } + } + + fn parse(pattern: &str) -> Ast { + ParserBuilder::new().octal(true).build().parse(pattern).unwrap() + } + + fn t(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn t_err(pattern: &str) -> hir::Error { + TranslatorBuilder::new() + .allow_invalid_utf8(false) + .build() + .translate(pattern, &parse(pattern)) + .unwrap_err() + } + + fn t_bytes(pattern: &str) -> Hir { + TranslatorBuilder::new() + .allow_invalid_utf8(true) + .build() + .translate(pattern, &parse(pattern)) + .unwrap() + } + + fn hir_lit(s: &str) -> Hir { + match s.len() { + 0 => Hir::empty(), + _ => { + let lits = s + .chars() + .map(hir::Literal::Unicode) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_blit(s: &[u8]) -> Hir { + match s.len() { + 0 => Hir::empty(), + 1 => Hir::literal(hir::Literal::Byte(s[0])), + _ => { + let lits = s + .iter() + .cloned() + .map(hir::Literal::Byte) + .map(Hir::literal) + .collect(); + Hir::concat(lits) + } + } + } + + fn hir_group(i: u32, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureIndex(i), + hir: Box::new(expr), + }) + } + + fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::CaptureName { + name: name.to_string(), + index: i, + }, + hir: Box::new(expr), + }) + } + + fn hir_group_nocap(expr: Hir) -> Hir { + Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(expr), + }) + } + + fn hir_quest(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy, + hir: Box::new(expr), + }) + } + + fn hir_star(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy, + hir: Box::new(expr), + }) + } + + fn hir_plus(greedy: bool, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy, + hir: Box::new(expr), + }) + } + + fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::Range(range), + greedy, + hir: Box::new(expr), + }) + } + + fn hir_alt(alts: Vec) -> Hir { + Hir::alternation(alts) + } + + fn hir_cat(exprs: Vec) -> Hir { + Hir::concat(exprs) + } + + #[allow(dead_code)] + fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { + Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) + } + + #[allow(dead_code)] + fn hir_uclass_perl_word() -> Hir { + Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) + } + + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| { + assert!(s as u32 <= 0x7F); + assert!(e as u32 <= 0x7F); + hir::ClassBytesRange::new(s as u8, e as u8) + }) + .collect(); + Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + } + + fn hir_case_fold(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.case_fold_simple(); + Hir::class(cls) + } + _ => panic!("cannot case fold non-class Hir expr"), + } + } + + fn hir_negate(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Class(mut cls) => { + cls.negate(); + Hir::class(cls) + } + _ => panic!("cannot negate non-class Hir expr"), + } + } + + #[allow(dead_code)] + fn hir_union(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.union(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot union non-class Hir exprs"), + } + } + + #[allow(dead_code)] + fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { + use crate::hir::Class::{Bytes, Unicode}; + + match (expr1.into_kind(), expr2.into_kind()) { + (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Unicode(c1)) + } + (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { + c1.difference(&c2); + Hir::class(hir::Class::Bytes(c1)) + } + _ => panic!("cannot difference non-class Hir exprs"), + } + } + + fn hir_anchor(anchor: hir::Anchor) -> Hir { + Hir::anchor(anchor) + } + + fn hir_word(wb: hir::WordBoundary) -> Hir { + Hir::word_boundary(wb) + } + + #[test] + fn empty() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?i)"), Hir::empty()); + assert_eq!(t("()"), hir_group(1, Hir::empty())); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?P)"), hir_group_name(1, "wat", Hir::empty())); + assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); + assert_eq!( + t("()|()"), + hir_alt(vec![ + hir_group(1, Hir::empty()), + hir_group(2, Hir::empty()), + ]) + ); + assert_eq!( + t("(|b)"), + hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) + ); + assert_eq!( + t("(a|)"), + hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) + ); + assert_eq!( + t("(a||c)"), + hir_group( + 1, + hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) + ) + ); + assert_eq!( + t("(||)"), + hir_group( + 1, + hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) + ) + ); + } + + #[test] + fn literal() { + assert_eq!(t("a"), hir_lit("a")); + assert_eq!(t("(?-u)a"), hir_lit("a")); + assert_eq!(t("☃"), hir_lit("☃")); + assert_eq!(t("abcd"), hir_lit("abcd")); + + assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); + assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a")); + assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!( + t_err("(?-u)☃"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 7) + ), + } + ); + assert_eq!( + t_err(r"(?-u)\xFF"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + } + + #[test] + fn literal_case_insensitive() { + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i:a)"), + hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)ab@c"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_uclass(&[('B', 'B'), ('b', 'b')]), + hir_lit("@"), + hir_uclass(&[('C', 'C'), ('c', 'c')]), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)β"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + + assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?-u)a(?i)a(?-i)a"), + hir_cat(vec![ + hir_lit("a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_lit("a"), + ]) + ); + assert_eq!( + t("(?i-u)ab@c"), + hir_cat(vec![ + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), + hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), + hir_lit("@"), + hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), + ]) + ); + + assert_eq!( + t_bytes("(?i-u)a"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes("(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!( + t_bytes(r"(?i-u)\x61"), + hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) + ); + assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); + + assert_eq!( + t_err("(?i-u)β"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 8), + ), + } + ); + } + + #[test] + fn dot() { + assert_eq!( + t("."), + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),])); + assert_eq!( + t_bytes("(?-u)."), + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]) + ); + assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + + // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. + assert_eq!( + t_err("(?-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(6, 1, 7) + ), + } + ); + assert_eq!( + t_err("(?s-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + fn assertions() { + assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); + assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); + assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); + assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); + + assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode)); + assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); + assert_eq!( + t_bytes(r"(?-u)\B"), + hir_word(hir::WordBoundary::AsciiNegate) + ); + + assert_eq!( + t_err(r"(?-u)\B"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + fn group() { + assert_eq!(t("(a)"), hir_group(1, hir_lit("a"))); + assert_eq!( + t("(a)(b)"), + hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ]) + ); + assert_eq!( + t("(a)|(b)"), + hir_alt(vec![ + hir_group(1, hir_lit("a")), + hir_group(2, hir_lit("b")), + ]) + ); + assert_eq!(t("(?P)"), hir_group_name(1, "foo", Hir::empty())); + assert_eq!(t("(?Pa)"), hir_group_name(1, "foo", hir_lit("a"))); + assert_eq!( + t("(?Pa)(?Pb)"), + hir_cat(vec![ + hir_group_name(1, "foo", hir_lit("a")), + hir_group_name(2, "bar", hir_lit("b")), + ]) + ); + assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); + assert_eq!( + t("(?:a)(b)"), + hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_group(1, hir_lit("b")), + ]) + ); + assert_eq!( + t("(a)(?:b)(c)"), + hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_nocap(hir_lit("b")), + hir_group(2, hir_lit("c")), + ]) + ); + assert_eq!( + t("(a)(?Pb)(c)"), + hir_cat(vec![ + hir_group(1, hir_lit("a")), + hir_group_name(2, "foo", hir_lit("b")), + hir_group(3, hir_lit("c")), + ]) + ); + assert_eq!(t("()"), hir_group(1, Hir::empty())); + assert_eq!(t("((?i))"), hir_group(1, Hir::empty())); + assert_eq!(t("((?x))"), hir_group(1, Hir::empty())); + assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty()))); + } + + #[test] + fn flags() { + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i:a)a"), + hir_cat(vec![ + hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), + hir_lit("a"), + ]) + ); + assert_eq!( + t("(?i-u:a)β"), + hir_cat(vec![ + hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("β"), + ]) + ); + assert_eq!( + t("(?:(?i-u)a)b"), + hir_cat(vec![ + hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("b"), + ]) + ); + assert_eq!( + t("((?i-u)a)b"), + hir_cat(vec![ + hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_lit("b"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?-i:a)a"), + hir_cat(vec![ + hir_group_nocap(hir_lit("a")), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?im)a^(?i-m)a^"), + hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartLine), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_anchor(hir::Anchor::StartText), + ]) + ); + assert_eq!( + t("(?U)a*a*?(?-U)a*a*?"), + hir_cat(vec![ + hir_star(false, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(true, hir_lit("a")), + hir_star(false, hir_lit("a")), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?:a(?i)a)a"), + hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_lit("a"), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ])), + hir_lit("a"), + ]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)(?:a(?-i)a)a"), + hir_cat(vec![ + hir_group_nocap(hir_cat(vec![ + hir_uclass(&[('A', 'A'), ('a', 'a')]), + hir_lit("a"), + ])), + hir_uclass(&[('A', 'A'), ('a', 'a')]), + ]) + ); + } + + #[test] + fn escape() { + assert_eq!( + t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), + hir_lit(r"\.+*?()|[]{}^$#") + ); + } + + #[test] + fn repetition() { + assert_eq!(t("a?"), hir_quest(true, hir_lit("a"))); + assert_eq!(t("a*"), hir_star(true, hir_lit("a"))); + assert_eq!(t("a+"), hir_plus(true, hir_lit("a"))); + assert_eq!(t("a??"), hir_quest(false, hir_lit("a"))); + assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); + assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); + + assert_eq!( + t("a{1}"), + hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,}"), + hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,2}"), + hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),) + ); + assert_eq!( + t("a{1}?"), + hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,}?"), + hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) + ); + assert_eq!( + t("a{1,2}?"), + hir_range( + false, + hir::RepetitionRange::Bounded(1, 2), + hir_lit("a"), + ) + ); + + assert_eq!( + t("ab?"), + hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + assert_eq!( + t("(ab)?"), + hir_quest( + true, + hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) + ) + ); + assert_eq!( + t("a|b?"), + hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) + ); + } + + #[test] + fn cat_alt() { + assert_eq!( + t("(ab)"), + hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) + ); + assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),])); + assert_eq!( + t("a|b|c"), + hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) + ); + assert_eq!( + t("ab|bc|cd"), + hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + ); + assert_eq!( + t("(a|b)"), + hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),])) + ); + assert_eq!( + t("(a|b|c)"), + hir_group( + 1, + hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) + ) + ); + assert_eq!( + t("(ab|bc|cd)"), + hir_group( + 1, + hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + ) + ); + assert_eq!( + t("(ab|(bc|(cd)))"), + hir_group( + 1, + hir_alt(vec![ + hir_lit("ab"), + hir_group( + 2, + hir_alt(vec![ + hir_lit("bc"), + hir_group(3, hir_lit("cd")), + ]) + ), + ]) + ) + ); + } + + #[test] + fn class_ascii() { + assert_eq!( + t("[[:alnum:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) + ); + assert_eq!( + t("[[:alpha:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) + ); + assert_eq!( + t("[[:ascii:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) + ); + assert_eq!( + t("[[:blank:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) + ); + assert_eq!( + t("[[:cntrl:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) + ); + assert_eq!( + t("[[:digit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t("[[:graph:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) + ); + assert_eq!( + t("[[:lower:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) + ); + assert_eq!( + t("[[:print:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) + ); + assert_eq!( + t("[[:punct:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) + ); + assert_eq!( + t("[[:space:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t("[[:upper:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) + ); + assert_eq!( + t("[[:word:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) + ); + assert_eq!( + t("[[:xdigit:]]"), + hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) + ); + + assert_eq!( + t("[[:^lower:]]"), + hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[:lower:]]"), + hir_uclass(&[ + ('A', 'Z'), + ('a', 'z'), + ('\u{17F}', '\u{17F}'), + ('\u{212A}', '\u{212A}'), + ]) + ); + + assert_eq!( + t("(?-u)[[:lower:]]"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)) + ); + assert_eq!( + t("(?i-u)[[:lower:]]"), + hir_case_fold(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Lower + ))) + ); + + assert_eq!( + t_err("(?-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(16, 1, 17) + ), + } + ); + assert_eq!( + t_err("(?i-u)[[:^lower:]]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(7, 1, 8), + Position::new(17, 1, 18) + ), + } + ); + } + + #[test] + fn class_ascii_multiple() { + // See: https://github.com/rust-lang/regex/issues/680 + assert_eq!( + t("[[:alnum:][:^ascii:]]"), + hir_union( + hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_uclass(&[('\u{80}', '\u{10FFFF}')]), + ), + ); + assert_eq!( + t_bytes("(?-u)[[:alnum:][:^ascii:]]"), + hir_union( + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_bclass(&[(0x80, 0xFF)]), + ), + ); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn class_perl() { + // Unicode + assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); + assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); + assert_eq!(t(r"\w"), hir_uclass_perl_word()); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\d"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\s"), + hir_uclass_query(ClassQuery::Binary("space")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word()); + + // Unicode, negated + assert_eq!( + t(r"\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!( + t(r"\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\D"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\S"), + hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + + // ASCII only + assert_eq!( + t(r"(?-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t(r"(?-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t(r"(?-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + ); + assert_eq!( + t(r"(?i-u)\d"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t(r"(?i-u)\s"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + ); + assert_eq!( + t(r"(?i-u)\w"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + ); + + // ASCII only, negated + assert_eq!( + t(r"(?-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t(r"(?-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space + ))) + ); + assert_eq!( + t(r"(?-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word + ))) + ); + assert_eq!( + t(r"(?i-u)\D"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t(r"(?i-u)\S"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Space + ))) + ); + assert_eq!( + t(r"(?i-u)\W"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word + ))) + ); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn class_perl_word_disabled() { + assert_eq!( + t_err(r"\w"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))] + fn class_perl_space_disabled() { + assert_eq!( + t_err(r"\s"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(all( + not(feature = "unicode-perl"), + not(feature = "unicode-gencat") + ))] + fn class_perl_digit_disabled() { + assert_eq!( + t_err(r"\d"), + TestError { + kind: hir::ErrorKind::UnicodePerlClassNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(2, 1, 3) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_gencat() { + assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z"))); + assert_eq!( + t(r"\p{Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{se PaRa ToR}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc:Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{gc=Separator}"), + hir_uclass_query(ClassQuery::Binary("Z")) + ); + assert_eq!( + t(r"\p{Other}"), + hir_uclass_query(ClassQuery::Binary("Other")) + ); + assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); + + assert_eq!( + t(r"\PZ"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + assert_eq!( + t(r"\P{gc!=separator}"), + hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) + ); + + assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); + assert_eq!( + t(r"\p{assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + assert_eq!( + t(r"\p{gc:any}"), + hir_uclass_query(ClassQuery::Binary("Any")) + ); + assert_eq!( + t(r"\p{gc:assigned}"), + hir_uclass_query(ClassQuery::Binary("Assigned")) + ); + assert_eq!( + t(r"\p{gc:ascii}"), + hir_uclass_query(ClassQuery::Binary("ASCII")) + ); + + assert_eq!( + t_err(r"(?-u)\pZ"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 9) + ), + } + ); + assert_eq!( + t_err(r"(?-u)\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodeNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(18, 1, 19) + ), + } + ); + assert_eq!( + t_err(r"\pE"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(3, 1, 4) + ), + } + ); + assert_eq!( + t_err(r"\p{Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + assert_eq!( + t_err(r"\p{gc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-gencat"))] + fn class_unicode_gencat_disabled() { + assert_eq!( + t_err(r"\p{Separator}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + + assert_eq!( + t_err(r"\p{Any}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-script")] + fn class_unicode_script() { + assert_eq!( + t(r"\p{Greek}"), + hir_uclass_query(ClassQuery::Binary("Greek")) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\p{Greek}"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)\P{Greek}"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "Greek" + )))) + ); + + assert_eq!( + t_err(r"\p{sc:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + assert_eq!( + t_err(r"\p{scx:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-script"))] + fn class_unicode_script_disabled() { + assert_eq!( + t_err(r"\p{Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 10) + ), + } + ); + + assert_eq!( + t_err(r"\p{scx:Greek}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(13, 1, 14) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-age")] + fn class_unicode_age() { + assert_eq!( + t_err(r"\p{age:Foo}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyValueNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn class_unicode_any_empty() { + assert_eq!( + t_err(r"\P{any}"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + } + + #[test] + #[cfg(not(feature = "unicode-age"))] + fn class_unicode_age_disabled() { + assert_eq!( + t_err(r"\p{age:3.0}"), + TestError { + kind: hir::ErrorKind::UnicodePropertyNotFound, + span: Span::new( + Position::new(0, 1, 1), + Position::new(11, 1, 12) + ), + } + ); + } + + #[test] + fn class_bracketed() { + assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); + assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); + assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); + assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); + assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')])); + assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')])); + assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')])); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\p{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\PZ]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\P{separator}]"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + #[cfg(all( + feature = "unicode-case", + any(feature = "unicode-perl", feature = "unicode-gencat") + ))] + assert_eq!( + t(r"(?i)[^\D]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\P{greek}]"), + hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) + ); + + assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); + assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); + assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); + + #[cfg(feature = "unicode-case")] + assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[k]"), + hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[β]"), + hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) + ); + assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); + + assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!( + t_bytes("(?-u)[^a]"), + hir_negate(hir_bclass(&[(b'a', b'a')])) + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] + assert_eq!( + t(r"[^\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\pZ]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[^\p{separator}]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[^\p{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] + assert_eq!( + t(r"(?i)[\P{greek}]"), + hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( + "greek" + )))) + ); + + // Test some weird cases. + assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); + + assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); + assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); + + assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); + assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); + assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); + + assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); + assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); + + assert_eq!( + t_err("(?-u)[^a]"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(9, 1, 10) + ), + } + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!( + t_err(r"[^\s\S]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(0, 1, 1), + Position::new(7, 1, 8) + ), + } + ); + #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] + assert_eq!( + t_err(r"(?-u)[^\s\S]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(5, 1, 6), + Position::new(12, 1, 13) + ), + } + ); + } + + #[test] + fn class_bracketed_union() { + assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[a\pZb]"), + hir_union( + hir_uclass(&[('a', 'b')]), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] + assert_eq!( + t(r"[\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[\p{age:3.0}\pZ\p{Greek}]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), + hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("cyrillic")), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ) + ) + ); + + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), + hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + )) + ); + #[cfg(all( + feature = "unicode-age", + feature = "unicode-case", + feature = "unicode-gencat", + feature = "unicode-script" + ))] + assert_eq!( + t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), + hir_negate(hir_case_fold(hir_union( + hir_uclass_query(ClassQuery::ByValue { + property_name: "age", + property_value: "3.0", + }), + hir_union( + hir_uclass_query(ClassQuery::Binary("greek")), + hir_uclass_query(ClassQuery::Binary("separator")) + ) + ))) + ); + } + + #[test] + fn class_bracketed_nested() { + assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); + + assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); + assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[a-b[^c]]"), + hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + ); + + #[cfg(feature = "unicode-case")] + assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); + #[cfg(feature = "unicode-case")] + assert_eq!( + t(r"(?i)[^a-b[^c]]"), + hir_uclass(&[('C', 'C'), ('c', 'c')]) + ); + + assert_eq!( + t_err(r"[^a-c[^c]]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(0, 1, 1), + Position::new(10, 1, 11) + ), + } + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t_err(r"(?i)[^a-c[^c]]"), + TestError { + kind: hir::ErrorKind::EmptyClassNotAllowed, + span: Span::new( + Position::new(4, 1, 5), + Position::new(14, 1, 15) + ), + } + ); + } + + #[test] + fn class_bracketed_intersect() { + assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); + assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); + assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); + assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + + assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); + assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')])); + assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); + assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); + + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&b-c]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[abc&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[[abc]&&[b-c]]"), + hir_case_fold(hir_uclass(&[('b', 'c')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_uclass(&[('c', 'x')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[c-da-b&&a-d]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + #[cfg(feature = "unicode-case")] + assert_eq!( + t("(?i)[a-d&&c-da-b]"), + hir_case_fold(hir_uclass(&[('a', 'd')])) + ); + + assert_eq!( + t("(?i-u)[abc&&b-c]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[abc&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[[abc]&&[b-c]]"), + hir_case_fold(hir_bclass(&[(b'b', b'c')])) + ); + assert_eq!( + t("(?i-u)[a-z&&b-y&&c-x]"), + hir_case_fold(hir_bclass(&[(b'c', b'x')])) + ); + assert_eq!( + t("(?i-u)[c-da-b&&a-d]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + assert_eq!( + t("(?i-u)[a-d&&c-da-b]"), + hir_case_fold(hir_bclass(&[(b'a', b'd')])) + ); + + // In `[a^]`, `^` does not need to be escaped, so it makes sense that + // `^` is also allowed to be unescaped after `&&`. + assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); + // `]` needs to be escaped after `&&` since it's not at start of class. + assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); + assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); + assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); + assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); + // Test precedence. + assert_eq!( + t(r"[a-w&&[^c-g]z]"), + hir_uclass(&[('a', 'b'), ('h', 'w')]) + ); + } + + #[test] + fn class_bracketed_intersect_negate() { + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^\w&&\d]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[\w&&\d]]"), + hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!( + t(r"[^[^\w&&\d]]"), + hir_uclass_query(ClassQuery::Binary("digit")) + ); + #[cfg(feature = "unicode-perl")] + assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); + + #[cfg(feature = "unicode-perl")] + assert_eq!( + t_bytes(r"(?-u)[^\w&&\d]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t_bytes(r"(?-u)[^[a-z&&a-c]]"), + hir_negate(hir_bclass(&[(b'a', b'c')])) + ); + assert_eq!( + t_bytes(r"(?-u)[^[\w&&\d]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Digit + ))) + ); + assert_eq!( + t_bytes(r"(?-u)[^[^\w&&\d]]"), + hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + ); + assert_eq!( + t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), + hir_negate(hir_bclass_from_char(ascii_class( + &ast::ClassAsciiKind::Word + ))) + ); + } + + #[test] + fn class_bracketed_difference() { + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"[\pL--[:ascii:]]"), + hir_difference( + hir_uclass_query(ClassQuery::Binary("letter")), + hir_uclass(&[('\0', '\x7F')]) + ) + ); + + assert_eq!( + t(r"(?-u)[[:alpha:]--[:lower:]]"), + hir_bclass(&[(b'A', b'Z')]) + ); + } + + #[test] + fn class_bracketed_symmetric_difference() { + #[cfg(feature = "unicode-script")] + assert_eq!( + t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), + hir_uclass(&[ + ('\u{0342}', '\u{0342}'), + ('\u{0345}', '\u{0345}'), + ('\u{1DC0}', '\u{1DC1}'), + ]) + ); + assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')])); + + assert_eq!( + t(r"(?-u)[a-g~~c-j]"), + hir_bclass(&[(b'a', b'b'), (b'h', b'j')]) + ); + } + + #[test] + fn ignore_whitespace() { + assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); + assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment +{ # comment + 53 # comment +} #comment"), + hir_lit("S") + ); + + assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); + assert_eq!( + t(r"(?x)\x # comment + 53 # comment"), + hir_lit("S") + ); + assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); + + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x)\p # comment +{ # comment + Separator # comment +} # comment"), + hir_uclass_query(ClassQuery::Binary("separator")) + ); + + assert_eq!( + t(r"(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +} # comment"), + hir_range( + true, + hir::RepetitionRange::Bounded(5, 10), + hir_lit("a") + ) + ); + + assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); + } + + #[test] + fn analysis_is_always_utf8() { + // Positive examples. + assert!(t_bytes(r"a").is_always_utf8()); + assert!(t_bytes(r"ab").is_always_utf8()); + assert!(t_bytes(r"(?-u)a").is_always_utf8()); + assert!(t_bytes(r"(?-u)ab").is_always_utf8()); + assert!(t_bytes(r"\xFF").is_always_utf8()); + assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); + assert!(t_bytes(r"[^a]").is_always_utf8()); + assert!(t_bytes(r"[^a][^a]").is_always_utf8()); + assert!(t_bytes(r"\b").is_always_utf8()); + assert!(t_bytes(r"\B").is_always_utf8()); + assert!(t_bytes(r"(?-u)\b").is_always_utf8()); + + // Negative examples. + assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); + assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + } + + #[test] + fn analysis_is_all_assertions() { + // Positive examples. + assert!(t(r"\b").is_all_assertions()); + assert!(t(r"\B").is_all_assertions()); + assert!(t(r"^").is_all_assertions()); + assert!(t(r"$").is_all_assertions()); + assert!(t(r"\A").is_all_assertions()); + assert!(t(r"\z").is_all_assertions()); + assert!(t(r"$^\z\A\b\B").is_all_assertions()); + assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); + assert!(t(r"^$|$^").is_all_assertions()); + assert!(t(r"((\b)+())*^").is_all_assertions()); + + // Negative examples. + assert!(!t(r"^a").is_all_assertions()); + } + + #[test] + fn analysis_is_anchored() { + // Positive examples. + assert!(t(r"^").is_anchored_start()); + assert!(t(r"$").is_anchored_end()); + assert!(t(r"^").is_line_anchored_start()); + assert!(t(r"$").is_line_anchored_end()); + + assert!(t(r"^^").is_anchored_start()); + assert!(t(r"$$").is_anchored_end()); + assert!(t(r"^^").is_line_anchored_start()); + assert!(t(r"$$").is_line_anchored_end()); + + assert!(t(r"^$").is_anchored_start()); + assert!(t(r"^$").is_anchored_end()); + assert!(t(r"^$").is_line_anchored_start()); + assert!(t(r"^$").is_line_anchored_end()); + + assert!(t(r"^foo").is_anchored_start()); + assert!(t(r"foo$").is_anchored_end()); + assert!(t(r"^foo").is_line_anchored_start()); + assert!(t(r"foo$").is_line_anchored_end()); + + assert!(t(r"^foo|^bar").is_anchored_start()); + assert!(t(r"foo$|bar$").is_anchored_end()); + assert!(t(r"^foo|^bar").is_line_anchored_start()); + assert!(t(r"foo$|bar$").is_line_anchored_end()); + + assert!(t(r"^(foo|bar)").is_anchored_start()); + assert!(t(r"(foo|bar)$").is_anchored_end()); + assert!(t(r"^(foo|bar)").is_line_anchored_start()); + assert!(t(r"(foo|bar)$").is_line_anchored_end()); + + assert!(t(r"^+").is_anchored_start()); + assert!(t(r"$+").is_anchored_end()); + assert!(t(r"^+").is_line_anchored_start()); + assert!(t(r"$+").is_line_anchored_end()); + assert!(t(r"^++").is_anchored_start()); + assert!(t(r"$++").is_anchored_end()); + assert!(t(r"^++").is_line_anchored_start()); + assert!(t(r"$++").is_line_anchored_end()); + assert!(t(r"(^)+").is_anchored_start()); + assert!(t(r"($)+").is_anchored_end()); + assert!(t(r"(^)+").is_line_anchored_start()); + assert!(t(r"($)+").is_line_anchored_end()); + + assert!(t(r"$^").is_anchored_start()); + assert!(t(r"$^").is_anchored_start()); + assert!(t(r"$^").is_line_anchored_end()); + assert!(t(r"$^").is_line_anchored_end()); + assert!(t(r"$^|^$").is_anchored_start()); + assert!(t(r"$^|^$").is_anchored_end()); + assert!(t(r"$^|^$").is_line_anchored_start()); + assert!(t(r"$^|^$").is_line_anchored_end()); + + assert!(t(r"\b^").is_anchored_start()); + assert!(t(r"$\b").is_anchored_end()); + assert!(t(r"\b^").is_line_anchored_start()); + assert!(t(r"$\b").is_line_anchored_end()); + assert!(t(r"^(?m:^)").is_anchored_start()); + assert!(t(r"(?m:$)$").is_anchored_end()); + assert!(t(r"^(?m:^)").is_line_anchored_start()); + assert!(t(r"(?m:$)$").is_line_anchored_end()); + assert!(t(r"(?m:^)^").is_anchored_start()); + assert!(t(r"$(?m:$)").is_anchored_end()); + assert!(t(r"(?m:^)^").is_line_anchored_start()); + assert!(t(r"$(?m:$)").is_line_anchored_end()); + + // Negative examples. + assert!(!t(r"(?m)^").is_anchored_start()); + assert!(!t(r"(?m)$").is_anchored_end()); + assert!(!t(r"(?m:^$)|$^").is_anchored_start()); + assert!(!t(r"(?m:^$)|$^").is_anchored_end()); + assert!(!t(r"$^|(?m:^$)").is_anchored_start()); + assert!(!t(r"$^|(?m:^$)").is_anchored_end()); + + assert!(!t(r"a^").is_anchored_start()); + assert!(!t(r"$a").is_anchored_start()); + assert!(!t(r"a^").is_line_anchored_start()); + assert!(!t(r"$a").is_line_anchored_start()); + + assert!(!t(r"a^").is_anchored_end()); + assert!(!t(r"$a").is_anchored_end()); + assert!(!t(r"a^").is_line_anchored_end()); + assert!(!t(r"$a").is_line_anchored_end()); + + assert!(!t(r"^foo|bar").is_anchored_start()); + assert!(!t(r"foo|bar$").is_anchored_end()); + assert!(!t(r"^foo|bar").is_line_anchored_start()); + assert!(!t(r"foo|bar$").is_line_anchored_end()); + + assert!(!t(r"^*").is_anchored_start()); + assert!(!t(r"$*").is_anchored_end()); + assert!(!t(r"^*").is_line_anchored_start()); + assert!(!t(r"$*").is_line_anchored_end()); + assert!(!t(r"^*+").is_anchored_start()); + assert!(!t(r"$*+").is_anchored_end()); + assert!(!t(r"^*+").is_line_anchored_start()); + assert!(!t(r"$*+").is_line_anchored_end()); + assert!(!t(r"^+*").is_anchored_start()); + assert!(!t(r"$+*").is_anchored_end()); + assert!(!t(r"^+*").is_line_anchored_start()); + assert!(!t(r"$+*").is_line_anchored_end()); + assert!(!t(r"(^)*").is_anchored_start()); + assert!(!t(r"($)*").is_anchored_end()); + assert!(!t(r"(^)*").is_line_anchored_start()); + assert!(!t(r"($)*").is_line_anchored_end()); + } + + #[test] + fn analysis_is_line_anchored() { + assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start()); + assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end()); + + assert!(t(r"(?m)^foo|^bar").is_line_anchored_start()); + assert!(t(r"(?m)foo$|bar$").is_line_anchored_end()); + + assert!(t(r"(?m)^").is_line_anchored_start()); + assert!(t(r"(?m)$").is_line_anchored_end()); + + assert!(t(r"(?m:^$)|$^").is_line_anchored_start()); + assert!(t(r"(?m:^$)|$^").is_line_anchored_end()); + + assert!(t(r"$^|(?m:^$)").is_line_anchored_start()); + assert!(t(r"$^|(?m:^$)").is_line_anchored_end()); + } + + #[test] + fn analysis_is_any_anchored() { + // Positive examples. + assert!(t(r"^").is_any_anchored_start()); + assert!(t(r"$").is_any_anchored_end()); + assert!(t(r"\A").is_any_anchored_start()); + assert!(t(r"\z").is_any_anchored_end()); + + // Negative examples. + assert!(!t(r"(?m)^").is_any_anchored_start()); + assert!(!t(r"(?m)$").is_any_anchored_end()); + assert!(!t(r"$").is_any_anchored_start()); + assert!(!t(r"^").is_any_anchored_end()); + } + + #[test] + fn analysis_is_match_empty() { + // Positive examples. + assert!(t(r"").is_match_empty()); + assert!(t(r"()").is_match_empty()); + assert!(t(r"()*").is_match_empty()); + assert!(t(r"()+").is_match_empty()); + assert!(t(r"()?").is_match_empty()); + assert!(t(r"a*").is_match_empty()); + assert!(t(r"a?").is_match_empty()); + assert!(t(r"a{0}").is_match_empty()); + assert!(t(r"a{0,}").is_match_empty()); + assert!(t(r"a{0,1}").is_match_empty()); + assert!(t(r"a{0,10}").is_match_empty()); + #[cfg(feature = "unicode-gencat")] + assert!(t(r"\pL*").is_match_empty()); + assert!(t(r"a*|b").is_match_empty()); + assert!(t(r"b|a*").is_match_empty()); + assert!(t(r"a|").is_match_empty()); + assert!(t(r"|a").is_match_empty()); + assert!(t(r"a||b").is_match_empty()); + assert!(t(r"a*a?(abcd)*").is_match_empty()); + assert!(t(r"^").is_match_empty()); + assert!(t(r"$").is_match_empty()); + assert!(t(r"(?m)^").is_match_empty()); + assert!(t(r"(?m)$").is_match_empty()); + assert!(t(r"\A").is_match_empty()); + assert!(t(r"\z").is_match_empty()); + assert!(t(r"\B").is_match_empty()); + assert!(t_bytes(r"(?-u)\B").is_match_empty()); + assert!(t(r"\b").is_match_empty()); + assert!(t(r"(?-u)\b").is_match_empty()); + + // Negative examples. + assert!(!t(r"a+").is_match_empty()); + assert!(!t(r"a{1}").is_match_empty()); + assert!(!t(r"a{1,}").is_match_empty()); + assert!(!t(r"a{1,2}").is_match_empty()); + assert!(!t(r"a{1,10}").is_match_empty()); + assert!(!t(r"b|a").is_match_empty()); + assert!(!t(r"a*a+(abcd)*").is_match_empty()); + } + + #[test] + fn analysis_is_literal() { + // Positive examples. + assert!(t(r"a").is_literal()); + assert!(t(r"ab").is_literal()); + assert!(t(r"abc").is_literal()); + assert!(t(r"(?m)abc").is_literal()); + + // Negative examples. + assert!(!t(r"").is_literal()); + assert!(!t(r"^").is_literal()); + assert!(!t(r"a|b").is_literal()); + assert!(!t(r"(a)").is_literal()); + assert!(!t(r"a+").is_literal()); + assert!(!t(r"foo(a)").is_literal()); + assert!(!t(r"(a)foo").is_literal()); + assert!(!t(r"[a]").is_literal()); + } + + #[test] + fn analysis_is_alternation_literal() { + // Positive examples. + assert!(t(r"a").is_alternation_literal()); + assert!(t(r"ab").is_alternation_literal()); + assert!(t(r"abc").is_alternation_literal()); + assert!(t(r"(?m)abc").is_alternation_literal()); + assert!(t(r"a|b").is_alternation_literal()); + assert!(t(r"a|b|c").is_alternation_literal()); + assert!(t(r"foo|bar").is_alternation_literal()); + assert!(t(r"foo|bar|baz").is_alternation_literal()); + + // Negative examples. + assert!(!t(r"").is_alternation_literal()); + assert!(!t(r"^").is_alternation_literal()); + assert!(!t(r"(a)").is_alternation_literal()); + assert!(!t(r"a+").is_alternation_literal()); + assert!(!t(r"foo(a)").is_alternation_literal()); + assert!(!t(r"(a)foo").is_alternation_literal()); + assert!(!t(r"[a]").is_alternation_literal()); + assert!(!t(r"[a]|b").is_alternation_literal()); + assert!(!t(r"a|[b]").is_alternation_literal()); + assert!(!t(r"(a)|b").is_alternation_literal()); + assert!(!t(r"a|(b)").is_alternation_literal()); + } +} diff --git a/vendor/regex-syntax/src/hir/visitor.rs b/vendor/regex-syntax/src/hir/visitor.rs new file mode 100644 index 0000000000..4f5a70909c --- /dev/null +++ b/vendor/regex-syntax/src/hir/visitor.rs @@ -0,0 +1,203 @@ +use crate::hir::{self, Hir, HirKind}; + +/// A trait for visiting the high-level IR (HIR) in depth first order. +/// +/// The principle aim of this trait is to enable callers to perform case +/// analysis on a high-level intermediate representation of a regular +/// expression without necessarily using recursion. In particular, this permits +/// callers to do case analysis with constant stack usage, which can be +/// important since the size of an HIR may be proportional to end user input. +/// +/// Typical usage of this trait involves providing an implementation and then +/// running it using the [`visit`](fn.visit.html) function. +pub trait Visitor { + /// The result of visiting an HIR. + type Output; + /// An error that visiting an HIR might return. + type Err; + + /// All implementors of `Visitor` must provide a `finish` method, which + /// yields the result of visiting the HIR or an error. + fn finish(self) -> Result; + + /// This method is called before beginning traversal of the HIR. + fn start(&mut self) {} + + /// This method is called on an `Hir` before descending into child `Hir` + /// nodes. + fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called on an `Hir` after descending all of its child + /// `Hir` nodes. + fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> { + Ok(()) + } + + /// This method is called between child nodes of an alternation. + fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { + Ok(()) + } +} + +/// Executes an implementation of `Visitor` in constant stack space. +/// +/// This function will visit every node in the given `Hir` while calling +/// appropriate methods provided by the +/// [`Visitor`](trait.Visitor.html) trait. +/// +/// The primary use case for this method is when one wants to perform case +/// analysis over an `Hir` without using a stack size proportional to the depth +/// of the `Hir`. Namely, this method will instead use constant stack space, +/// but will use heap space proportional to the size of the `Hir`. This may be +/// desirable in cases where the size of `Hir` is proportional to end user +/// input. +/// +/// If the visitor returns an error at any point, then visiting is stopped and +/// the error is returned. +pub fn visit(hir: &Hir, visitor: V) -> Result { + HeapVisitor::new().visit(hir, visitor) +} + +/// HeapVisitor visits every item in an `Hir` recursively using constant stack +/// size and a heap size proportional to the size of the `Hir`. +struct HeapVisitor<'a> { + /// A stack of `Hir` nodes. This is roughly analogous to the call stack + /// used in a typical recursive visitor. + stack: Vec<(&'a Hir, Frame<'a>)>, +} + +/// Represents a single stack frame while performing structural induction over +/// an `Hir`. +enum Frame<'a> { + /// A stack frame allocated just before descending into a repetition + /// operator's child node. + Repetition(&'a hir::Repetition), + /// A stack frame allocated just before descending into a group's child + /// node. + Group(&'a hir::Group), + /// The stack frame used while visiting every child node of a concatenation + /// of expressions. + Concat { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, + /// The stack frame used while visiting every child node of an alternation + /// of expressions. + Alternation { + /// The child node we are currently visiting. + head: &'a Hir, + /// The remaining child nodes to visit (which may be empty). + tail: &'a [Hir], + }, +} + +impl<'a> HeapVisitor<'a> { + fn new() -> HeapVisitor<'a> { + HeapVisitor { stack: vec![] } + } + + fn visit( + &mut self, + mut hir: &'a Hir, + mut visitor: V, + ) -> Result { + self.stack.clear(); + + visitor.start(); + loop { + visitor.visit_pre(hir)?; + if let Some(x) = self.induct(hir) { + let child = x.child(); + self.stack.push((hir, x)); + hir = child; + continue; + } + // No induction means we have a base case, so we can post visit + // it now. + visitor.visit_post(hir)?; + + // At this point, we now try to pop our call stack until it is + // either empty or we hit another inductive case. + loop { + let (post_hir, frame) = match self.stack.pop() { + None => return visitor.finish(), + Some((post_hir, frame)) => (post_hir, frame), + }; + // If this is a concat/alternate, then we might have additional + // inductive steps to process. + if let Some(x) = self.pop(frame) { + if let Frame::Alternation { .. } = x { + visitor.visit_alternation_in()?; + } + hir = x.child(); + self.stack.push((post_hir, x)); + break; + } + // Otherwise, we've finished visiting all the child nodes for + // this HIR, so we can post visit it now. + visitor.visit_post(post_hir)?; + } + } + } + + /// Build a stack frame for the given HIR if one is needed (which occurs if + /// and only if there are child nodes in the HIR). Otherwise, return None. + fn induct(&mut self, hir: &'a Hir) -> Option> { + match *hir.kind() { + HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), + HirKind::Group(ref x) => Some(Frame::Group(x)), + HirKind::Concat(ref x) if x.is_empty() => None, + HirKind::Concat(ref x) => { + Some(Frame::Concat { head: &x[0], tail: &x[1..] }) + } + HirKind::Alternation(ref x) if x.is_empty() => None, + HirKind::Alternation(ref x) => { + Some(Frame::Alternation { head: &x[0], tail: &x[1..] }) + } + _ => None, + } + } + + /// Pops the given frame. If the frame has an additional inductive step, + /// then return it, otherwise return `None`. + fn pop(&self, induct: Frame<'a>) -> Option> { + match induct { + Frame::Repetition(_) => None, + Frame::Group(_) => None, + Frame::Concat { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) + } + } + Frame::Alternation { tail, .. } => { + if tail.is_empty() { + None + } else { + Some(Frame::Alternation { + head: &tail[0], + tail: &tail[1..], + }) + } + } + } + } +} + +impl<'a> Frame<'a> { + /// Perform the next inductive step on this frame and return the next + /// child HIR node to visit. + fn child(&self) -> &'a Hir { + match *self { + Frame::Repetition(rep) => &rep.hir, + Frame::Group(group) => &group.hir, + Frame::Concat { head, .. } => head, + Frame::Alternation { head, .. } => head, + } + } +} diff --git a/vendor/regex-syntax/src/lib.rs b/vendor/regex-syntax/src/lib.rs new file mode 100644 index 0000000000..1dfb38af39 --- /dev/null +++ b/vendor/regex-syntax/src/lib.rs @@ -0,0 +1,312 @@ +/*! +This crate provides a robust regular expression parser. + +This crate defines two primary types: + +* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. + An abstract syntax corresponds to a *structured representation* of the + concrete syntax of a regular expression, where the concrete syntax is the + pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it + can be converted back to the original concrete syntax (modulo some details, + like whitespace). To a first approximation, the abstract syntax is complex + and difficult to analyze. +* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation + ("HIR" or "high-level IR" for short) of regular expression. It corresponds to + an intermediate state of a regular expression that sits between the abstract + syntax and the low level compiled opcodes that are eventually responsible for + executing a regular expression search. Given some high-level IR, it is not + possible to produce the original concrete syntax (although it is possible to + produce an equivalent concrete syntax, but it will likely scarcely resemble + the original pattern). To a first approximation, the high-level IR is simple + and easy to analyze. + +These two types come with conversion routines: + +* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete + syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). +* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) + converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). + +As a convenience, the above two conversion routines are combined into one via +the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first +convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. + + +# Example + +This example shows how to parse a pattern string into its HIR: + +``` +use regex_syntax::Parser; +use regex_syntax::hir::{self, Hir}; + +let hir = Parser::new().parse("a|b").unwrap(); +assert_eq!(hir, Hir::alternation(vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), +])); +``` + + +# Concrete syntax supported + +The concrete syntax is documented as part of the public API of the +[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax). + + +# Input safety + +A key feature of this library is that it is safe to use with end user facing +input. This plays a significant role in the internal implementation. In +particular: + +1. Parsers provide a `nest_limit` option that permits callers to control how + deeply nested a regular expression is allowed to be. This makes it possible + to do case analysis over an `Ast` or an `Hir` using recursion without + worrying about stack overflow. +2. Since relying on a particular stack size is brittle, this crate goes to + great lengths to ensure that all interactions with both the `Ast` and the + `Hir` do not use recursion. Namely, they use constant stack space and heap + space proportional to the size of the original pattern string (in bytes). + This includes the type's corresponding destructors. (One exception to this + is literal extraction, but this will eventually get fixed.) + + +# Error reporting + +The `Display` implementations on all `Error` types exposed in this library +provide nice human readable errors that are suitable for showing to end users +in a monospace font. + + +# Literal extraction + +This crate provides limited support for +[literal extraction from `Hir` values](hir/literal/struct.Literals.html). +Be warned that literal extraction currently uses recursion, and therefore, +stack size proportional to the size of the `Hir`. + +The purpose of literal extraction is to speed up searches. That is, if you +know a regular expression must match a prefix or suffix literal, then it is +often quicker to search for instances of that literal, and then confirm or deny +the match using the full regular expression engine. These optimizations are +done automatically in the `regex` crate. + + +# Crate features + +An important feature provided by this crate is its Unicode support. This +includes things like case folding, boolean properties, general categories, +scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`. +However, a downside of this support is that it requires bundling several +Unicode data tables that are substantial in size. + +A fair number of use cases do not require full Unicode support. For this +reason, this crate exposes a number of features to control which Unicode +data is available. + +If a regular expression attempts to use a Unicode feature that is not available +because the corresponding crate feature was disabled, then translating that +regular expression to an `Hir` will return an error. (It is still possible +construct an `Ast` for such a regular expression, since Unicode data is not +used until translation to an `Hir`.) Stated differently, enabling or disabling +any of the features below can only add or subtract from the total set of valid +regular expressions. Enabling or disabling a feature will never modify the +match semantics of a regular expression. + +The following features are available: + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Uncode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. +*/ + +#![deny(missing_docs)] +#![warn(missing_debug_implementations)] +#![forbid(unsafe_code)] + +pub use crate::error::{Error, Result}; +pub use crate::parser::{Parser, ParserBuilder}; +pub use crate::unicode::UnicodeWordError; + +pub mod ast; +mod either; +mod error; +pub mod hir; +mod parser; +mod unicode; +mod unicode_tables; +pub mod utf8; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + let mut quoted = String::new(); + escape_into(text, &mut quoted); + quoted +} + +/// Escapes all meta characters in `text` and writes the result into `buf`. +/// +/// This will append escape characters into the given buffer. The characters +/// that are appended are safe to use as a literal in a regular expression. +pub fn escape_into(text: &str, buf: &mut String) { + buf.reserve(text.len()); + for c in text.chars() { + if is_meta_character(c) { + buf.push('\\'); + } + buf.push(c); + } +} + +/// Returns true if the given character has significance in a regex. +/// +/// These are the only characters that are allowed to be escaped, with one +/// exception: an ASCII space character may be escaped when extended mode (with +/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns +/// `false`. +/// +/// Note that the set of characters for which this function returns `true` or +/// `false` is fixed and won't change in a semver compatible release. +pub fn is_meta_character(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' + | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, + _ => false, + } +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Panics +/// +/// If the `unicode-perl` feature is not enabled, then this function panics. +/// For this reason, it is recommended that callers use +/// [`try_is_word_character`](fn.try_is_word_character.html) +/// instead. +pub fn is_word_character(c: char) -> bool { + try_is_word_character(c).expect("unicode-perl feature must be enabled") +} + +/// Returns true if and only if the given character is a Unicode word +/// character. +/// +/// A Unicode word character is defined by +/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +/// In particular, a character +/// is considered a word character if it is in either of the `Alphabetic` or +/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` +/// or `Connector_Punctuation` general categories. +/// +/// # Errors +/// +/// If the `unicode-perl` feature is not enabled, then this function always +/// returns an error. +pub fn try_is_word_character( + c: char, +) -> std::result::Result { + unicode::is_word_character(c) +} + +/// Returns true if and only if the given character is an ASCII word character. +/// +/// An ASCII word character is defined by the following character class: +/// `[_0-9a-zA-Z]'. +pub fn is_word_byte(c: u8) -> bool { + match c { + b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escape_meta() { + assert_eq!( + escape(r"\.+*?()|[]{}^$#&-~"), + r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string() + ); + } + + #[test] + fn word_byte() { + assert!(is_word_byte(b'a')); + assert!(!is_word_byte(b'-')); + } + + #[test] + #[cfg(feature = "unicode-perl")] + fn word_char() { + assert!(is_word_character('a'), "ASCII"); + assert!(is_word_character('à'), "Latin-1"); + assert!(is_word_character('β'), "Greek"); + assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)"); + assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)"); + assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)"); + assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)"); + assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)"); + assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)"); + assert!(!is_word_character('-')); + assert!(!is_word_character('☃')); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_panic() { + assert!(is_word_character('a')); + } + + #[test] + #[cfg(not(feature = "unicode-perl"))] + fn word_char_disabled_error() { + assert!(try_is_word_character('a').is_err()); + } +} diff --git a/vendor/regex-syntax/src/parser.rs b/vendor/regex-syntax/src/parser.rs new file mode 100644 index 0000000000..ded95b280a --- /dev/null +++ b/vendor/regex-syntax/src/parser.rs @@ -0,0 +1,200 @@ +use crate::ast; +use crate::hir; + +use crate::Result; + +/// A builder for a regular expression parser. +/// +/// This builder permits modifying configuration options for the parser. +/// +/// This type combines the builder options for both the +/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) +/// and the +/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). +#[derive(Clone, Debug, Default)] +pub struct ParserBuilder { + ast: ast::parse::ParserBuilder, + hir: hir::translate::TranslatorBuilder, +} + +impl ParserBuilder { + /// Create a new parser builder with a default configuration. + pub fn new() -> ParserBuilder { + ParserBuilder::default() + } + + /// Build a parser from this configuration with the given pattern. + pub fn build(&self) -> Parser { + Parser { ast: self.ast.build(), hir: self.hir.build() } + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { + self.ast.nest_limit(limit); + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.octal(yes); + self + } + + /// When enabled, the parser will permit the construction of a regular + /// expression that may match invalid UTF-8. + /// + /// When disabled (the default), the parser is guaranteed to produce + /// an expression that will only ever match valid UTF-8 (otherwise, the + /// parser will return an error). + /// + /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII + /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause + /// the parser to return an error. Namely, a negated ASCII word boundary + /// can result in matching positions that aren't valid UTF-8 boundaries. + pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.allow_invalid_utf8(yes); + self + } + + /// Enable verbose mode in the regular expression. + /// + /// When enabled, verbose mode permits insignificant whitespace in many + /// places in the regular expression, as well as comments. Comments are + /// started using `#` and continue until the end of the line. + /// + /// By default, this is disabled. It may be selectively enabled in the + /// regular expression by using the `x` flag regardless of this setting. + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { + self.ast.ignore_whitespace(yes); + self + } + + /// Enable or disable the case insensitive flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `i` flag. + pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.case_insensitive(yes); + self + } + + /// Enable or disable the multi-line matching flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.multi_line(yes); + self + } + + /// Enable or disable the "dot matches any character" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `s` flag. + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.dot_matches_new_line(yes); + self + } + + /// Enable or disable the "swap greed" flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `U` flag. + pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.swap_greed(yes); + self + } + + /// Enable or disable the Unicode flag (`u`) by default. + /// + /// By default this is **enabled**. It may alternatively be selectively + /// disabled in the regular expression itself via the `u` flag. + /// + /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by + /// default), a regular expression will fail to parse if Unicode mode is + /// disabled and a sub-expression could possibly match invalid UTF-8. + pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.unicode(yes); + self + } +} + +/// A convenience parser for regular expressions. +/// +/// This parser takes as input a regular expression pattern string (the +/// "concrete syntax") and returns a high-level intermediate representation +/// (the HIR) suitable for most types of analysis. In particular, this parser +/// hides the intermediate state of producing an AST (the "abstract syntax"). +/// The AST is itself far more complex than the HIR, so this parser serves as a +/// convenience for never having to deal with it at all. +/// +/// If callers have more fine grained use cases that need an AST, then please +/// see the [`ast::parse`](ast/parse/index.html) module. +/// +/// A `Parser` can be configured in more detail via a +/// [`ParserBuilder`](struct.ParserBuilder.html). +#[derive(Clone, Debug)] +pub struct Parser { + ast: ast::parse::Parser, + hir: hir::translate::Translator, +} + +impl Parser { + /// Create a new parser with a default configuration. + /// + /// The parser can be run with `parse` method. The parse method returns + /// a high level intermediate representation of the given regular + /// expression. + /// + /// To set configuration options on the parser, use + /// [`ParserBuilder`](struct.ParserBuilder.html). + pub fn new() -> Parser { + ParserBuilder::new().build() + } + + /// Parse the regular expression into a high level intermediate + /// representation. + pub fn parse(&mut self, pattern: &str) -> Result { + let ast = self.ast.parse(pattern)?; + let hir = self.hir.translate(pattern, &ast)?; + Ok(hir) + } +} diff --git a/vendor/regex-syntax/src/unicode.rs b/vendor/regex-syntax/src/unicode.rs new file mode 100644 index 0000000000..8194d7f55b --- /dev/null +++ b/vendor/regex-syntax/src/unicode.rs @@ -0,0 +1,1001 @@ +use std::error; +use std::fmt; +use std::result; + +use crate::hir; + +/// A type alias for errors specific to Unicode handling of classes. +pub type Result = result::Result; + +/// An inclusive range of codepoints from a generated file (hence the static +/// lifetime). +type Range = &'static [(char, char)]; + +/// An error that occurs when dealing with Unicode. +/// +/// We don't impl the Error trait here because these always get converted +/// into other public errors. (This error type isn't exported.) +#[derive(Debug)] +pub enum Error { + PropertyNotFound, + PropertyValueNotFound, + // Not used when unicode-perl is enabled. + #[allow(dead_code)] + PerlClassNotFound, +} + +/// A type alias for errors specific to Unicode case folding. +pub type FoldResult = result::Result; + +/// An error that occurs when Unicode-aware simple case folding fails. +/// +/// This error can occur when the case mapping tables necessary for Unicode +/// aware case folding are unavailable. This only occurs when the +/// `unicode-case` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct CaseFoldError(()); + +impl error::Error for CaseFoldError {} + +impl fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Unicode-aware case folding is not available \ + (probably because the unicode-case feature is not enabled)" + ) + } +} + +/// An error that occurs when the Unicode-aware `\w` class is unavailable. +/// +/// This error can occur when the data tables necessary for the Unicode aware +/// Perl character class `\w` are unavailable. This only occurs when the +/// `unicode-perl` feature is disabled. (The feature is enabled by default.) +#[derive(Debug)] +pub struct UnicodeWordError(()); + +impl error::Error for UnicodeWordError {} + +impl fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Unicode-aware \\w class is not available \ + (probably because the unicode-perl feature is not enabled)" + ) + } +} + +/// Return an iterator over the equivalence class of simple case mappings +/// for the given codepoint. The equivalence class does not include the +/// given codepoint. +/// +/// If the equivalence class is empty, then this returns the next scalar +/// value that has a non-empty equivalence class, if it exists. If no such +/// scalar value exists, then `None` is returned. The point of this behavior +/// is to permit callers to avoid calling `simple_fold` more than they need +/// to, since there is some cost to fetching the equivalence class. +/// +/// This returns an error if the Unicode case folding tables are not available. +pub fn simple_fold( + c: char, +) -> FoldResult, Option>> { + #[cfg(not(feature = "unicode-case"))] + fn imp( + _: char, + ) -> FoldResult, Option>> + { + use std::option::IntoIter; + Err::, _>, _>(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp( + c: char, + ) -> FoldResult, Option>> + { + use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + + Ok(CASE_FOLDING_SIMPLE + .binary_search_by_key(&c, |&(c1, _)| c1) + .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) + .map_err(|i| { + if i >= CASE_FOLDING_SIMPLE.len() { + None + } else { + Some(CASE_FOLDING_SIMPLE[i].0) + } + })) + } + + imp(c) +} + +/// Returns true if and only if the given (inclusive) range contains at least +/// one Unicode scalar value that has a non-empty non-trivial simple case +/// mapping. +/// +/// This function panics if `end < start`. +/// +/// This returns an error if the Unicode case folding tables are not available. +pub fn contains_simple_case_mapping( + start: char, + end: char, +) -> FoldResult { + #[cfg(not(feature = "unicode-case"))] + fn imp(_: char, _: char) -> FoldResult { + Err(CaseFoldError(())) + } + + #[cfg(feature = "unicode-case")] + fn imp(start: char, end: char) -> FoldResult { + use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; + use std::cmp::Ordering; + + assert!(start <= end); + Ok(CASE_FOLDING_SIMPLE + .binary_search_by(|&(c, _)| { + if start <= c && c <= end { + Ordering::Equal + } else if c > end { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(start, end) +} + +/// A query for finding a character class defined by Unicode. This supports +/// either use of a property name directly, or lookup by property value. The +/// former generally refers to Binary properties (see UTS#44, Table 8), but +/// as a special exception (see UTS#18, Section 1.2) both general categories +/// (an enumeration) and scripts (a catalog) are supported as if each of their +/// possible values were a binary property. +/// +/// In all circumstances, property names and values are normalized and +/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. +/// +/// The lifetime `'a` refers to the shorter of the lifetimes of property name +/// and property value. +#[derive(Debug)] +pub enum ClassQuery<'a> { + /// Return a class corresponding to a Unicode binary property, named by + /// a single letter. + OneLetter(char), + /// Return a class corresponding to a Unicode binary property. + /// + /// Note that, by special exception (see UTS#18, Section 1.2), both + /// general category values and script values are permitted here as if + /// they were a binary property. + Binary(&'a str), + /// Return a class corresponding to all codepoints whose property + /// (identified by `property_name`) corresponds to the given value + /// (identified by `property_value`). + ByValue { + /// A property name. + property_name: &'a str, + /// A property value. + property_value: &'a str, + }, +} + +impl<'a> ClassQuery<'a> { + fn canonicalize(&self) -> Result { + match *self { + ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), + ClassQuery::Binary(name) => self.canonical_binary(name), + ClassQuery::ByValue { property_name, property_value } => { + let property_name = symbolic_name_normalize(property_name); + let property_value = symbolic_name_normalize(property_value); + + let canon_name = match canonical_prop(&property_name)? { + None => return Err(Error::PropertyNotFound), + Some(canon_name) => canon_name, + }; + Ok(match canon_name { + "General_Category" => { + let canon = match canonical_gencat(&property_value)? { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::GeneralCategory(canon) + } + "Script" => { + let canon = match canonical_script(&property_value)? { + None => return Err(Error::PropertyValueNotFound), + Some(canon) => canon, + }; + CanonicalClassQuery::Script(canon) + } + _ => { + let vals = match property_values(canon_name)? { + None => return Err(Error::PropertyValueNotFound), + Some(vals) => vals, + }; + let canon_val = + match canonical_value(vals, &property_value) { + None => { + return Err(Error::PropertyValueNotFound) + } + Some(canon_val) => canon_val, + }; + CanonicalClassQuery::ByValue { + property_name: canon_name, + property_value: canon_val, + } + } + }) + } + } + } + + fn canonical_binary(&self, name: &str) -> Result { + let norm = symbolic_name_normalize(name); + + // This is a special case where 'cf' refers to the 'Format' general + // category, but where the 'cf' abbreviation is also an abbreviation + // for the 'Case_Folding' property. But we want to treat it as + // a general category. (Currently, we don't even support the + // 'Case_Folding' property. But if we do in the future, users will be + // required to spell it out.) + if norm != "cf" { + if let Some(canon) = canonical_prop(&norm)? { + return Ok(CanonicalClassQuery::Binary(canon)); + } + } + if let Some(canon) = canonical_gencat(&norm)? { + return Ok(CanonicalClassQuery::GeneralCategory(canon)); + } + if let Some(canon) = canonical_script(&norm)? { + return Ok(CanonicalClassQuery::Script(canon)); + } + Err(Error::PropertyNotFound) + } +} + +/// Like ClassQuery, but its parameters have been canonicalized. This also +/// differentiates binary properties from flattened general categories and +/// scripts. +#[derive(Debug, Eq, PartialEq)] +enum CanonicalClassQuery { + /// The canonical binary property name. + Binary(&'static str), + /// The canonical general category name. + GeneralCategory(&'static str), + /// The canonical script name. + Script(&'static str), + /// An arbitrary association between property and value, both of which + /// have been canonicalized. + /// + /// Note that by construction, the property name of ByValue will never + /// be General_Category or Script. Those two cases are subsumed by the + /// eponymous variants. + ByValue { + /// The canonical property name. + property_name: &'static str, + /// The canonical property value. + property_value: &'static str, + }, +} + +/// Looks up a Unicode class given a query. If one doesn't exist, then +/// `None` is returned. +pub fn class(query: ClassQuery<'_>) -> Result { + use self::CanonicalClassQuery::*; + + match query.canonicalize()? { + Binary(name) => bool_property(name), + GeneralCategory(name) => gencat(name), + Script(name) => script(name), + ByValue { property_name: "Age", property_value } => { + let mut class = hir::ClassUnicode::empty(); + for set in ages(property_value)? { + class.union(&hir_class(set)); + } + Ok(class) + } + ByValue { property_name: "Script_Extensions", property_value } => { + script_extension(property_value) + } + ByValue { + property_name: "Grapheme_Cluster_Break", + property_value, + } => gcb(property_value), + ByValue { property_name: "Sentence_Break", property_value } => { + sb(property_value) + } + ByValue { property_name: "Word_Break", property_value } => { + wb(property_value) + } + _ => { + // What else should we support? + Err(Error::PropertyNotFound) + } + } +} + +/// Returns a Unicode aware class for \w. +/// +/// This returns an error if the data is not available for \w. +pub fn perl_word() -> Result { + #[cfg(not(feature = "unicode-perl"))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(feature = "unicode-perl")] + fn imp() -> Result { + use crate::unicode_tables::perl_word::PERL_WORD; + Ok(hir_class(PERL_WORD)) + } + + imp() +} + +/// Returns a Unicode aware class for \s. +/// +/// This returns an error if the data is not available for \s. +pub fn perl_space() -> Result { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] + fn imp() -> Result { + use crate::unicode_tables::perl_space::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + #[cfg(feature = "unicode-bool")] + fn imp() -> Result { + use crate::unicode_tables::property_bool::WHITE_SPACE; + Ok(hir_class(WHITE_SPACE)) + } + + imp() +} + +/// Returns a Unicode aware class for \d. +/// +/// This returns an error if the data is not available for \d. +pub fn perl_digit() -> Result { + #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] + fn imp() -> Result { + Err(Error::PerlClassNotFound) + } + + #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] + fn imp() -> Result { + use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + #[cfg(feature = "unicode-gencat")] + fn imp() -> Result { + use crate::unicode_tables::general_category::DECIMAL_NUMBER; + Ok(hir_class(DECIMAL_NUMBER)) + } + + imp() +} + +/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. +pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { + let hir_ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::ClassUnicode::new(hir_ranges) +} + +/// Returns true only if the given codepoint is in the `\w` character class. +/// +/// If the `unicode-perl` feature is not enabled, then this returns an error. +pub fn is_word_character(c: char) -> result::Result { + #[cfg(not(feature = "unicode-perl"))] + fn imp(_: char) -> result::Result { + Err(UnicodeWordError(())) + } + + #[cfg(feature = "unicode-perl")] + fn imp(c: char) -> result::Result { + use crate::is_word_byte; + use crate::unicode_tables::perl_word::PERL_WORD; + use std::cmp::Ordering; + + if c <= 0x7F as char && is_word_byte(c as u8) { + return Ok(true); + } + Ok(PERL_WORD + .binary_search_by(|&(start, end)| { + if start <= c && c <= end { + Ordering::Equal + } else if start > c { + Ordering::Greater + } else { + Ordering::Less + } + }) + .is_ok()) + } + + imp(c) +} + +/// A mapping of property values for a specific property. +/// +/// The first element of each tuple is a normalized property value while the +/// second element of each tuple is the corresponding canonical property +/// value. +type PropertyValues = &'static [(&'static str, &'static str)]; + +fn canonical_gencat(normalized_value: &str) -> Result> { + Ok(match normalized_value { + "any" => Some("Any"), + "assigned" => Some("Assigned"), + "ascii" => Some("ASCII"), + _ => { + let gencats = property_values("General_Category")?.unwrap(); + canonical_value(gencats, normalized_value) + } + }) +} + +fn canonical_script(normalized_value: &str) -> Result> { + let scripts = property_values("Script")?.unwrap(); + Ok(canonical_value(scripts, normalized_value)) +} + +/// Find the canonical property name for the given normalized property name. +/// +/// If no such property exists, then `None` is returned. +/// +/// The normalized property name must have been normalized according to +/// UAX44 LM3, which can be done using `symbolic_name_normalize`. +/// +/// If the property names data is not available, then an error is returned. +fn canonical_prop(normalized_name: &str) -> Result> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &str) -> Result> { + Err(Error::PropertyNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &str) -> Result> { + use crate::unicode_tables::property_names::PROPERTY_NAMES; + + Ok(PROPERTY_NAMES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_NAMES[i].1)) + } + + imp(normalized_name) +} + +/// Find the canonical property value for the given normalized property +/// value. +/// +/// The given property values should correspond to the values for the property +/// under question, which can be found using `property_values`. +/// +/// If no such property value exists, then `None` is returned. +/// +/// The normalized property value must have been normalized according to +/// UAX44 LM3, which can be done using `symbolic_name_normalize`. +fn canonical_value( + vals: PropertyValues, + normalized_value: &str, +) -> Option<&'static str> { + vals.binary_search_by_key(&normalized_value, |&(n, _)| n) + .ok() + .map(|i| vals[i].1) +} + +/// Return the table of property values for the given property name. +/// +/// If the property values data is not available, then an error is returned. +fn property_values( + canonical_property_name: &'static str, +) -> Result> { + #[cfg(not(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + )))] + fn imp(_: &'static str) -> Result> { + Err(Error::PropertyValueNotFound) + } + + #[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", + ))] + fn imp(name: &'static str) -> Result> { + use crate::unicode_tables::property_values::PROPERTY_VALUES; + + Ok(PROPERTY_VALUES + .binary_search_by_key(&name, |&(n, _)| n) + .ok() + .map(|i| PROPERTY_VALUES[i].1)) + } + + imp(canonical_property_name) +} + +// This is only used in some cases, but small enough to just let it be dead +// instead of figuring out (and maintaining) the right set of features. +#[allow(dead_code)] +fn property_set( + name_map: &'static [(&'static str, Range)], + canonical: &'static str, +) -> Option { + name_map + .binary_search_by_key(&canonical, |x| x.0) + .ok() + .map(|i| name_map[i].1) +} + +/// Returns an iterator over Unicode Age sets. Each item corresponds to a set +/// of codepoints that were added in a particular revision of Unicode. The +/// iterator yields items in chronological order. +/// +/// If the given age value isn't valid or if the data isn't available, then an +/// error is returned instead. +fn ages(canonical_age: &str) -> Result> { + #[cfg(not(feature = "unicode-age"))] + fn imp(_: &str) -> Result> { + use std::option::IntoIter; + Err::, _>(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-age")] + fn imp(canonical_age: &str) -> Result> { + use crate::unicode_tables::age; + + const AGES: &[(&str, Range)] = &[ + ("V1_1", age::V1_1), + ("V2_0", age::V2_0), + ("V2_1", age::V2_1), + ("V3_0", age::V3_0), + ("V3_1", age::V3_1), + ("V3_2", age::V3_2), + ("V4_0", age::V4_0), + ("V4_1", age::V4_1), + ("V5_0", age::V5_0), + ("V5_1", age::V5_1), + ("V5_2", age::V5_2), + ("V6_0", age::V6_0), + ("V6_1", age::V6_1), + ("V6_2", age::V6_2), + ("V6_3", age::V6_3), + ("V7_0", age::V7_0), + ("V8_0", age::V8_0), + ("V9_0", age::V9_0), + ("V10_0", age::V10_0), + ("V11_0", age::V11_0), + ("V12_0", age::V12_0), + ("V12_1", age::V12_1), + ("V13_0", age::V13_0), + ("V14_0", age::V14_0), + ("V15_0", age::V15_0), + ]; + assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); + + let pos = AGES.iter().position(|&(age, _)| canonical_age == age); + match pos { + None => Err(Error::PropertyValueNotFound), + Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), + } + } + + imp(canonical_age) +} + +/// Returns the Unicode HIR class corresponding to the given general category. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given general category could not be found, or if the general +/// category data is not available, then an error is returned. +fn gencat(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-gencat"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-gencat")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::general_category::BY_NAME; + match name { + "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), + "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), + "Assigned" => { + let mut cls = gencat("Unassigned")?; + cls.negate(); + Ok(cls) + } + name => property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound), + } + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given script. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script could not be found, or if the script data is not +/// available, then an error is returned. +fn script(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::script::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given script extension. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given script extension could not be found, or if the script data is +/// not available, then an error is returned. +fn script_extension( + canonical_name: &'static str, +) -> Result { + #[cfg(not(feature = "unicode-script"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-script")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::script_extension::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given Unicode boolean +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given boolean property could not be found, or if the boolean +/// property data is not available, then an error is returned. +fn bool_property(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-bool"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-bool")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::property_bool::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyNotFound) + } + + match canonical_name { + "Decimal_Number" => perl_digit(), + "White_Space" => perl_space(), + name => imp(name), + } +} + +/// Returns the Unicode HIR class corresponding to the given grapheme cluster +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn gcb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::grapheme_cluster_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given word break +/// property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn wb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::word_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Returns the Unicode HIR class corresponding to the given sentence +/// break property. +/// +/// Name canonicalization is assumed to be performed by the caller. +/// +/// If the given property could not be found, or if the corresponding data is +/// not available, then an error is returned. +fn sb(canonical_name: &'static str) -> Result { + #[cfg(not(feature = "unicode-segment"))] + fn imp(_: &'static str) -> Result { + Err(Error::PropertyNotFound) + } + + #[cfg(feature = "unicode-segment")] + fn imp(name: &'static str) -> Result { + use crate::unicode_tables::sentence_break::BY_NAME; + property_set(BY_NAME, name) + .map(hir_class) + .ok_or(Error::PropertyValueNotFound) + } + + imp(canonical_name) +} + +/// Like symbolic_name_normalize_bytes, but operates on a string. +fn symbolic_name_normalize(x: &str) -> String { + let mut tmp = x.as_bytes().to_vec(); + let len = symbolic_name_normalize_bytes(&mut tmp).len(); + tmp.truncate(len); + // This should always succeed because `symbolic_name_normalize_bytes` + // guarantees that `&tmp[..len]` is always valid UTF-8. + // + // N.B. We could avoid the additional UTF-8 check here, but it's unlikely + // to be worth skipping the additional safety check. A benchmark must + // justify it first. + String::from_utf8(tmp).unwrap() +} + +/// Normalize the given symbolic name in place according to UAX44-LM3. +/// +/// A "symbolic name" typically corresponds to property names and property +/// value aliases. Note, though, that it should not be applied to property +/// string values. +/// +/// The slice returned is guaranteed to be valid UTF-8 for all possible values +/// of `slice`. +/// +/// See: https://unicode.org/reports/tr44/#UAX44-LM3 +fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { + // I couldn't find a place in the standard that specified that property + // names/aliases had a particular structure (unlike character names), but + // we assume that it's ASCII only and drop anything that isn't ASCII. + let mut start = 0; + let mut starts_with_is = false; + if slice.len() >= 2 { + // Ignore any "is" prefix. + starts_with_is = slice[0..2] == b"is"[..] + || slice[0..2] == b"IS"[..] + || slice[0..2] == b"iS"[..] + || slice[0..2] == b"Is"[..]; + if starts_with_is { + start = 2; + } + } + let mut next_write = 0; + for i in start..slice.len() { + // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid + // UTF-8, we ensure that the slice contains only ASCII bytes. In + // particular, we drop every non-ASCII byte from the normalized string. + let b = slice[i]; + if b == b' ' || b == b'_' || b == b'-' { + continue; + } else if b'A' <= b && b <= b'Z' { + slice[next_write] = b + (b'a' - b'A'); + next_write += 1; + } else if b <= 0x7F { + slice[next_write] = b; + next_write += 1; + } + } + // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally + // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross + // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it + // is actually an alias for the 'Other' general category. + if starts_with_is && next_write == 1 && slice[0] == b'c' { + slice[0] = b'i'; + slice[1] = b's'; + slice[2] = b'c'; + next_write = 3; + } + &mut slice[..next_write] +} + +#[cfg(test)] +mod tests { + use super::{ + contains_simple_case_mapping, simple_fold, symbolic_name_normalize, + symbolic_name_normalize_bytes, + }; + + #[cfg(feature = "unicode-case")] + fn simple_fold_ok(c: char) -> impl Iterator { + simple_fold(c).unwrap().unwrap() + } + + #[cfg(feature = "unicode-case")] + fn simple_fold_err(c: char) -> Option { + match simple_fold(c).unwrap() { + Ok(_) => unreachable!("simple_fold returned Ok iterator"), + Err(next) => next, + } + } + + #[cfg(feature = "unicode-case")] + fn contains_case_map(start: char, end: char) -> bool { + contains_simple_case_mapping(start, end).unwrap() + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_k() { + let xs: Vec = simple_fold_ok('k').collect(); + assert_eq!(xs, vec!['K', 'K']); + + let xs: Vec = simple_fold_ok('K').collect(); + assert_eq!(xs, vec!['k', 'K']); + + let xs: Vec = simple_fold_ok('K').collect(); + assert_eq!(xs, vec!['K', 'k']); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_a() { + let xs: Vec = simple_fold_ok('a').collect(); + assert_eq!(xs, vec!['A']); + + let xs: Vec = simple_fold_ok('A').collect(); + assert_eq!(xs, vec!['a']); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_empty() { + assert_eq!(Some('A'), simple_fold_err('?')); + assert_eq!(Some('A'), simple_fold_err('@')); + assert_eq!(Some('a'), simple_fold_err('[')); + assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn simple_fold_max() { + assert_eq!(None, simple_fold_err('\u{10FFFE}')); + assert_eq!(None, simple_fold_err('\u{10FFFF}')); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn simple_fold_disabled() { + assert!(simple_fold('a').is_err()); + } + + #[test] + #[cfg(feature = "unicode-case")] + fn range_contains() { + assert!(contains_case_map('A', 'A')); + assert!(contains_case_map('Z', 'Z')); + assert!(contains_case_map('A', 'Z')); + assert!(contains_case_map('@', 'A')); + assert!(contains_case_map('Z', '[')); + assert!(contains_case_map('☃', 'Ⰰ')); + + assert!(!contains_case_map('[', '[')); + assert!(!contains_case_map('[', '`')); + + assert!(!contains_case_map('☃', '☃')); + } + + #[test] + #[cfg(not(feature = "unicode-case"))] + fn range_contains_disabled() { + assert!(contains_simple_case_mapping('a', 'a').is_err()); + } + + #[test] + #[cfg(feature = "unicode-gencat")] + fn regression_466() { + use super::{CanonicalClassQuery, ClassQuery}; + + let q = ClassQuery::OneLetter('C'); + assert_eq!( + q.canonicalize().unwrap(), + CanonicalClassQuery::GeneralCategory("Other") + ); + } + + #[test] + fn sym_normalize() { + let sym_norm = symbolic_name_normalize; + + assert_eq!(sym_norm("Line_Break"), "linebreak"); + assert_eq!(sym_norm("Line-break"), "linebreak"); + assert_eq!(sym_norm("linebreak"), "linebreak"); + assert_eq!(sym_norm("BA"), "ba"); + assert_eq!(sym_norm("ba"), "ba"); + assert_eq!(sym_norm("Greek"), "greek"); + assert_eq!(sym_norm("isGreek"), "greek"); + assert_eq!(sym_norm("IS_Greek"), "greek"); + assert_eq!(sym_norm("isc"), "isc"); + assert_eq!(sym_norm("is c"), "isc"); + assert_eq!(sym_norm("is_c"), "isc"); + } + + #[test] + fn valid_utf8_symbolic() { + let mut x = b"abc\xFFxyz".to_vec(); + let y = symbolic_name_normalize_bytes(&mut x); + assert_eq!(y, b"abcxyz"); + } +} diff --git a/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE b/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE new file mode 100644 index 0000000000..b82826bdbd --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE @@ -0,0 +1,57 @@ +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +Unicode Data Files include all data files under the directories +http://www.unicode.org/Public/, http://www.unicode.org/reports/, +http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and +http://www.unicode.org/utility/trac/browser/. + +Unicode Data Files do not include PDF online code charts under the +directory http://www.unicode.org/Public/. + +Software includes any source code published in the Unicode Standard +or under the directories +http://www.unicode.org/Public/, http://www.unicode.org/reports/, +http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and +http://www.unicode.org/utility/trac/browser/. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2018 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in http://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. diff --git a/vendor/regex-syntax/src/unicode_tables/age.rs b/vendor/regex-syntax/src/unicode_tables/age.rs new file mode 100644 index 0000000000..71f4861e07 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/age.rs @@ -0,0 +1,1791 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate age ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("V10_0", V10_0), + ("V11_0", V11_0), + ("V12_0", V12_0), + ("V12_1", V12_1), + ("V13_0", V13_0), + ("V14_0", V14_0), + ("V15_0", V15_0), + ("V1_1", V1_1), + ("V2_0", V2_0), + ("V2_1", V2_1), + ("V3_0", V3_0), + ("V3_1", V3_1), + ("V3_2", V3_2), + ("V4_0", V4_0), + ("V4_1", V4_1), + ("V5_0", V5_0), + ("V5_1", V5_1), + ("V5_2", V5_2), + ("V6_0", V6_0), + ("V6_1", V6_1), + ("V6_2", V6_2), + ("V6_3", V6_3), + ("V7_0", V7_0), + ("V8_0", V8_0), + ("V9_0", V9_0), +]; + +pub const V10_0: &'static [(char, char)] = &[ + ('ࡠ', 'ࡪ'), + ('ৼ', '৽'), + ('\u{afa}', '\u{aff}'), + ('\u{d00}', '\u{d00}'), + ('\u{d3b}', '\u{d3c}'), + ('᳷', '᳷'), + ('\u{1df6}', '\u{1df9}'), + ('₿', '₿'), + ('⏿', '⏿'), + ('⯒', '⯒'), + ('⹅', '⹉'), + ('ㄮ', 'ㄮ'), + ('鿖', '鿪'), + ('𐌭', '𐌯'), + ('𑨀', '\u{11a47}'), + ('𑩐', '𑪃'), + ('𑪆', '𑪜'), + ('𑪞', '𑪢'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𖿡', '𖿡'), + ('𛀂', '𛄞'), + ('𛅰', '𛋻'), + ('🉠', '🉥'), + ('🛓', '🛔'), + ('🛷', '🛸'), + ('🤀', '🤋'), + ('🤟', '🤟'), + ('🤨', '🤯'), + ('🤱', '🤲'), + ('🥌', '🥌'), + ('🥟', '🥫'), + ('🦒', '🦗'), + ('🧐', '🧦'), + ('𬺰', '𮯠'), +]; + +pub const V11_0: &'static [(char, char)] = &[ + ('ՠ', 'ՠ'), + ('ֈ', 'ֈ'), + ('ׯ', 'ׯ'), + ('\u{7fd}', '߿'), + ('\u{8d3}', '\u{8d3}'), + ('\u{9fe}', '\u{9fe}'), + ('੶', '੶'), + ('\u{c04}', '\u{c04}'), + ('಄', '಄'), + ('ᡸ', 'ᡸ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('⮺', '⮼'), + ('⯓', '⯫'), + ('⯰', '⯾'), + ('⹊', '⹎'), + ('ㄯ', 'ㄯ'), + ('鿫', '鿯'), + ('ꞯ', 'ꞯ'), + ('Ꞹ', 'ꞹ'), + ('ꣾ', '\u{a8ff}'), + ('𐨴', '𐨵'), + ('𐩈', '𐩈'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐼀', '𐼧'), + ('𐼰', '𐽙'), + ('\u{110cd}', '\u{110cd}'), + ('𑅄', '𑅆'), + ('\u{1133b}', '\u{1133b}'), + ('\u{1145e}', '\u{1145e}'), + ('𑜚', '𑜚'), + ('𑠀', '𑠻'), + ('𑪝', '𑪝'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻸'), + ('𖹀', '𖺚'), + ('𘟭', '𘟱'), + ('𝋠', '𝋳'), + ('𝍲', '𝍸'), + ('𞱱', '𞲴'), + ('🄯', '🄯'), + ('🛹', '🛹'), + ('🟕', '🟘'), + ('🥍', '🥏'), + ('🥬', '🥰'), + ('🥳', '🥶'), + ('🥺', '🥺'), + ('🥼', '🥿'), + ('🦘', '🦢'), + ('🦰', '🦹'), + ('🧁', '🧂'), + ('🧧', '🧿'), + ('🩠', '🩭'), +]; + +pub const V12_0: &'static [(char, char)] = &[ + ('౷', '౷'), + ('ຆ', 'ຆ'), + ('ຉ', 'ຉ'), + ('ຌ', 'ຌ'), + ('ຎ', 'ຓ'), + ('ຘ', 'ຘ'), + ('ຠ', 'ຠ'), + ('ຨ', 'ຩ'), + ('ຬ', 'ຬ'), + ('\u{eba}', '\u{eba}'), + ('ᳺ', 'ᳺ'), + ('⯉', '⯉'), + ('⯿', '⯿'), + ('⹏', '⹏'), + ('Ꞻ', 'ꞿ'), + ('Ꟃ', 'Ᶎ'), + ('ꭦ', 'ꭧ'), + ('𐿠', '𐿶'), + ('𑑟', '𑑟'), + ('𑚸', '𑚸'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧤'), + ('𑪄', '𑪅'), + ('𑿀', '𑿱'), + ('𑿿', '𑿿'), + ('\u{13430}', '\u{13438}'), + ('𖽅', '𖽊'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽿', '𖾇'), + ('𖿢', '𖿣'), + ('𘟲', '𘟷'), + ('𛅐', '𛅒'), + ('𛅤', '𛅧'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅏'), + ('𞋀', '𞋹'), + ('𞋿', '𞋿'), + ('𞥋', '𞥋'), + ('𞴁', '𞴽'), + ('🅬', '🅬'), + ('🛕', '🛕'), + ('🛺', '🛺'), + ('🟠', '🟫'), + ('🤍', '🤏'), + ('🤿', '🤿'), + ('🥱', '🥱'), + ('🥻', '🥻'), + ('🦥', '🦪'), + ('🦮', '🦯'), + ('🦺', '🦿'), + ('🧃', '🧊'), + ('🧍', '🧏'), + ('🨀', '🩓'), + ('🩰', '🩳'), + ('🩸', '🩺'), + ('🪀', '🪂'), + ('🪐', '🪕'), +]; + +pub const V12_1: &'static [(char, char)] = &[('㋿', '㋿')]; + +pub const V13_0: &'static [(char, char)] = &[ + ('ࢾ', 'ࣇ'), + ('\u{b55}', '\u{b55}'), + ('ഄ', 'ഄ'), + ('\u{d81}', '\u{d81}'), + ('\u{1abf}', '\u{1ac0}'), + ('⮗', '⮗'), + ('⹐', '⹒'), + ('ㆻ', 'ㆿ'), + ('䶶', '䶿'), + ('鿰', '鿼'), + ('Ꟈ', 'ꟊ'), + ('Ꟶ', 'ꟶ'), + ('\u{a82c}', '\u{a82c}'), + ('ꭨ', '꭫'), + ('𐆜', '𐆜'), + ('𐺀', '𐺩'), + ('\u{10eab}', '𐺭'), + ('𐺰', '𐺱'), + ('𐾰', '𐿋'), + ('𑅇', '𑅇'), + ('𑇎', '\u{111cf}'), + ('𑑚', '𑑚'), + ('𑑠', '𑑡'), + ('𑤀', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '𑥆'), + ('𑥐', '𑥙'), + ('𑾰', '𑾰'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𘫳', '𘳕'), + ('𘴀', '𘴈'), + ('🄍', '🄏'), + ('🅭', '🅯'), + ('🆭', '🆭'), + ('🛖', '🛗'), + ('🛻', '🛼'), + ('🢰', '🢱'), + ('🤌', '🤌'), + ('🥲', '🥲'), + ('🥷', '🥸'), + ('🦣', '🦤'), + ('🦫', '🦭'), + ('🧋', '🧋'), + ('🩴', '🩴'), + ('🪃', '🪆'), + ('🪖', '🪨'), + ('🪰', '🪶'), + ('🫀', '🫂'), + ('🫐', '🫖'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('𪛗', '𪛝'), + ('𰀀', '𱍊'), +]; + +pub const V14_0: &'static [(char, char)] = &[ + ('؝', '؝'), + ('ࡰ', 'ࢎ'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{89f}'), + ('ࢵ', 'ࢵ'), + ('ࣈ', '\u{8d2}'), + ('\u{c3c}', '\u{c3c}'), + ('ౝ', 'ౝ'), + ('ೝ', 'ೝ'), + ('ᜍ', 'ᜍ'), + ('᜕', '᜕'), + ('ᜟ', 'ᜟ'), + ('\u{180f}', '\u{180f}'), + ('\u{1ac1}', '\u{1ace}'), + ('ᭌ', 'ᭌ'), + ('᭽', '᭾'), + ('\u{1dfa}', '\u{1dfa}'), + ('⃀', '⃀'), + ('Ⱟ', 'Ⱟ'), + ('ⱟ', 'ⱟ'), + ('⹓', '⹝'), + ('鿽', '鿿'), + ('Ꟁ', 'ꟁ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟴ'), + ('﯂', '﯂'), + ('﵀', '﵏'), + ('﷏', '﷏'), + ('﷾', '﷿'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐽰', '𐾉'), + ('\u{11070}', '𑁵'), + ('\u{110c2}', '\u{110c2}'), + ('𑚹', '𑚹'), + ('𑝀', '𑝆'), + ('𑪰', '𑪿'), + ('𒾐', '𒿲'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛄟', '𛄢'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('𜽐', '𜿃'), + ('𝇩', '𝇪'), + ('𝼀', '𝼞'), + ('𞊐', '\u{1e2ae}'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('🛝', '🛟'), + ('🟰', '🟰'), + ('🥹', '🥹'), + ('🧌', '🧌'), + ('🩻', '🩼'), + ('🪩', '🪬'), + ('🪷', '🪺'), + ('🫃', '🫅'), + ('🫗', '🫙'), + ('🫠', '🫧'), + ('🫰', '🫶'), + ('𪛞', '𪛟'), + ('𫜵', '𫜸'), +]; + +pub const V15_0: &'static [(char, char)] = &[ + ('ೳ', 'ೳ'), + ('\u{ece}', '\u{ece}'), + ('\u{10efd}', '\u{10eff}'), + ('𑈿', '\u{11241}'), + ('𑬀', '𑬉'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '𑽙'), + ('𓐯', '𓐯'), + ('\u{13439}', '\u{13455}'), + ('𛄲', '𛄲'), + ('𛅕', '𛅕'), + ('𝋀', '𝋓'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞓐', '𞓹'), + ('🛜', '🛜'), + ('🝴', '🝶'), + ('🝻', '🝿'), + ('🟙', '🟙'), + ('🩵', '🩷'), + ('🪇', '🪈'), + ('🪭', '🪯'), + ('🪻', '🪽'), + ('🪿', '🪿'), + ('🫎', '🫏'), + ('🫚', '🫛'), + ('🫨', '🫨'), + ('🫷', '🫸'), + ('𫜹', '𫜹'), + ('𱍐', '𲎯'), +]; + +pub const V1_1: &'static [(char, char)] = &[ + ('\0', 'ǵ'), + ('Ǻ', 'ȗ'), + ('ɐ', 'ʨ'), + ('ʰ', '˞'), + ('ˠ', '˩'), + ('\u{300}', '\u{345}'), + ('\u{360}', '\u{361}'), + ('ʹ', '͵'), + ('ͺ', 'ͺ'), + (';', ';'), + ('΄', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ώ'), + ('ϐ', 'ϖ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'ϳ'), + ('Ё', 'Ќ'), + ('Ў', 'я'), + ('ё', 'ќ'), + ('ў', '\u{486}'), + ('Ґ', 'ӄ'), + ('Ӈ', 'ӈ'), + ('Ӌ', 'ӌ'), + ('Ӑ', 'ӫ'), + ('Ӯ', 'ӵ'), + ('Ӹ', 'ӹ'), + ('Ա', 'Ֆ'), + ('ՙ', '՟'), + ('ա', 'և'), + ('։', '։'), + ('\u{5b0}', '\u{5b9}'), + ('\u{5bb}', '׃'), + ('א', 'ת'), + ('װ', '״'), + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('ء', 'غ'), + ('ـ', '\u{652}'), + ('٠', '٭'), + ('\u{670}', 'ڷ'), + ('ں', 'ھ'), + ('ۀ', 'ێ'), + ('ې', '\u{6ed}'), + ('۰', '۹'), + ('\u{901}', 'ः'), + ('अ', 'ह'), + ('\u{93c}', '\u{94d}'), + ('ॐ', '\u{954}'), + ('क़', '॰'), + ('\u{981}', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', '৺'), + ('\u{a02}', '\u{a02}'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', 'ੴ'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઋ'), + ('ઍ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૠ'), + ('૦', '૯'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଶ', 'ହ'), + ('\u{b3c}', '\u{b43}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b56}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('୦', '୰'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'வ'), + ('ஷ', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('௧', '௲'), + ('ఁ', 'ః'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'ళ'), + ('వ', 'హ'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౠ', 'ౡ'), + ('౦', '౯'), + ('ಂ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೞ', 'ೞ'), + ('ೠ', 'ೡ'), + ('೦', '೯'), + ('ം', 'ഃ'), + ('അ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ന'), + ('പ', 'ഹ'), + ('\u{d3e}', '\u{d43}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('ൠ', 'ൡ'), + ('൦', '൯'), + ('ก', '\u{e3a}'), + ('฿', '๛'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ງ', 'ຈ'), + ('ຊ', 'ຊ'), + ('ຍ', 'ຍ'), + ('ດ', 'ທ'), + ('ນ', 'ຟ'), + ('ມ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ວ'), + ('ສ', 'ຫ'), + ('ອ', '\u{eb9}'), + ('\u{ebb}', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ecd}'), + ('໐', '໙'), + ('ໜ', 'ໝ'), + ('Ⴀ', 'Ⴥ'), + ('ა', 'ჶ'), + ('჻', '჻'), + ('ᄀ', 'ᅙ'), + ('ᅟ', 'ᆢ'), + ('ᆨ', 'ᇹ'), + ('Ḁ', 'ẚ'), + ('Ạ', 'ỹ'), + ('ἀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('\u{2000}', '\u{202e}'), + ('‰', '⁆'), + ('\u{206a}', '⁰'), + ('⁴', '₎'), + ('₠', '₪'), + ('\u{20d0}', '\u{20e1}'), + ('℀', 'ℸ'), + ('⅓', 'ↂ'), + ('←', '⇪'), + ('∀', '⋱'), + ('⌀', '⌀'), + ('⌂', '⍺'), + ('␀', '␤'), + ('⑀', '⑊'), + ('①', '⓪'), + ('─', '▕'), + ('■', '◯'), + ('☀', '☓'), + ('☚', '♯'), + ('✁', '✄'), + ('✆', '✉'), + ('✌', '✧'), + ('✩', '❋'), + ('❍', '❍'), + ('❏', '❒'), + ('❖', '❖'), + ('❘', '❞'), + ('❡', '❧'), + ('❶', '➔'), + ('➘', '➯'), + ('➱', '➾'), + ('\u{3000}', '〷'), + ('〿', '〿'), + ('ぁ', 'ゔ'), + ('\u{3099}', 'ゞ'), + ('ァ', 'ヾ'), + ('ㄅ', 'ㄬ'), + ('ㄱ', 'ㆎ'), + ('㆐', '㆟'), + ('㈀', '㈜'), + ('㈠', '㉃'), + ('㉠', '㉻'), + ('㉿', '㊰'), + ('㋀', '㋋'), + ('㋐', '㋾'), + ('㌀', '㍶'), + ('㍻', '㏝'), + ('㏠', '㏾'), + ('一', '龥'), + ('\u{e000}', '鶴'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('\u{fb1e}', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', '﴿'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe20}', '\u{fe23}'), + ('︰', '﹄'), + ('﹉', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('ﹰ', 'ﹲ'), + ('ﹴ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('\u{feff}', '\u{feff}'), + ('!', '~'), + ('。', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('¢', '₩'), + ('│', '○'), + ('�', '\u{ffff}'), +]; + +pub const V2_0: &'static [(char, char)] = &[ + ('\u{591}', '\u{5a1}'), + ('\u{5a3}', '\u{5af}'), + ('\u{5c4}', '\u{5c4}'), + ('ༀ', 'ཇ'), + ('ཉ', 'ཀྵ'), + ('\u{f71}', 'ྋ'), + ('\u{f90}', '\u{f95}'), + ('\u{f97}', '\u{f97}'), + ('\u{f99}', '\u{fad}'), + ('\u{fb1}', '\u{fb7}'), + ('\u{fb9}', '\u{fb9}'), + ('ẛ', 'ẛ'), + ('₫', '₫'), + ('가', '힣'), + ('\u{1fffe}', '\u{1ffff}'), + ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), + ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), + ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), + ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), + ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), + ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), + ('\u{efffe}', '\u{10ffff}'), +]; + +pub const V2_1: &'static [(char, char)] = &[('€', '€'), ('', '')]; + +pub const V3_0: &'static [(char, char)] = &[ + ('Ƕ', 'ǹ'), + ('Ș', 'ȟ'), + ('Ȣ', 'ȳ'), + ('ʩ', 'ʭ'), + ('˟', '˟'), + ('˪', 'ˮ'), + ('\u{346}', '\u{34e}'), + ('\u{362}', '\u{362}'), + ('ϗ', 'ϗ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('Ѐ', 'Ѐ'), + ('Ѝ', 'Ѝ'), + ('ѐ', 'ѐ'), + ('ѝ', 'ѝ'), + ('\u{488}', '\u{489}'), + ('Ҍ', 'ҏ'), + ('Ӭ', 'ӭ'), + ('֊', '֊'), + ('\u{653}', '\u{655}'), + ('ڸ', 'ڹ'), + ('ڿ', 'ڿ'), + ('ۏ', 'ۏ'), + ('ۺ', '۾'), + ('܀', '܍'), + ('\u{70f}', 'ܬ'), + ('\u{730}', '\u{74a}'), + ('ހ', '\u{7b0}'), + ('ං', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', '෴'), + ('ཪ', 'ཪ'), + ('\u{f96}', '\u{f96}'), + ('\u{fae}', '\u{fb0}'), + ('\u{fb8}', '\u{fb8}'), + ('\u{fba}', '\u{fbc}'), + ('྾', '࿌'), + ('࿏', '࿏'), + ('က', 'အ'), + ('ဣ', 'ဧ'), + ('ဩ', 'ဪ'), + ('ာ', '\u{1032}'), + ('\u{1036}', '\u{1039}'), + ('၀', '\u{1059}'), + ('ሀ', 'ሆ'), + ('ለ', 'ቆ'), + ('ቈ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኆ'), + ('ኈ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኮ'), + ('ኰ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዎ'), + ('ዐ', 'ዖ'), + ('ዘ', 'ዮ'), + ('ደ', 'ጎ'), + ('ጐ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ጞ'), + ('ጠ', 'ፆ'), + ('ፈ', 'ፚ'), + ('፡', '፼'), + ('Ꭰ', 'Ᏼ'), + ('ᐁ', 'ᙶ'), + ('\u{1680}', '᚜'), + ('ᚠ', 'ᛰ'), + ('ក', 'ៜ'), + ('០', '៩'), + ('᠀', '\u{180e}'), + ('᠐', '᠙'), + ('ᠠ', 'ᡷ'), + ('ᢀ', '\u{18a9}'), + ('\u{202f}', '\u{202f}'), + ('⁈', '⁍'), + ('₭', '₯'), + ('\u{20e2}', '\u{20e3}'), + ('ℹ', '℺'), + ('Ↄ', 'Ↄ'), + ('⇫', '⇳'), + ('⌁', '⌁'), + ('⍻', '⍻'), + ('⍽', '⎚'), + ('␥', '␦'), + ('◰', '◷'), + ('☙', '☙'), + ('♰', '♱'), + ('⠀', '⣿'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('〸', '〺'), + ('〾', '〾'), + ('ㆠ', 'ㆷ'), + ('㐀', '䶵'), + ('ꀀ', 'ꒌ'), + ('꒐', '꒡'), + ('꒤', '꒳'), + ('꒵', '꓀'), + ('꓂', '꓄'), + ('꓆', '꓆'), + ('יִ', 'יִ'), + ('\u{fff9}', '\u{fffb}'), +]; + +pub const V3_1: &'static [(char, char)] = &[ + ('ϴ', 'ϵ'), + ('\u{fdd0}', '\u{fdef}'), + ('𐌀', '𐌞'), + ('𐌠', '𐌣'), + ('𐌰', '𐍊'), + ('𐐀', '𐐥'), + ('𐐨', '𐑍'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄪', '𝇝'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓀'), + ('𝓂', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚣'), + ('𝚨', '𝟉'), + ('𝟎', '𝟿'), + ('𠀀', '𪛖'), + ('丽', '𪘀'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const V3_2: &'static [(char, char)] = &[ + ('Ƞ', 'Ƞ'), + ('\u{34f}', '\u{34f}'), + ('\u{363}', '\u{36f}'), + ('Ϙ', 'ϙ'), + ('϶', '϶'), + ('Ҋ', 'ҋ'), + ('Ӆ', 'ӆ'), + ('Ӊ', 'ӊ'), + ('Ӎ', 'ӎ'), + ('Ԁ', 'ԏ'), + ('ٮ', 'ٯ'), + ('ޱ', 'ޱ'), + ('ჷ', 'ჸ'), + ('ᜀ', 'ᜌ'), + ('ᜎ', '\u{1714}'), + ('ᜠ', '᜶'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('⁇', '⁇'), + ('⁎', '⁒'), + ('⁗', '⁗'), + ('\u{205f}', '\u{2063}'), + ('ⁱ', 'ⁱ'), + ('₰', '₱'), + ('\u{20e4}', '\u{20ea}'), + ('ℽ', '⅋'), + ('⇴', '⇿'), + ('⋲', '⋿'), + ('⍼', '⍼'), + ('⎛', '⏎'), + ('⓫', '⓾'), + ('▖', '▟'), + ('◸', '◿'), + ('☖', '☗'), + ('♲', '♽'), + ('⚀', '⚉'), + ('❨', '❵'), + ('⟐', '⟫'), + ('⟰', '⟿'), + ('⤀', '⫿'), + ('〻', '〽'), + ('ゕ', 'ゖ'), + ('ゟ', '゠'), + ('ヿ', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㉑', '㉟'), + ('㊱', '㊿'), + ('꒢', '꒣'), + ('꒴', '꒴'), + ('꓁', '꓁'), + ('꓅', '꓅'), + ('侮', '頻'), + ('﷼', '﷼'), + ('\u{fe00}', '\u{fe0f}'), + ('﹅', '﹆'), + ('ﹳ', 'ﹳ'), + ('⦅', '⦆'), +]; + +pub const V4_0: &'static [(char, char)] = &[ + ('ȡ', 'ȡ'), + ('ȴ', 'ȶ'), + ('ʮ', 'ʯ'), + ('˯', '˿'), + ('\u{350}', '\u{357}'), + ('\u{35d}', '\u{35f}'), + ('Ϸ', 'ϻ'), + ('\u{600}', '\u{603}'), + ('؍', '\u{615}'), + ('\u{656}', '\u{658}'), + ('ۮ', 'ۯ'), + ('ۿ', 'ۿ'), + ('ܭ', 'ܯ'), + ('ݍ', 'ݏ'), + ('ऄ', 'ऄ'), + ('ঽ', 'ঽ'), + ('\u{a01}', '\u{a01}'), + ('ਃ', 'ਃ'), + ('ઌ', 'ઌ'), + ('ૡ', '\u{ae3}'), + ('૱', '૱'), + ('ଵ', 'ଵ'), + ('ୱ', 'ୱ'), + ('௳', '௺'), + ('\u{cbc}', 'ಽ'), + ('\u{17dd}', '\u{17dd}'), + ('៰', '៹'), + ('ᤀ', 'ᤜ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥀', '᥀'), + ('᥄', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('᧠', '᧿'), + ('ᴀ', 'ᵫ'), + ('⁓', '⁔'), + ('℻', '℻'), + ('⏏', '⏐'), + ('⓿', '⓿'), + ('☔', '☕'), + ('⚊', '⚑'), + ('⚠', '⚡'), + ('⬀', '⬍'), + ('㈝', '㈞'), + ('㉐', '㉐'), + ('㉼', '㉽'), + ('㋌', '㋏'), + ('㍷', '㍺'), + ('㏞', '㏟'), + ('㏿', '㏿'), + ('䷀', '䷿'), + ('﷽', '﷽'), + ('﹇', '﹈'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), + ('𐎀', '𐎝'), + ('𐎟', '𐎟'), + ('𐐦', '𐐧'), + ('𐑎', '𐒝'), + ('𐒠', '𐒩'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐠿'), + ('𝌀', '𝍖'), + ('𝓁', '𝓁'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const V4_1: &'static [(char, char)] = &[ + ('ȷ', 'Ɂ'), + ('\u{358}', '\u{35c}'), + ('ϼ', 'Ͽ'), + ('Ӷ', 'ӷ'), + ('\u{5a2}', '\u{5a2}'), + ('\u{5c5}', '\u{5c7}'), + ('؋', '؋'), + ('؞', '؞'), + ('\u{659}', '\u{65e}'), + ('ݐ', 'ݭ'), + ('ॽ', 'ॽ'), + ('ৎ', 'ৎ'), + ('ஶ', 'ஶ'), + ('௦', '௦'), + ('࿐', '࿑'), + ('ჹ', 'ჺ'), + ('ჼ', 'ჼ'), + ('ሇ', 'ሇ'), + ('ቇ', 'ቇ'), + ('ኇ', 'ኇ'), + ('ኯ', 'ኯ'), + ('ዏ', 'ዏ'), + ('ዯ', 'ዯ'), + ('ጏ', 'ጏ'), + ('ጟ', 'ጟ'), + ('ፇ', 'ፇ'), + ('\u{135f}', '፠'), + ('ᎀ', '᎙'), + ('ᦀ', 'ᦩ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('᧞', '᧟'), + ('ᨀ', '\u{1a1b}'), + ('᨞', '᨟'), + ('ᵬ', '\u{1dc3}'), + ('⁕', '⁖'), + ('⁘', '⁞'), + ('ₐ', 'ₔ'), + ('₲', '₵'), + ('\u{20eb}', '\u{20eb}'), + ('ℼ', 'ℼ'), + ('⅌', '⅌'), + ('⏑', '⏛'), + ('☘', '☘'), + ('♾', '♿'), + ('⚒', '⚜'), + ('⚢', '⚱'), + ('⟀', '⟆'), + ('⬎', '⬓'), + ('Ⰰ', 'Ⱞ'), + ('ⰰ', 'ⱞ'), + ('Ⲁ', '⳪'), + ('⳹', 'ⴥ'), + ('ⴰ', 'ⵥ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('⸀', '⸗'), + ('⸜', '⸝'), + ('㇀', '㇏'), + ('㉾', '㉾'), + ('龦', '龻'), + ('꜀', '꜖'), + ('ꠀ', '꠫'), + ('並', '龎'), + ('︐', '︙'), + ('𐅀', '𐆊'), + ('𐎠', '𐏃'), + ('𐏈', '𐏕'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨳'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '𐩇'), + ('𐩐', '𐩘'), + ('𝈀', '𝉅'), + ('𝚤', '𝚥'), +]; + +pub const V5_0: &'static [(char, char)] = &[ + ('ɂ', 'ɏ'), + ('ͻ', 'ͽ'), + ('ӏ', 'ӏ'), + ('Ӻ', 'ӿ'), + ('Ԑ', 'ԓ'), + ('\u{5ba}', '\u{5ba}'), + ('߀', 'ߺ'), + ('ॻ', 'ॼ'), + ('ॾ', 'ॿ'), + ('\u{ce2}', '\u{ce3}'), + ('ೱ', 'ೲ'), + ('\u{1b00}', 'ᭋ'), + ('᭐', '᭼'), + ('\u{1dc4}', '\u{1dca}'), + ('\u{1dfe}', '\u{1dff}'), + ('\u{20ec}', '\u{20ef}'), + ('⅍', 'ⅎ'), + ('ↄ', 'ↄ'), + ('⏜', '⏧'), + ('⚲', '⚲'), + ('⟇', '⟊'), + ('⬔', '⬚'), + ('⬠', '⬣'), + ('Ⱡ', 'ⱬ'), + ('ⱴ', 'ⱷ'), + ('ꜗ', 'ꜚ'), + ('꜠', '꜡'), + ('ꡀ', '꡷'), + ('𐤀', '𐤙'), + ('𐤟', '𐤟'), + ('𒀀', '𒍮'), + ('𒐀', '𒑢'), + ('𒑰', '𒑳'), + ('𝍠', '𝍱'), + ('𝟊', '𝟋'), +]; + +pub const V5_1: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('Ϗ', 'Ϗ'), + ('\u{487}', '\u{487}'), + ('Ԕ', 'ԣ'), + ('؆', '؊'), + ('\u{616}', '\u{61a}'), + ('ػ', 'ؿ'), + ('ݮ', 'ݿ'), + ('ॱ', 'ॲ'), + ('\u{a51}', '\u{a51}'), + ('\u{a75}', '\u{a75}'), + ('\u{b44}', '\u{b44}'), + ('\u{b62}', '\u{b63}'), + ('ௐ', 'ௐ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౙ'), + ('\u{c62}', '\u{c63}'), + ('౸', '౿'), + ('ഽ', 'ഽ'), + ('\u{d44}', '\u{d44}'), + ('\u{d62}', '\u{d63}'), + ('൰', '൵'), + ('൹', 'ൿ'), + ('ཫ', 'ཬ'), + ('࿎', '࿎'), + ('࿒', '࿔'), + ('ဢ', 'ဢ'), + ('ဨ', 'ဨ'), + ('ါ', 'ါ'), + ('\u{1033}', '\u{1035}'), + ('\u{103a}', 'ဿ'), + ('ၚ', '႙'), + ('႞', '႟'), + ('ᢪ', 'ᢪ'), + ('\u{1b80}', '᮪'), + ('ᮮ', '᮹'), + ('ᰀ', '\u{1c37}'), + ('᰻', '᱉'), + ('ᱍ', '᱿'), + ('\u{1dcb}', '\u{1de6}'), + ('ẜ', 'ẟ'), + ('Ỻ', 'ỿ'), + ('\u{2064}', '\u{2064}'), + ('\u{20f0}', '\u{20f0}'), + ('⅏', '⅏'), + ('ↅ', 'ↈ'), + ('⚝', '⚝'), + ('⚳', '⚼'), + ('⛀', '⛃'), + ('⟌', '⟌'), + ('⟬', '⟯'), + ('⬛', '⬟'), + ('⬤', '⭌'), + ('⭐', '⭔'), + ('Ɑ', 'Ɐ'), + ('ⱱ', 'ⱳ'), + ('ⱸ', 'ⱽ'), + ('\u{2de0}', '\u{2dff}'), + ('⸘', '⸛'), + ('⸞', '⸰'), + ('ㄭ', 'ㄭ'), + ('㇐', '㇣'), + ('龼', '鿃'), + ('ꔀ', 'ꘫ'), + ('Ꙁ', 'ꙟ'), + ('Ꙣ', '꙳'), + ('\u{a67c}', 'ꚗ'), + ('ꜛ', 'ꜟ'), + ('Ꜣ', 'ꞌ'), + ('ꟻ', 'ꟿ'), + ('ꢀ', '\u{a8c4}'), + ('꣎', '꣙'), + ('꤀', '꥓'), + ('꥟', '꥟'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('꩜', '꩟'), + ('\u{fe24}', '\u{fe26}'), + ('𐆐', '𐆛'), + ('𐇐', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐤠', '𐤹'), + ('𐤿', '𐤿'), + ('𝄩', '𝄩'), + ('🀀', '🀫'), + ('🀰', '🂓'), +]; + +pub const V5_2: &'static [(char, char)] = &[ + ('Ԥ', 'ԥ'), + ('ࠀ', '\u{82d}'), + ('࠰', '࠾'), + ('\u{900}', '\u{900}'), + ('ॎ', 'ॎ'), + ('\u{955}', '\u{955}'), + ('ॹ', 'ॺ'), + ('৻', '৻'), + ('࿕', '࿘'), + ('ႚ', '\u{109d}'), + ('ᅚ', 'ᅞ'), + ('ᆣ', 'ᆧ'), + ('ᇺ', 'ᇿ'), + ('᐀', '᐀'), + ('ᙷ', 'ᙿ'), + ('ᢰ', 'ᣵ'), + ('ᦪ', 'ᦫ'), + ('᧚', '᧚'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), + ('\u{1cd0}', 'ᳲ'), + ('\u{1dfd}', '\u{1dfd}'), + ('₶', '₸'), + ('⅐', '⅒'), + ('↉', '↉'), + ('⏨', '⏨'), + ('⚞', '⚟'), + ('⚽', '⚿'), + ('⛄', '⛍'), + ('⛏', '⛡'), + ('⛣', '⛣'), + ('⛨', '⛿'), + ('❗', '❗'), + ('⭕', '⭙'), + ('Ɒ', 'Ɒ'), + ('Ȿ', 'Ɀ'), + ('Ⳬ', '\u{2cf1}'), + ('⸱', '⸱'), + ('㉄', '㉏'), + ('鿄', '鿋'), + ('ꓐ', '꓿'), + ('ꚠ', '꛷'), + ('꠰', '꠹'), + ('\u{a8e0}', 'ꣻ'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧍'), + ('ꧏ', '꧙'), + ('꧞', '꧟'), + ('ꩠ', 'ꩻ'), + ('ꪀ', 'ꫂ'), + ('ꫛ', '꫟'), + ('ꯀ', '\u{abed}'), + ('꯰', '꯹'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('恵', '舘'), + ('𐡀', '𐡕'), + ('𐡗', '𐡟'), + ('𐤚', '𐤛'), + ('𐩠', '𐩿'), + ('𐬀', '𐬵'), + ('𐬹', '𐭕'), + ('𐭘', '𐭲'), + ('𐭸', '𐭿'), + ('𐰀', '𐱈'), + ('𐹠', '𐹾'), + ('\u{11080}', '𑃁'), + ('𓀀', '𓐮'), + ('🄀', '🄊'), + ('🄐', '🄮'), + ('🄱', '🄱'), + ('🄽', '🄽'), + ('🄿', '🄿'), + ('🅂', '🅂'), + ('🅆', '🅆'), + ('🅊', '🅎'), + ('🅗', '🅗'), + ('🅟', '🅟'), + ('🅹', '🅹'), + ('🅻', '🅼'), + ('🅿', '🅿'), + ('🆊', '🆍'), + ('🆐', '🆐'), + ('🈀', '🈀'), + ('🈐', '🈱'), + ('🉀', '🉈'), + ('𪜀', '𫜴'), +]; + +pub const V6_0: &'static [(char, char)] = &[ + ('Ԧ', 'ԧ'), + ('ؠ', 'ؠ'), + ('\u{65f}', '\u{65f}'), + ('ࡀ', '\u{85b}'), + ('࡞', '࡞'), + ('\u{93a}', 'ऻ'), + ('ॏ', 'ॏ'), + ('\u{956}', '\u{957}'), + ('ॳ', 'ॷ'), + ('୲', '୷'), + ('ഩ', 'ഩ'), + ('ഺ', 'ഺ'), + ('ൎ', 'ൎ'), + ('ྌ', '\u{f8f}'), + ('࿙', '࿚'), + ('\u{135d}', '\u{135e}'), + ('ᯀ', '᯳'), + ('᯼', '᯿'), + ('\u{1dfc}', '\u{1dfc}'), + ('ₕ', 'ₜ'), + ('₹', '₹'), + ('⏩', '⏳'), + ('⛎', '⛎'), + ('⛢', '⛢'), + ('⛤', '⛧'), + ('✅', '✅'), + ('✊', '✋'), + ('✨', '✨'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❟', '❠'), + ('➕', '➗'), + ('➰', '➰'), + ('➿', '➿'), + ('⟎', '⟏'), + ('⵰', '⵰'), + ('\u{2d7f}', '\u{2d7f}'), + ('ㆸ', 'ㆺ'), + ('Ꙡ', 'ꙡ'), + ('Ɥ', 'ꞎ'), + ('Ꞑ', 'ꞑ'), + ('Ꞡ', 'ꞩ'), + ('ꟺ', 'ꟺ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('﮲', '﯁'), + ('𑀀', '𑁍'), + ('𑁒', '𑁯'), + ('𖠀', '𖨸'), + ('𛀀', '𛀁'), + ('🂠', '🂮'), + ('🂱', '🂾'), + ('🃁', '🃏'), + ('🃑', '🃟'), + ('🄰', '🄰'), + ('🄲', '🄼'), + ('🄾', '🄾'), + ('🅀', '🅁'), + ('🅃', '🅅'), + ('🅇', '🅉'), + ('🅏', '🅖'), + ('🅘', '🅞'), + ('🅠', '🅩'), + ('🅰', '🅸'), + ('🅺', '🅺'), + ('🅽', '🅾'), + ('🆀', '🆉'), + ('🆎', '🆏'), + ('🆑', '🆚'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈲', '🈺'), + ('🉐', '🉑'), + ('🌀', '🌠'), + ('🌰', '🌵'), + ('🌷', '🍼'), + ('🎀', '🎓'), + ('🎠', '🏄'), + ('🏆', '🏊'), + ('🏠', '🏰'), + ('🐀', '🐾'), + ('👀', '👀'), + ('👂', '📷'), + ('📹', '📼'), + ('🔀', '🔽'), + ('🕐', '🕧'), + ('🗻', '🗿'), + ('😁', '😐'), + ('😒', '😔'), + ('😖', '😖'), + ('😘', '😘'), + ('😚', '😚'), + ('😜', '😞'), + ('😠', '😥'), + ('😨', '😫'), + ('😭', '😭'), + ('😰', '😳'), + ('😵', '🙀'), + ('🙅', '🙏'), + ('🚀', '🛅'), + ('🜀', '🝳'), + ('𫝀', '𫠝'), +]; + +pub const V6_1: &'static [(char, char)] = &[ + ('֏', '֏'), + ('\u{604}', '\u{604}'), + ('ࢠ', 'ࢠ'), + ('ࢢ', 'ࢬ'), + ('\u{8e4}', '\u{8fe}'), + ('૰', '૰'), + ('ໞ', 'ໟ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ჽ', 'ჿ'), + ('\u{1bab}', '\u{1bad}'), + ('ᮺ', 'ᮿ'), + ('᳀', '᳇'), + ('ᳳ', 'ᳶ'), + ('⟋', '⟋'), + ('⟍', '⟍'), + ('Ⳳ', 'ⳳ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⵦ', 'ⵧ'), + ('⸲', '⸻'), + ('鿌', '鿌'), + ('\u{a674}', '\u{a67b}'), + ('\u{a69f}', '\u{a69f}'), + ('Ꞓ', 'ꞓ'), + ('Ɦ', 'Ɦ'), + ('ꟸ', 'ꟹ'), + ('ꫠ', '\u{aaf6}'), + ('郞', '隷'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑅃'), + ('\u{11180}', '𑇈'), + ('𑇐', '𑇙'), + ('𑚀', '\u{116b7}'), + ('𑛀', '𑛉'), + ('𖼀', '𖽄'), + ('𖽐', '𖽾'), + ('\u{16f8f}', '𖾟'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), + ('🅪', '🅫'), + ('🕀', '🕃'), + ('😀', '😀'), + ('😑', '😑'), + ('😕', '😕'), + ('😗', '😗'), + ('😙', '😙'), + ('😛', '😛'), + ('😟', '😟'), + ('😦', '😧'), + ('😬', '😬'), + ('😮', '😯'), + ('😴', '😴'), +]; + +pub const V6_2: &'static [(char, char)] = &[('₺', '₺')]; + +pub const V6_3: &'static [(char, char)] = + &[('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}')]; + +pub const V7_0: &'static [(char, char)] = &[ + ('Ϳ', 'Ϳ'), + ('Ԩ', 'ԯ'), + ('֍', '֎'), + ('\u{605}', '\u{605}'), + ('ࢡ', 'ࢡ'), + ('ࢭ', 'ࢲ'), + ('\u{8ff}', '\u{8ff}'), + ('ॸ', 'ॸ'), + ('ঀ', 'ঀ'), + ('\u{c00}', '\u{c00}'), + ('ఴ', 'ఴ'), + ('\u{c81}', '\u{c81}'), + ('\u{d01}', '\u{d01}'), + ('෦', '෯'), + ('ᛱ', 'ᛸ'), + ('ᤝ', 'ᤞ'), + ('\u{1ab0}', '\u{1abe}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1de7}', '\u{1df5}'), + ('₻', '₽'), + ('⏴', '⏺'), + ('✀', '✀'), + ('⭍', '⭏'), + ('⭚', '⭳'), + ('⭶', '⮕'), + ('⮘', '⮹'), + ('⮽', '⯈'), + ('⯊', '⯑'), + ('⸼', '⹂'), + ('Ꚙ', 'ꚝ'), + ('ꞔ', 'ꞟ'), + ('Ɜ', 'Ɬ'), + ('Ʞ', 'Ʇ'), + ('ꟷ', 'ꟷ'), + ('ꧠ', 'ꧾ'), + ('\u{aa7c}', 'ꩿ'), + ('ꬰ', 'ꭟ'), + ('ꭤ', 'ꭥ'), + ('\u{fe27}', '\u{fe2d}'), + ('𐆋', '𐆌'), + ('𐆠', '𐆠'), + ('\u{102e0}', '𐋻'), + ('𐌟', '𐌟'), + ('𐍐', '\u{1037a}'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕯', '𐕯'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐡠', '𐢞'), + ('𐢧', '𐢯'), + ('𐪀', '𐪟'), + ('𐫀', '\u{10ae6}'), + ('𐫫', '𐫶'), + ('𐮀', '𐮑'), + ('𐮙', '𐮜'), + ('𐮩', '𐮯'), + ('\u{1107f}', '\u{1107f}'), + ('𑅐', '𑅶'), + ('𑇍', '𑇍'), + ('𑇚', '𑇚'), + ('𑇡', '𑇴'), + ('𑈀', '𑈑'), + ('𑈓', '𑈽'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11301}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133c}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑒀', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '𑗉'), + ('𑘀', '𑙄'), + ('𑙐', '𑙙'), + ('𑢠', '𑣲'), + ('𑣿', '𑣿'), + ('𑫀', '𑫸'), + ('𒍯', '𒎘'), + ('𒑣', '𒑮'), + ('𒑴', '𒑴'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩮', '𖩯'), + ('𖫐', '𖫭'), + ('\u{16af0}', '𖫵'), + ('𖬀', '𖭅'), + ('𖭐', '𖭙'), + ('𖭛', '𖭡'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𛲜', '\u{1bca3}'), + ('𞠀', '𞣄'), + ('𞣇', '\u{1e8d6}'), + ('🂿', '🂿'), + ('🃠', '🃵'), + ('🄋', '🄌'), + ('🌡', '🌬'), + ('🌶', '🌶'), + ('🍽', '🍽'), + ('🎔', '🎟'), + ('🏅', '🏅'), + ('🏋', '🏎'), + ('🏔', '🏟'), + ('🏱', '🏷'), + ('🐿', '🐿'), + ('👁', '👁'), + ('📸', '📸'), + ('📽', '📾'), + ('🔾', '🔿'), + ('🕄', '🕊'), + ('🕨', '🕹'), + ('🕻', '🖣'), + ('🖥', '🗺'), + ('🙁', '🙂'), + ('🙐', '🙿'), + ('🛆', '🛏'), + ('🛠', '🛬'), + ('🛰', '🛳'), + ('🞀', '🟔'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), +]; + +pub const V8_0: &'static [(char, char)] = &[ + ('ࢳ', 'ࢴ'), + ('\u{8e3}', '\u{8e3}'), + ('ૹ', 'ૹ'), + ('ౚ', 'ౚ'), + ('ൟ', 'ൟ'), + ('Ᏽ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('₾', '₾'), + ('↊', '↋'), + ('⯬', '⯯'), + ('鿍', '鿕'), + ('\u{a69e}', '\u{a69e}'), + ('ꞏ', 'ꞏ'), + ('Ʝ', 'ꞷ'), + ('꣼', 'ꣽ'), + ('ꭠ', 'ꭣ'), + ('ꭰ', 'ꮿ'), + ('\u{fe2e}', '\u{fe2f}'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐣻', '𐣿'), + ('𐦼', '𐦽'), + ('𐧀', '𐧏'), + ('𐧒', '𐧿'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐳺', '𐳿'), + ('\u{111c9}', '\u{111cc}'), + ('𑇛', '𑇟'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊩'), + ('\u{11300}', '\u{11300}'), + ('𑍐', '𑍐'), + ('𑗊', '\u{115dd}'), + ('𑜀', '𑜙'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜿'), + ('𒎙', '𒎙'), + ('𒒀', '𒕃'), + ('𔐀', '𔙆'), + ('𝇞', '𝇨'), + ('𝠀', '𝪋'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('🌭', '🌯'), + ('🍾', '🍿'), + ('🏏', '🏓'), + ('🏸', '🏿'), + ('📿', '📿'), + ('🕋', '🕏'), + ('🙃', '🙄'), + ('🛐', '🛐'), + ('🤐', '🤘'), + ('🦀', '🦄'), + ('🧀', '🧀'), + ('𫠠', '𬺡'), +]; + +pub const V9_0: &'static [(char, char)] = &[ + ('ࢶ', 'ࢽ'), + ('\u{8d4}', '\u{8e2}'), + ('ಀ', 'ಀ'), + ('൏', '൏'), + ('ൔ', 'ൖ'), + ('൘', '൞'), + ('൶', '൸'), + ('ᲀ', 'ᲈ'), + ('\u{1dfb}', '\u{1dfb}'), + ('⏻', '⏾'), + ('⹃', '⹄'), + ('Ɪ', 'Ɪ'), + ('\u{a8c5}', '\u{a8c5}'), + ('𐆍', '𐆎'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('\u{1123e}', '\u{1123e}'), + ('𑐀', '𑑙'), + ('𑑛', '𑑛'), + ('𑑝', '𑑝'), + ('𑙠', '𑙬'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱅'), + ('𑱐', '𑱬'), + ('𑱰', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𖿠', '𖿠'), + ('𗀀', '𘟬'), + ('𘠀', '𘫲'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞤀', '\u{1e94a}'), + ('𞥐', '𞥙'), + ('𞥞', '𞥟'), + ('🆛', '🆬'), + ('🈻', '🈻'), + ('🕺', '🕺'), + ('🖤', '🖤'), + ('🛑', '🛒'), + ('🛴', '🛶'), + ('🤙', '🤞'), + ('🤠', '🤧'), + ('🤰', '🤰'), + ('🤳', '🤾'), + ('🥀', '🥋'), + ('🥐', '🥞'), + ('🦅', '🦑'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs b/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs new file mode 100644 index 0000000000..23f9364ce9 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs @@ -0,0 +1,2888 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate case-folding-simple ucd-15.0.0 --chars --all-pairs +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[ + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k', 'K']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s', 'ſ']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K', 'K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S', 'ſ']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('µ', &['Μ', 'μ']), + ('À', &['à']), + ('Á', &['á']), + ('Â', &['â']), + ('Ã', &['ã']), + ('Ä', &['ä']), + ('Å', &['å', 'Å']), + ('Æ', &['æ']), + ('Ç', &['ç']), + ('È', &['è']), + ('É', &['é']), + ('Ê', &['ê']), + ('Ë', &['ë']), + ('Ì', &['ì']), + ('Í', &['í']), + ('Î', &['î']), + ('Ï', &['ï']), + ('Ð', &['ð']), + ('Ñ', &['ñ']), + ('Ò', &['ò']), + ('Ó', &['ó']), + ('Ô', &['ô']), + ('Õ', &['õ']), + ('Ö', &['ö']), + ('Ø', &['ø']), + ('Ù', &['ù']), + ('Ú', &['ú']), + ('Û', &['û']), + ('Ü', &['ü']), + ('Ý', &['ý']), + ('Þ', &['þ']), + ('ß', &['ẞ']), + ('à', &['À']), + ('á', &['Á']), + ('â', &['Â']), + ('ã', &['Ã']), + ('ä', &['Ä']), + ('å', &['Å', 'Å']), + ('æ', &['Æ']), + ('ç', &['Ç']), + ('è', &['È']), + ('é', &['É']), + ('ê', &['Ê']), + ('ë', &['Ë']), + ('ì', &['Ì']), + ('í', &['Í']), + ('î', &['Î']), + ('ï', &['Ï']), + ('ð', &['Ð']), + ('ñ', &['Ñ']), + ('ò', &['Ò']), + ('ó', &['Ó']), + ('ô', &['Ô']), + ('õ', &['Õ']), + ('ö', &['Ö']), + ('ø', &['Ø']), + ('ù', &['Ù']), + ('ú', &['Ú']), + ('û', &['Û']), + ('ü', &['Ü']), + ('ý', &['Ý']), + ('þ', &['Þ']), + ('ÿ', &['Ÿ']), + ('Ā', &['ā']), + ('ā', &['Ā']), + ('Ă', &['ă']), + ('ă', &['Ă']), + ('Ą', &['ą']), + ('ą', &['Ą']), + ('Ć', &['ć']), + ('ć', &['Ć']), + ('Ĉ', &['ĉ']), + ('ĉ', &['Ĉ']), + ('Ċ', &['ċ']), + ('ċ', &['Ċ']), + ('Č', &['č']), + ('č', &['Č']), + ('Ď', &['ď']), + ('ď', &['Ď']), + ('Đ', &['đ']), + ('đ', &['Đ']), + ('Ē', &['ē']), + ('ē', &['Ē']), + ('Ĕ', &['ĕ']), + ('ĕ', &['Ĕ']), + ('Ė', &['ė']), + ('ė', &['Ė']), + ('Ę', &['ę']), + ('ę', &['Ę']), + ('Ě', &['ě']), + ('ě', &['Ě']), + ('Ĝ', &['ĝ']), + ('ĝ', &['Ĝ']), + ('Ğ', &['ğ']), + ('ğ', &['Ğ']), + ('Ġ', &['ġ']), + ('ġ', &['Ġ']), + ('Ģ', &['ģ']), + ('ģ', &['Ģ']), + ('Ĥ', &['ĥ']), + ('ĥ', &['Ĥ']), + ('Ħ', &['ħ']), + ('ħ', &['Ħ']), + ('Ĩ', &['ĩ']), + ('ĩ', &['Ĩ']), + ('Ī', &['ī']), + ('ī', &['Ī']), + ('Ĭ', &['ĭ']), + ('ĭ', &['Ĭ']), + ('Į', &['į']), + ('į', &['Į']), + ('IJ', &['ij']), + ('ij', &['IJ']), + ('Ĵ', &['ĵ']), + ('ĵ', &['Ĵ']), + ('Ķ', &['ķ']), + ('ķ', &['Ķ']), + ('Ĺ', &['ĺ']), + ('ĺ', &['Ĺ']), + ('Ļ', &['ļ']), + ('ļ', &['Ļ']), + ('Ľ', &['ľ']), + ('ľ', &['Ľ']), + ('Ŀ', &['ŀ']), + ('ŀ', &['Ŀ']), + ('Ł', &['ł']), + ('ł', &['Ł']), + ('Ń', &['ń']), + ('ń', &['Ń']), + ('Ņ', &['ņ']), + ('ņ', &['Ņ']), + ('Ň', &['ň']), + ('ň', &['Ň']), + ('Ŋ', &['ŋ']), + ('ŋ', &['Ŋ']), + ('Ō', &['ō']), + ('ō', &['Ō']), + ('Ŏ', &['ŏ']), + ('ŏ', &['Ŏ']), + ('Ő', &['ő']), + ('ő', &['Ő']), + ('Œ', &['œ']), + ('œ', &['Œ']), + ('Ŕ', &['ŕ']), + ('ŕ', &['Ŕ']), + ('Ŗ', &['ŗ']), + ('ŗ', &['Ŗ']), + ('Ř', &['ř']), + ('ř', &['Ř']), + ('Ś', &['ś']), + ('ś', &['Ś']), + ('Ŝ', &['ŝ']), + ('ŝ', &['Ŝ']), + ('Ş', &['ş']), + ('ş', &['Ş']), + ('Š', &['š']), + ('š', &['Š']), + ('Ţ', &['ţ']), + ('ţ', &['Ţ']), + ('Ť', &['ť']), + ('ť', &['Ť']), + ('Ŧ', &['ŧ']), + ('ŧ', &['Ŧ']), + ('Ũ', &['ũ']), + ('ũ', &['Ũ']), + ('Ū', &['ū']), + ('ū', &['Ū']), + ('Ŭ', &['ŭ']), + ('ŭ', &['Ŭ']), + ('Ů', &['ů']), + ('ů', &['Ů']), + ('Ű', &['ű']), + ('ű', &['Ű']), + ('Ų', &['ų']), + ('ų', &['Ų']), + ('Ŵ', &['ŵ']), + ('ŵ', &['Ŵ']), + ('Ŷ', &['ŷ']), + ('ŷ', &['Ŷ']), + ('Ÿ', &['ÿ']), + ('Ź', &['ź']), + ('ź', &['Ź']), + ('Ż', &['ż']), + ('ż', &['Ż']), + ('Ž', &['ž']), + ('ž', &['Ž']), + ('ſ', &['S', 's']), + ('ƀ', &['Ƀ']), + ('Ɓ', &['ɓ']), + ('Ƃ', &['ƃ']), + ('ƃ', &['Ƃ']), + ('Ƅ', &['ƅ']), + ('ƅ', &['Ƅ']), + ('Ɔ', &['ɔ']), + ('Ƈ', &['ƈ']), + ('ƈ', &['Ƈ']), + ('Ɖ', &['ɖ']), + ('Ɗ', &['ɗ']), + ('Ƌ', &['ƌ']), + ('ƌ', &['Ƌ']), + ('Ǝ', &['ǝ']), + ('Ə', &['ə']), + ('Ɛ', &['ɛ']), + ('Ƒ', &['ƒ']), + ('ƒ', &['Ƒ']), + ('Ɠ', &['ɠ']), + ('Ɣ', &['ɣ']), + ('ƕ', &['Ƕ']), + ('Ɩ', &['ɩ']), + ('Ɨ', &['ɨ']), + ('Ƙ', &['ƙ']), + ('ƙ', &['Ƙ']), + ('ƚ', &['Ƚ']), + ('Ɯ', &['ɯ']), + ('Ɲ', &['ɲ']), + ('ƞ', &['Ƞ']), + ('Ɵ', &['ɵ']), + ('Ơ', &['ơ']), + ('ơ', &['Ơ']), + ('Ƣ', &['ƣ']), + ('ƣ', &['Ƣ']), + ('Ƥ', &['ƥ']), + ('ƥ', &['Ƥ']), + ('Ʀ', &['ʀ']), + ('Ƨ', &['ƨ']), + ('ƨ', &['Ƨ']), + ('Ʃ', &['ʃ']), + ('Ƭ', &['ƭ']), + ('ƭ', &['Ƭ']), + ('Ʈ', &['ʈ']), + ('Ư', &['ư']), + ('ư', &['Ư']), + ('Ʊ', &['ʊ']), + ('Ʋ', &['ʋ']), + ('Ƴ', &['ƴ']), + ('ƴ', &['Ƴ']), + ('Ƶ', &['ƶ']), + ('ƶ', &['Ƶ']), + ('Ʒ', &['ʒ']), + ('Ƹ', &['ƹ']), + ('ƹ', &['Ƹ']), + ('Ƽ', &['ƽ']), + ('ƽ', &['Ƽ']), + ('ƿ', &['Ƿ']), + ('DŽ', &['Dž', 'dž']), + ('Dž', &['DŽ', 'dž']), + ('dž', &['DŽ', 'Dž']), + ('LJ', &['Lj', 'lj']), + ('Lj', &['LJ', 'lj']), + ('lj', &['LJ', 'Lj']), + ('NJ', &['Nj', 'nj']), + ('Nj', &['NJ', 'nj']), + ('nj', &['NJ', 'Nj']), + ('Ǎ', &['ǎ']), + ('ǎ', &['Ǎ']), + ('Ǐ', &['ǐ']), + ('ǐ', &['Ǐ']), + ('Ǒ', &['ǒ']), + ('ǒ', &['Ǒ']), + ('Ǔ', &['ǔ']), + ('ǔ', &['Ǔ']), + ('Ǖ', &['ǖ']), + ('ǖ', &['Ǖ']), + ('Ǘ', &['ǘ']), + ('ǘ', &['Ǘ']), + ('Ǚ', &['ǚ']), + ('ǚ', &['Ǚ']), + ('Ǜ', &['ǜ']), + ('ǜ', &['Ǜ']), + ('ǝ', &['Ǝ']), + ('Ǟ', &['ǟ']), + ('ǟ', &['Ǟ']), + ('Ǡ', &['ǡ']), + ('ǡ', &['Ǡ']), + ('Ǣ', &['ǣ']), + ('ǣ', &['Ǣ']), + ('Ǥ', &['ǥ']), + ('ǥ', &['Ǥ']), + ('Ǧ', &['ǧ']), + ('ǧ', &['Ǧ']), + ('Ǩ', &['ǩ']), + ('ǩ', &['Ǩ']), + ('Ǫ', &['ǫ']), + ('ǫ', &['Ǫ']), + ('Ǭ', &['ǭ']), + ('ǭ', &['Ǭ']), + ('Ǯ', &['ǯ']), + ('ǯ', &['Ǯ']), + ('DZ', &['Dz', 'dz']), + ('Dz', &['DZ', 'dz']), + ('dz', &['DZ', 'Dz']), + ('Ǵ', &['ǵ']), + ('ǵ', &['Ǵ']), + ('Ƕ', &['ƕ']), + ('Ƿ', &['ƿ']), + ('Ǹ', &['ǹ']), + ('ǹ', &['Ǹ']), + ('Ǻ', &['ǻ']), + ('ǻ', &['Ǻ']), + ('Ǽ', &['ǽ']), + ('ǽ', &['Ǽ']), + ('Ǿ', &['ǿ']), + ('ǿ', &['Ǿ']), + ('Ȁ', &['ȁ']), + ('ȁ', &['Ȁ']), + ('Ȃ', &['ȃ']), + ('ȃ', &['Ȃ']), + ('Ȅ', &['ȅ']), + ('ȅ', &['Ȅ']), + ('Ȇ', &['ȇ']), + ('ȇ', &['Ȇ']), + ('Ȉ', &['ȉ']), + ('ȉ', &['Ȉ']), + ('Ȋ', &['ȋ']), + ('ȋ', &['Ȋ']), + ('Ȍ', &['ȍ']), + ('ȍ', &['Ȍ']), + ('Ȏ', &['ȏ']), + ('ȏ', &['Ȏ']), + ('Ȑ', &['ȑ']), + ('ȑ', &['Ȑ']), + ('Ȓ', &['ȓ']), + ('ȓ', &['Ȓ']), + ('Ȕ', &['ȕ']), + ('ȕ', &['Ȕ']), + ('Ȗ', &['ȗ']), + ('ȗ', &['Ȗ']), + ('Ș', &['ș']), + ('ș', &['Ș']), + ('Ț', &['ț']), + ('ț', &['Ț']), + ('Ȝ', &['ȝ']), + ('ȝ', &['Ȝ']), + ('Ȟ', &['ȟ']), + ('ȟ', &['Ȟ']), + ('Ƞ', &['ƞ']), + ('Ȣ', &['ȣ']), + ('ȣ', &['Ȣ']), + ('Ȥ', &['ȥ']), + ('ȥ', &['Ȥ']), + ('Ȧ', &['ȧ']), + ('ȧ', &['Ȧ']), + ('Ȩ', &['ȩ']), + ('ȩ', &['Ȩ']), + ('Ȫ', &['ȫ']), + ('ȫ', &['Ȫ']), + ('Ȭ', &['ȭ']), + ('ȭ', &['Ȭ']), + ('Ȯ', &['ȯ']), + ('ȯ', &['Ȯ']), + ('Ȱ', &['ȱ']), + ('ȱ', &['Ȱ']), + ('Ȳ', &['ȳ']), + ('ȳ', &['Ȳ']), + ('Ⱥ', &['ⱥ']), + ('Ȼ', &['ȼ']), + ('ȼ', &['Ȼ']), + ('Ƚ', &['ƚ']), + ('Ⱦ', &['ⱦ']), + ('ȿ', &['Ȿ']), + ('ɀ', &['Ɀ']), + ('Ɂ', &['ɂ']), + ('ɂ', &['Ɂ']), + ('Ƀ', &['ƀ']), + ('Ʉ', &['ʉ']), + ('Ʌ', &['ʌ']), + ('Ɇ', &['ɇ']), + ('ɇ', &['Ɇ']), + ('Ɉ', &['ɉ']), + ('ɉ', &['Ɉ']), + ('Ɋ', &['ɋ']), + ('ɋ', &['Ɋ']), + ('Ɍ', &['ɍ']), + ('ɍ', &['Ɍ']), + ('Ɏ', &['ɏ']), + ('ɏ', &['Ɏ']), + ('ɐ', &['Ɐ']), + ('ɑ', &['Ɑ']), + ('ɒ', &['Ɒ']), + ('ɓ', &['Ɓ']), + ('ɔ', &['Ɔ']), + ('ɖ', &['Ɖ']), + ('ɗ', &['Ɗ']), + ('ə', &['Ə']), + ('ɛ', &['Ɛ']), + ('ɜ', &['Ɜ']), + ('ɠ', &['Ɠ']), + ('ɡ', &['Ɡ']), + ('ɣ', &['Ɣ']), + ('ɥ', &['Ɥ']), + ('ɦ', &['Ɦ']), + ('ɨ', &['Ɨ']), + ('ɩ', &['Ɩ']), + ('ɪ', &['Ɪ']), + ('ɫ', &['Ɫ']), + ('ɬ', &['Ɬ']), + ('ɯ', &['Ɯ']), + ('ɱ', &['Ɱ']), + ('ɲ', &['Ɲ']), + ('ɵ', &['Ɵ']), + ('ɽ', &['Ɽ']), + ('ʀ', &['Ʀ']), + ('ʂ', &['Ʂ']), + ('ʃ', &['Ʃ']), + ('ʇ', &['Ʇ']), + ('ʈ', &['Ʈ']), + ('ʉ', &['Ʉ']), + ('ʊ', &['Ʊ']), + ('ʋ', &['Ʋ']), + ('ʌ', &['Ʌ']), + ('ʒ', &['Ʒ']), + ('ʝ', &['Ʝ']), + ('ʞ', &['Ʞ']), + ('\u{345}', &['Ι', 'ι', 'ι']), + ('Ͱ', &['ͱ']), + ('ͱ', &['Ͱ']), + ('Ͳ', &['ͳ']), + ('ͳ', &['Ͳ']), + ('Ͷ', &['ͷ']), + ('ͷ', &['Ͷ']), + ('ͻ', &['Ͻ']), + ('ͼ', &['Ͼ']), + ('ͽ', &['Ͽ']), + ('Ϳ', &['ϳ']), + ('Ά', &['ά']), + ('Έ', &['έ']), + ('Ή', &['ή']), + ('Ί', &['ί']), + ('Ό', &['ό']), + ('Ύ', &['ύ']), + ('Ώ', &['ώ']), + ('Α', &['α']), + ('Β', &['β', 'ϐ']), + ('Γ', &['γ']), + ('Δ', &['δ']), + ('Ε', &['ε', 'ϵ']), + ('Ζ', &['ζ']), + ('Η', &['η']), + ('Θ', &['θ', 'ϑ', 'ϴ']), + ('Ι', &['\u{345}', 'ι', 'ι']), + ('Κ', &['κ', 'ϰ']), + ('Λ', &['λ']), + ('Μ', &['µ', 'μ']), + ('Ν', &['ν']), + ('Ξ', &['ξ']), + ('Ο', &['ο']), + ('Π', &['π', 'ϖ']), + ('Ρ', &['ρ', 'ϱ']), + ('Σ', &['ς', 'σ']), + ('Τ', &['τ']), + ('Υ', &['υ']), + ('Φ', &['φ', 'ϕ']), + ('Χ', &['χ']), + ('Ψ', &['ψ']), + ('Ω', &['ω', 'Ω']), + ('Ϊ', &['ϊ']), + ('Ϋ', &['ϋ']), + ('ά', &['Ά']), + ('έ', &['Έ']), + ('ή', &['Ή']), + ('ί', &['Ί']), + ('α', &['Α']), + ('β', &['Β', 'ϐ']), + ('γ', &['Γ']), + ('δ', &['Δ']), + ('ε', &['Ε', 'ϵ']), + ('ζ', &['Ζ']), + ('η', &['Η']), + ('θ', &['Θ', 'ϑ', 'ϴ']), + ('ι', &['\u{345}', 'Ι', 'ι']), + ('κ', &['Κ', 'ϰ']), + ('λ', &['Λ']), + ('μ', &['µ', 'Μ']), + ('ν', &['Ν']), + ('ξ', &['Ξ']), + ('ο', &['Ο']), + ('π', &['Π', 'ϖ']), + ('ρ', &['Ρ', 'ϱ']), + ('ς', &['Σ', 'σ']), + ('σ', &['Σ', 'ς']), + ('τ', &['Τ']), + ('υ', &['Υ']), + ('φ', &['Φ', 'ϕ']), + ('χ', &['Χ']), + ('ψ', &['Ψ']), + ('ω', &['Ω', 'Ω']), + ('ϊ', &['Ϊ']), + ('ϋ', &['Ϋ']), + ('ό', &['Ό']), + ('ύ', &['Ύ']), + ('ώ', &['Ώ']), + ('Ϗ', &['ϗ']), + ('ϐ', &['Β', 'β']), + ('ϑ', &['Θ', 'θ', 'ϴ']), + ('ϕ', &['Φ', 'φ']), + ('ϖ', &['Π', 'π']), + ('ϗ', &['Ϗ']), + ('Ϙ', &['ϙ']), + ('ϙ', &['Ϙ']), + ('Ϛ', &['ϛ']), + ('ϛ', &['Ϛ']), + ('Ϝ', &['ϝ']), + ('ϝ', &['Ϝ']), + ('Ϟ', &['ϟ']), + ('ϟ', &['Ϟ']), + ('Ϡ', &['ϡ']), + ('ϡ', &['Ϡ']), + ('Ϣ', &['ϣ']), + ('ϣ', &['Ϣ']), + ('Ϥ', &['ϥ']), + ('ϥ', &['Ϥ']), + ('Ϧ', &['ϧ']), + ('ϧ', &['Ϧ']), + ('Ϩ', &['ϩ']), + ('ϩ', &['Ϩ']), + ('Ϫ', &['ϫ']), + ('ϫ', &['Ϫ']), + ('Ϭ', &['ϭ']), + ('ϭ', &['Ϭ']), + ('Ϯ', &['ϯ']), + ('ϯ', &['Ϯ']), + ('ϰ', &['Κ', 'κ']), + ('ϱ', &['Ρ', 'ρ']), + ('ϲ', &['Ϲ']), + ('ϳ', &['Ϳ']), + ('ϴ', &['Θ', 'θ', 'ϑ']), + ('ϵ', &['Ε', 'ε']), + ('Ϸ', &['ϸ']), + ('ϸ', &['Ϸ']), + ('Ϲ', &['ϲ']), + ('Ϻ', &['ϻ']), + ('ϻ', &['Ϻ']), + ('Ͻ', &['ͻ']), + ('Ͼ', &['ͼ']), + ('Ͽ', &['ͽ']), + ('Ѐ', &['ѐ']), + ('Ё', &['ё']), + ('Ђ', &['ђ']), + ('Ѓ', &['ѓ']), + ('Є', &['є']), + ('Ѕ', &['ѕ']), + ('І', &['і']), + ('Ї', &['ї']), + ('Ј', &['ј']), + ('Љ', &['љ']), + ('Њ', &['њ']), + ('Ћ', &['ћ']), + ('Ќ', &['ќ']), + ('Ѝ', &['ѝ']), + ('Ў', &['ў']), + ('Џ', &['џ']), + ('А', &['а']), + ('Б', &['б']), + ('В', &['в', 'ᲀ']), + ('Г', &['г']), + ('Д', &['д', 'ᲁ']), + ('Е', &['е']), + ('Ж', &['ж']), + ('З', &['з']), + ('И', &['и']), + ('Й', &['й']), + ('К', &['к']), + ('Л', &['л']), + ('М', &['м']), + ('Н', &['н']), + ('О', &['о', 'ᲂ']), + ('П', &['п']), + ('Р', &['р']), + ('С', &['с', 'ᲃ']), + ('Т', &['т', 'ᲄ', 'ᲅ']), + ('У', &['у']), + ('Ф', &['ф']), + ('Х', &['х']), + ('Ц', &['ц']), + ('Ч', &['ч']), + ('Ш', &['ш']), + ('Щ', &['щ']), + ('Ъ', &['ъ', 'ᲆ']), + ('Ы', &['ы']), + ('Ь', &['ь']), + ('Э', &['э']), + ('Ю', &['ю']), + ('Я', &['я']), + ('а', &['А']), + ('б', &['Б']), + ('в', &['В', 'ᲀ']), + ('г', &['Г']), + ('д', &['Д', 'ᲁ']), + ('е', &['Е']), + ('ж', &['Ж']), + ('з', &['З']), + ('и', &['И']), + ('й', &['Й']), + ('к', &['К']), + ('л', &['Л']), + ('м', &['М']), + ('н', &['Н']), + ('о', &['О', 'ᲂ']), + ('п', &['П']), + ('р', &['Р']), + ('с', &['С', 'ᲃ']), + ('т', &['Т', 'ᲄ', 'ᲅ']), + ('у', &['У']), + ('ф', &['Ф']), + ('х', &['Х']), + ('ц', &['Ц']), + ('ч', &['Ч']), + ('ш', &['Ш']), + ('щ', &['Щ']), + ('ъ', &['Ъ', 'ᲆ']), + ('ы', &['Ы']), + ('ь', &['Ь']), + ('э', &['Э']), + ('ю', &['Ю']), + ('я', &['Я']), + ('ѐ', &['Ѐ']), + ('ё', &['Ё']), + ('ђ', &['Ђ']), + ('ѓ', &['Ѓ']), + ('є', &['Є']), + ('ѕ', &['Ѕ']), + ('і', &['І']), + ('ї', &['Ї']), + ('ј', &['Ј']), + ('љ', &['Љ']), + ('њ', &['Њ']), + ('ћ', &['Ћ']), + ('ќ', &['Ќ']), + ('ѝ', &['Ѝ']), + ('ў', &['Ў']), + ('џ', &['Џ']), + ('Ѡ', &['ѡ']), + ('ѡ', &['Ѡ']), + ('Ѣ', &['ѣ', 'ᲇ']), + ('ѣ', &['Ѣ', 'ᲇ']), + ('Ѥ', &['ѥ']), + ('ѥ', &['Ѥ']), + ('Ѧ', &['ѧ']), + ('ѧ', &['Ѧ']), + ('Ѩ', &['ѩ']), + ('ѩ', &['Ѩ']), + ('Ѫ', &['ѫ']), + ('ѫ', &['Ѫ']), + ('Ѭ', &['ѭ']), + ('ѭ', &['Ѭ']), + ('Ѯ', &['ѯ']), + ('ѯ', &['Ѯ']), + ('Ѱ', &['ѱ']), + ('ѱ', &['Ѱ']), + ('Ѳ', &['ѳ']), + ('ѳ', &['Ѳ']), + ('Ѵ', &['ѵ']), + ('ѵ', &['Ѵ']), + ('Ѷ', &['ѷ']), + ('ѷ', &['Ѷ']), + ('Ѹ', &['ѹ']), + ('ѹ', &['Ѹ']), + ('Ѻ', &['ѻ']), + ('ѻ', &['Ѻ']), + ('Ѽ', &['ѽ']), + ('ѽ', &['Ѽ']), + ('Ѿ', &['ѿ']), + ('ѿ', &['Ѿ']), + ('Ҁ', &['ҁ']), + ('ҁ', &['Ҁ']), + ('Ҋ', &['ҋ']), + ('ҋ', &['Ҋ']), + ('Ҍ', &['ҍ']), + ('ҍ', &['Ҍ']), + ('Ҏ', &['ҏ']), + ('ҏ', &['Ҏ']), + ('Ґ', &['ґ']), + ('ґ', &['Ґ']), + ('Ғ', &['ғ']), + ('ғ', &['Ғ']), + ('Ҕ', &['ҕ']), + ('ҕ', &['Ҕ']), + ('Җ', &['җ']), + ('җ', &['Җ']), + ('Ҙ', &['ҙ']), + ('ҙ', &['Ҙ']), + ('Қ', &['қ']), + ('қ', &['Қ']), + ('Ҝ', &['ҝ']), + ('ҝ', &['Ҝ']), + ('Ҟ', &['ҟ']), + ('ҟ', &['Ҟ']), + ('Ҡ', &['ҡ']), + ('ҡ', &['Ҡ']), + ('Ң', &['ң']), + ('ң', &['Ң']), + ('Ҥ', &['ҥ']), + ('ҥ', &['Ҥ']), + ('Ҧ', &['ҧ']), + ('ҧ', &['Ҧ']), + ('Ҩ', &['ҩ']), + ('ҩ', &['Ҩ']), + ('Ҫ', &['ҫ']), + ('ҫ', &['Ҫ']), + ('Ҭ', &['ҭ']), + ('ҭ', &['Ҭ']), + ('Ү', &['ү']), + ('ү', &['Ү']), + ('Ұ', &['ұ']), + ('ұ', &['Ұ']), + ('Ҳ', &['ҳ']), + ('ҳ', &['Ҳ']), + ('Ҵ', &['ҵ']), + ('ҵ', &['Ҵ']), + ('Ҷ', &['ҷ']), + ('ҷ', &['Ҷ']), + ('Ҹ', &['ҹ']), + ('ҹ', &['Ҹ']), + ('Һ', &['һ']), + ('һ', &['Һ']), + ('Ҽ', &['ҽ']), + ('ҽ', &['Ҽ']), + ('Ҿ', &['ҿ']), + ('ҿ', &['Ҿ']), + ('Ӏ', &['ӏ']), + ('Ӂ', &['ӂ']), + ('ӂ', &['Ӂ']), + ('Ӄ', &['ӄ']), + ('ӄ', &['Ӄ']), + ('Ӆ', &['ӆ']), + ('ӆ', &['Ӆ']), + ('Ӈ', &['ӈ']), + ('ӈ', &['Ӈ']), + ('Ӊ', &['ӊ']), + ('ӊ', &['Ӊ']), + ('Ӌ', &['ӌ']), + ('ӌ', &['Ӌ']), + ('Ӎ', &['ӎ']), + ('ӎ', &['Ӎ']), + ('ӏ', &['Ӏ']), + ('Ӑ', &['ӑ']), + ('ӑ', &['Ӑ']), + ('Ӓ', &['ӓ']), + ('ӓ', &['Ӓ']), + ('Ӕ', &['ӕ']), + ('ӕ', &['Ӕ']), + ('Ӗ', &['ӗ']), + ('ӗ', &['Ӗ']), + ('Ә', &['ә']), + ('ә', &['Ә']), + ('Ӛ', &['ӛ']), + ('ӛ', &['Ӛ']), + ('Ӝ', &['ӝ']), + ('ӝ', &['Ӝ']), + ('Ӟ', &['ӟ']), + ('ӟ', &['Ӟ']), + ('Ӡ', &['ӡ']), + ('ӡ', &['Ӡ']), + ('Ӣ', &['ӣ']), + ('ӣ', &['Ӣ']), + ('Ӥ', &['ӥ']), + ('ӥ', &['Ӥ']), + ('Ӧ', &['ӧ']), + ('ӧ', &['Ӧ']), + ('Ө', &['ө']), + ('ө', &['Ө']), + ('Ӫ', &['ӫ']), + ('ӫ', &['Ӫ']), + ('Ӭ', &['ӭ']), + ('ӭ', &['Ӭ']), + ('Ӯ', &['ӯ']), + ('ӯ', &['Ӯ']), + ('Ӱ', &['ӱ']), + ('ӱ', &['Ӱ']), + ('Ӳ', &['ӳ']), + ('ӳ', &['Ӳ']), + ('Ӵ', &['ӵ']), + ('ӵ', &['Ӵ']), + ('Ӷ', &['ӷ']), + ('ӷ', &['Ӷ']), + ('Ӹ', &['ӹ']), + ('ӹ', &['Ӹ']), + ('Ӻ', &['ӻ']), + ('ӻ', &['Ӻ']), + ('Ӽ', &['ӽ']), + ('ӽ', &['Ӽ']), + ('Ӿ', &['ӿ']), + ('ӿ', &['Ӿ']), + ('Ԁ', &['ԁ']), + ('ԁ', &['Ԁ']), + ('Ԃ', &['ԃ']), + ('ԃ', &['Ԃ']), + ('Ԅ', &['ԅ']), + ('ԅ', &['Ԅ']), + ('Ԇ', &['ԇ']), + ('ԇ', &['Ԇ']), + ('Ԉ', &['ԉ']), + ('ԉ', &['Ԉ']), + ('Ԋ', &['ԋ']), + ('ԋ', &['Ԋ']), + ('Ԍ', &['ԍ']), + ('ԍ', &['Ԍ']), + ('Ԏ', &['ԏ']), + ('ԏ', &['Ԏ']), + ('Ԑ', &['ԑ']), + ('ԑ', &['Ԑ']), + ('Ԓ', &['ԓ']), + ('ԓ', &['Ԓ']), + ('Ԕ', &['ԕ']), + ('ԕ', &['Ԕ']), + ('Ԗ', &['ԗ']), + ('ԗ', &['Ԗ']), + ('Ԙ', &['ԙ']), + ('ԙ', &['Ԙ']), + ('Ԛ', &['ԛ']), + ('ԛ', &['Ԛ']), + ('Ԝ', &['ԝ']), + ('ԝ', &['Ԝ']), + ('Ԟ', &['ԟ']), + ('ԟ', &['Ԟ']), + ('Ԡ', &['ԡ']), + ('ԡ', &['Ԡ']), + ('Ԣ', &['ԣ']), + ('ԣ', &['Ԣ']), + ('Ԥ', &['ԥ']), + ('ԥ', &['Ԥ']), + ('Ԧ', &['ԧ']), + ('ԧ', &['Ԧ']), + ('Ԩ', &['ԩ']), + ('ԩ', &['Ԩ']), + ('Ԫ', &['ԫ']), + ('ԫ', &['Ԫ']), + ('Ԭ', &['ԭ']), + ('ԭ', &['Ԭ']), + ('Ԯ', &['ԯ']), + ('ԯ', &['Ԯ']), + ('Ա', &['ա']), + ('Բ', &['բ']), + ('Գ', &['գ']), + ('Դ', &['դ']), + ('Ե', &['ե']), + ('Զ', &['զ']), + ('Է', &['է']), + ('Ը', &['ը']), + ('Թ', &['թ']), + ('Ժ', &['ժ']), + ('Ի', &['ի']), + ('Լ', &['լ']), + ('Խ', &['խ']), + ('Ծ', &['ծ']), + ('Կ', &['կ']), + ('Հ', &['հ']), + ('Ձ', &['ձ']), + ('Ղ', &['ղ']), + ('Ճ', &['ճ']), + ('Մ', &['մ']), + ('Յ', &['յ']), + ('Ն', &['ն']), + ('Շ', &['շ']), + ('Ո', &['ո']), + ('Չ', &['չ']), + ('Պ', &['պ']), + ('Ջ', &['ջ']), + ('Ռ', &['ռ']), + ('Ս', &['ս']), + ('Վ', &['վ']), + ('Տ', &['տ']), + ('Ր', &['ր']), + ('Ց', &['ց']), + ('Ւ', &['ւ']), + ('Փ', &['փ']), + ('Ք', &['ք']), + ('Օ', &['օ']), + ('Ֆ', &['ֆ']), + ('ա', &['Ա']), + ('բ', &['Բ']), + ('գ', &['Գ']), + ('դ', &['Դ']), + ('ե', &['Ե']), + ('զ', &['Զ']), + ('է', &['Է']), + ('ը', &['Ը']), + ('թ', &['Թ']), + ('ժ', &['Ժ']), + ('ի', &['Ի']), + ('լ', &['Լ']), + ('խ', &['Խ']), + ('ծ', &['Ծ']), + ('կ', &['Կ']), + ('հ', &['Հ']), + ('ձ', &['Ձ']), + ('ղ', &['Ղ']), + ('ճ', &['Ճ']), + ('մ', &['Մ']), + ('յ', &['Յ']), + ('ն', &['Ն']), + ('շ', &['Շ']), + ('ո', &['Ո']), + ('չ', &['Չ']), + ('պ', &['Պ']), + ('ջ', &['Ջ']), + ('ռ', &['Ռ']), + ('ս', &['Ս']), + ('վ', &['Վ']), + ('տ', &['Տ']), + ('ր', &['Ր']), + ('ց', &['Ց']), + ('ւ', &['Ւ']), + ('փ', &['Փ']), + ('ք', &['Ք']), + ('օ', &['Օ']), + ('ֆ', &['Ֆ']), + ('Ⴀ', &['ⴀ']), + ('Ⴁ', &['ⴁ']), + ('Ⴂ', &['ⴂ']), + ('Ⴃ', &['ⴃ']), + ('Ⴄ', &['ⴄ']), + ('Ⴅ', &['ⴅ']), + ('Ⴆ', &['ⴆ']), + ('Ⴇ', &['ⴇ']), + ('Ⴈ', &['ⴈ']), + ('Ⴉ', &['ⴉ']), + ('Ⴊ', &['ⴊ']), + ('Ⴋ', &['ⴋ']), + ('Ⴌ', &['ⴌ']), + ('Ⴍ', &['ⴍ']), + ('Ⴎ', &['ⴎ']), + ('Ⴏ', &['ⴏ']), + ('Ⴐ', &['ⴐ']), + ('Ⴑ', &['ⴑ']), + ('Ⴒ', &['ⴒ']), + ('Ⴓ', &['ⴓ']), + ('Ⴔ', &['ⴔ']), + ('Ⴕ', &['ⴕ']), + ('Ⴖ', &['ⴖ']), + ('Ⴗ', &['ⴗ']), + ('Ⴘ', &['ⴘ']), + ('Ⴙ', &['ⴙ']), + ('Ⴚ', &['ⴚ']), + ('Ⴛ', &['ⴛ']), + ('Ⴜ', &['ⴜ']), + ('Ⴝ', &['ⴝ']), + ('Ⴞ', &['ⴞ']), + ('Ⴟ', &['ⴟ']), + ('Ⴠ', &['ⴠ']), + ('Ⴡ', &['ⴡ']), + ('Ⴢ', &['ⴢ']), + ('Ⴣ', &['ⴣ']), + ('Ⴤ', &['ⴤ']), + ('Ⴥ', &['ⴥ']), + ('Ⴧ', &['ⴧ']), + ('Ⴭ', &['ⴭ']), + ('ა', &['Ა']), + ('ბ', &['Ბ']), + ('გ', &['Გ']), + ('დ', &['Დ']), + ('ე', &['Ე']), + ('ვ', &['Ვ']), + ('ზ', &['Ზ']), + ('თ', &['Თ']), + ('ი', &['Ი']), + ('კ', &['Კ']), + ('ლ', &['Ლ']), + ('მ', &['Მ']), + ('ნ', &['Ნ']), + ('ო', &['Ო']), + ('პ', &['Პ']), + ('ჟ', &['Ჟ']), + ('რ', &['Რ']), + ('ს', &['Ს']), + ('ტ', &['Ტ']), + ('უ', &['Უ']), + ('ფ', &['Ფ']), + ('ქ', &['Ქ']), + ('ღ', &['Ღ']), + ('ყ', &['Ყ']), + ('შ', &['Შ']), + ('ჩ', &['Ჩ']), + ('ც', &['Ც']), + ('ძ', &['Ძ']), + ('წ', &['Წ']), + ('ჭ', &['Ჭ']), + ('ხ', &['Ხ']), + ('ჯ', &['Ჯ']), + ('ჰ', &['Ჰ']), + ('ჱ', &['Ჱ']), + ('ჲ', &['Ჲ']), + ('ჳ', &['Ჳ']), + ('ჴ', &['Ჴ']), + ('ჵ', &['Ჵ']), + ('ჶ', &['Ჶ']), + ('ჷ', &['Ჷ']), + ('ჸ', &['Ჸ']), + ('ჹ', &['Ჹ']), + ('ჺ', &['Ჺ']), + ('ჽ', &['Ჽ']), + ('ჾ', &['Ჾ']), + ('ჿ', &['Ჿ']), + ('Ꭰ', &['ꭰ']), + ('Ꭱ', &['ꭱ']), + ('Ꭲ', &['ꭲ']), + ('Ꭳ', &['ꭳ']), + ('Ꭴ', &['ꭴ']), + ('Ꭵ', &['ꭵ']), + ('Ꭶ', &['ꭶ']), + ('Ꭷ', &['ꭷ']), + ('Ꭸ', &['ꭸ']), + ('Ꭹ', &['ꭹ']), + ('Ꭺ', &['ꭺ']), + ('Ꭻ', &['ꭻ']), + ('Ꭼ', &['ꭼ']), + ('Ꭽ', &['ꭽ']), + ('Ꭾ', &['ꭾ']), + ('Ꭿ', &['ꭿ']), + ('Ꮀ', &['ꮀ']), + ('Ꮁ', &['ꮁ']), + ('Ꮂ', &['ꮂ']), + ('Ꮃ', &['ꮃ']), + ('Ꮄ', &['ꮄ']), + ('Ꮅ', &['ꮅ']), + ('Ꮆ', &['ꮆ']), + ('Ꮇ', &['ꮇ']), + ('Ꮈ', &['ꮈ']), + ('Ꮉ', &['ꮉ']), + ('Ꮊ', &['ꮊ']), + ('Ꮋ', &['ꮋ']), + ('Ꮌ', &['ꮌ']), + ('Ꮍ', &['ꮍ']), + ('Ꮎ', &['ꮎ']), + ('Ꮏ', &['ꮏ']), + ('Ꮐ', &['ꮐ']), + ('Ꮑ', &['ꮑ']), + ('Ꮒ', &['ꮒ']), + ('Ꮓ', &['ꮓ']), + ('Ꮔ', &['ꮔ']), + ('Ꮕ', &['ꮕ']), + ('Ꮖ', &['ꮖ']), + ('Ꮗ', &['ꮗ']), + ('Ꮘ', &['ꮘ']), + ('Ꮙ', &['ꮙ']), + ('Ꮚ', &['ꮚ']), + ('Ꮛ', &['ꮛ']), + ('Ꮜ', &['ꮜ']), + ('Ꮝ', &['ꮝ']), + ('Ꮞ', &['ꮞ']), + ('Ꮟ', &['ꮟ']), + ('Ꮠ', &['ꮠ']), + ('Ꮡ', &['ꮡ']), + ('Ꮢ', &['ꮢ']), + ('Ꮣ', &['ꮣ']), + ('Ꮤ', &['ꮤ']), + ('Ꮥ', &['ꮥ']), + ('Ꮦ', &['ꮦ']), + ('Ꮧ', &['ꮧ']), + ('Ꮨ', &['ꮨ']), + ('Ꮩ', &['ꮩ']), + ('Ꮪ', &['ꮪ']), + ('Ꮫ', &['ꮫ']), + ('Ꮬ', &['ꮬ']), + ('Ꮭ', &['ꮭ']), + ('Ꮮ', &['ꮮ']), + ('Ꮯ', &['ꮯ']), + ('Ꮰ', &['ꮰ']), + ('Ꮱ', &['ꮱ']), + ('Ꮲ', &['ꮲ']), + ('Ꮳ', &['ꮳ']), + ('Ꮴ', &['ꮴ']), + ('Ꮵ', &['ꮵ']), + ('Ꮶ', &['ꮶ']), + ('Ꮷ', &['ꮷ']), + ('Ꮸ', &['ꮸ']), + ('Ꮹ', &['ꮹ']), + ('Ꮺ', &['ꮺ']), + ('Ꮻ', &['ꮻ']), + ('Ꮼ', &['ꮼ']), + ('Ꮽ', &['ꮽ']), + ('Ꮾ', &['ꮾ']), + ('Ꮿ', &['ꮿ']), + ('Ᏸ', &['ᏸ']), + ('Ᏹ', &['ᏹ']), + ('Ᏺ', &['ᏺ']), + ('Ᏻ', &['ᏻ']), + ('Ᏼ', &['ᏼ']), + ('Ᏽ', &['ᏽ']), + ('ᏸ', &['Ᏸ']), + ('ᏹ', &['Ᏹ']), + ('ᏺ', &['Ᏺ']), + ('ᏻ', &['Ᏻ']), + ('ᏼ', &['Ᏼ']), + ('ᏽ', &['Ᏽ']), + ('ᲀ', &['В', 'в']), + ('ᲁ', &['Д', 'д']), + ('ᲂ', &['О', 'о']), + ('ᲃ', &['С', 'с']), + ('ᲄ', &['Т', 'т', 'ᲅ']), + ('ᲅ', &['Т', 'т', 'ᲄ']), + ('ᲆ', &['Ъ', 'ъ']), + ('ᲇ', &['Ѣ', 'ѣ']), + ('ᲈ', &['Ꙋ', 'ꙋ']), + ('Ა', &['ა']), + ('Ბ', &['ბ']), + ('Გ', &['გ']), + ('Დ', &['დ']), + ('Ე', &['ე']), + ('Ვ', &['ვ']), + ('Ზ', &['ზ']), + ('Თ', &['თ']), + ('Ი', &['ი']), + ('Კ', &['კ']), + ('Ლ', &['ლ']), + ('Მ', &['მ']), + ('Ნ', &['ნ']), + ('Ო', &['ო']), + ('Პ', &['პ']), + ('Ჟ', &['ჟ']), + ('Რ', &['რ']), + ('Ს', &['ს']), + ('Ტ', &['ტ']), + ('Უ', &['უ']), + ('Ფ', &['ფ']), + ('Ქ', &['ქ']), + ('Ღ', &['ღ']), + ('Ყ', &['ყ']), + ('Შ', &['შ']), + ('Ჩ', &['ჩ']), + ('Ც', &['ც']), + ('Ძ', &['ძ']), + ('Წ', &['წ']), + ('Ჭ', &['ჭ']), + ('Ხ', &['ხ']), + ('Ჯ', &['ჯ']), + ('Ჰ', &['ჰ']), + ('Ჱ', &['ჱ']), + ('Ჲ', &['ჲ']), + ('Ჳ', &['ჳ']), + ('Ჴ', &['ჴ']), + ('Ჵ', &['ჵ']), + ('Ჶ', &['ჶ']), + ('Ჷ', &['ჷ']), + ('Ჸ', &['ჸ']), + ('Ჹ', &['ჹ']), + ('Ჺ', &['ჺ']), + ('Ჽ', &['ჽ']), + ('Ჾ', &['ჾ']), + ('Ჿ', &['ჿ']), + ('ᵹ', &['Ᵹ']), + ('ᵽ', &['Ᵽ']), + ('ᶎ', &['Ᶎ']), + ('Ḁ', &['ḁ']), + ('ḁ', &['Ḁ']), + ('Ḃ', &['ḃ']), + ('ḃ', &['Ḃ']), + ('Ḅ', &['ḅ']), + ('ḅ', &['Ḅ']), + ('Ḇ', &['ḇ']), + ('ḇ', &['Ḇ']), + ('Ḉ', &['ḉ']), + ('ḉ', &['Ḉ']), + ('Ḋ', &['ḋ']), + ('ḋ', &['Ḋ']), + ('Ḍ', &['ḍ']), + ('ḍ', &['Ḍ']), + ('Ḏ', &['ḏ']), + ('ḏ', &['Ḏ']), + ('Ḑ', &['ḑ']), + ('ḑ', &['Ḑ']), + ('Ḓ', &['ḓ']), + ('ḓ', &['Ḓ']), + ('Ḕ', &['ḕ']), + ('ḕ', &['Ḕ']), + ('Ḗ', &['ḗ']), + ('ḗ', &['Ḗ']), + ('Ḙ', &['ḙ']), + ('ḙ', &['Ḙ']), + ('Ḛ', &['ḛ']), + ('ḛ', &['Ḛ']), + ('Ḝ', &['ḝ']), + ('ḝ', &['Ḝ']), + ('Ḟ', &['ḟ']), + ('ḟ', &['Ḟ']), + ('Ḡ', &['ḡ']), + ('ḡ', &['Ḡ']), + ('Ḣ', &['ḣ']), + ('ḣ', &['Ḣ']), + ('Ḥ', &['ḥ']), + ('ḥ', &['Ḥ']), + ('Ḧ', &['ḧ']), + ('ḧ', &['Ḧ']), + ('Ḩ', &['ḩ']), + ('ḩ', &['Ḩ']), + ('Ḫ', &['ḫ']), + ('ḫ', &['Ḫ']), + ('Ḭ', &['ḭ']), + ('ḭ', &['Ḭ']), + ('Ḯ', &['ḯ']), + ('ḯ', &['Ḯ']), + ('Ḱ', &['ḱ']), + ('ḱ', &['Ḱ']), + ('Ḳ', &['ḳ']), + ('ḳ', &['Ḳ']), + ('Ḵ', &['ḵ']), + ('ḵ', &['Ḵ']), + ('Ḷ', &['ḷ']), + ('ḷ', &['Ḷ']), + ('Ḹ', &['ḹ']), + ('ḹ', &['Ḹ']), + ('Ḻ', &['ḻ']), + ('ḻ', &['Ḻ']), + ('Ḽ', &['ḽ']), + ('ḽ', &['Ḽ']), + ('Ḿ', &['ḿ']), + ('ḿ', &['Ḿ']), + ('Ṁ', &['ṁ']), + ('ṁ', &['Ṁ']), + ('Ṃ', &['ṃ']), + ('ṃ', &['Ṃ']), + ('Ṅ', &['ṅ']), + ('ṅ', &['Ṅ']), + ('Ṇ', &['ṇ']), + ('ṇ', &['Ṇ']), + ('Ṉ', &['ṉ']), + ('ṉ', &['Ṉ']), + ('Ṋ', &['ṋ']), + ('ṋ', &['Ṋ']), + ('Ṍ', &['ṍ']), + ('ṍ', &['Ṍ']), + ('Ṏ', &['ṏ']), + ('ṏ', &['Ṏ']), + ('Ṑ', &['ṑ']), + ('ṑ', &['Ṑ']), + ('Ṓ', &['ṓ']), + ('ṓ', &['Ṓ']), + ('Ṕ', &['ṕ']), + ('ṕ', &['Ṕ']), + ('Ṗ', &['ṗ']), + ('ṗ', &['Ṗ']), + ('Ṙ', &['ṙ']), + ('ṙ', &['Ṙ']), + ('Ṛ', &['ṛ']), + ('ṛ', &['Ṛ']), + ('Ṝ', &['ṝ']), + ('ṝ', &['Ṝ']), + ('Ṟ', &['ṟ']), + ('ṟ', &['Ṟ']), + ('Ṡ', &['ṡ', 'ẛ']), + ('ṡ', &['Ṡ', 'ẛ']), + ('Ṣ', &['ṣ']), + ('ṣ', &['Ṣ']), + ('Ṥ', &['ṥ']), + ('ṥ', &['Ṥ']), + ('Ṧ', &['ṧ']), + ('ṧ', &['Ṧ']), + ('Ṩ', &['ṩ']), + ('ṩ', &['Ṩ']), + ('Ṫ', &['ṫ']), + ('ṫ', &['Ṫ']), + ('Ṭ', &['ṭ']), + ('ṭ', &['Ṭ']), + ('Ṯ', &['ṯ']), + ('ṯ', &['Ṯ']), + ('Ṱ', &['ṱ']), + ('ṱ', &['Ṱ']), + ('Ṳ', &['ṳ']), + ('ṳ', &['Ṳ']), + ('Ṵ', &['ṵ']), + ('ṵ', &['Ṵ']), + ('Ṷ', &['ṷ']), + ('ṷ', &['Ṷ']), + ('Ṹ', &['ṹ']), + ('ṹ', &['Ṹ']), + ('Ṻ', &['ṻ']), + ('ṻ', &['Ṻ']), + ('Ṽ', &['ṽ']), + ('ṽ', &['Ṽ']), + ('Ṿ', &['ṿ']), + ('ṿ', &['Ṿ']), + ('Ẁ', &['ẁ']), + ('ẁ', &['Ẁ']), + ('Ẃ', &['ẃ']), + ('ẃ', &['Ẃ']), + ('Ẅ', &['ẅ']), + ('ẅ', &['Ẅ']), + ('Ẇ', &['ẇ']), + ('ẇ', &['Ẇ']), + ('Ẉ', &['ẉ']), + ('ẉ', &['Ẉ']), + ('Ẋ', &['ẋ']), + ('ẋ', &['Ẋ']), + ('Ẍ', &['ẍ']), + ('ẍ', &['Ẍ']), + ('Ẏ', &['ẏ']), + ('ẏ', &['Ẏ']), + ('Ẑ', &['ẑ']), + ('ẑ', &['Ẑ']), + ('Ẓ', &['ẓ']), + ('ẓ', &['Ẓ']), + ('Ẕ', &['ẕ']), + ('ẕ', &['Ẕ']), + ('ẛ', &['Ṡ', 'ṡ']), + ('ẞ', &['ß']), + ('Ạ', &['ạ']), + ('ạ', &['Ạ']), + ('Ả', &['ả']), + ('ả', &['Ả']), + ('Ấ', &['ấ']), + ('ấ', &['Ấ']), + ('Ầ', &['ầ']), + ('ầ', &['Ầ']), + ('Ẩ', &['ẩ']), + ('ẩ', &['Ẩ']), + ('Ẫ', &['ẫ']), + ('ẫ', &['Ẫ']), + ('Ậ', &['ậ']), + ('ậ', &['Ậ']), + ('Ắ', &['ắ']), + ('ắ', &['Ắ']), + ('Ằ', &['ằ']), + ('ằ', &['Ằ']), + ('Ẳ', &['ẳ']), + ('ẳ', &['Ẳ']), + ('Ẵ', &['ẵ']), + ('ẵ', &['Ẵ']), + ('Ặ', &['ặ']), + ('ặ', &['Ặ']), + ('Ẹ', &['ẹ']), + ('ẹ', &['Ẹ']), + ('Ẻ', &['ẻ']), + ('ẻ', &['Ẻ']), + ('Ẽ', &['ẽ']), + ('ẽ', &['Ẽ']), + ('Ế', &['ế']), + ('ế', &['Ế']), + ('Ề', &['ề']), + ('ề', &['Ề']), + ('Ể', &['ể']), + ('ể', &['Ể']), + ('Ễ', &['ễ']), + ('ễ', &['Ễ']), + ('Ệ', &['ệ']), + ('ệ', &['Ệ']), + ('Ỉ', &['ỉ']), + ('ỉ', &['Ỉ']), + ('Ị', &['ị']), + ('ị', &['Ị']), + ('Ọ', &['ọ']), + ('ọ', &['Ọ']), + ('Ỏ', &['ỏ']), + ('ỏ', &['Ỏ']), + ('Ố', &['ố']), + ('ố', &['Ố']), + ('Ồ', &['ồ']), + ('ồ', &['Ồ']), + ('Ổ', &['ổ']), + ('ổ', &['Ổ']), + ('Ỗ', &['ỗ']), + ('ỗ', &['Ỗ']), + ('Ộ', &['ộ']), + ('ộ', &['Ộ']), + ('Ớ', &['ớ']), + ('ớ', &['Ớ']), + ('Ờ', &['ờ']), + ('ờ', &['Ờ']), + ('Ở', &['ở']), + ('ở', &['Ở']), + ('Ỡ', &['ỡ']), + ('ỡ', &['Ỡ']), + ('Ợ', &['ợ']), + ('ợ', &['Ợ']), + ('Ụ', &['ụ']), + ('ụ', &['Ụ']), + ('Ủ', &['ủ']), + ('ủ', &['Ủ']), + ('Ứ', &['ứ']), + ('ứ', &['Ứ']), + ('Ừ', &['ừ']), + ('ừ', &['Ừ']), + ('Ử', &['ử']), + ('ử', &['Ử']), + ('Ữ', &['ữ']), + ('ữ', &['Ữ']), + ('Ự', &['ự']), + ('ự', &['Ự']), + ('Ỳ', &['ỳ']), + ('ỳ', &['Ỳ']), + ('Ỵ', &['ỵ']), + ('ỵ', &['Ỵ']), + ('Ỷ', &['ỷ']), + ('ỷ', &['Ỷ']), + ('Ỹ', &['ỹ']), + ('ỹ', &['Ỹ']), + ('Ỻ', &['ỻ']), + ('ỻ', &['Ỻ']), + ('Ỽ', &['ỽ']), + ('ỽ', &['Ỽ']), + ('Ỿ', &['ỿ']), + ('ỿ', &['Ỿ']), + ('ἀ', &['Ἀ']), + ('ἁ', &['Ἁ']), + ('ἂ', &['Ἂ']), + ('ἃ', &['Ἃ']), + ('ἄ', &['Ἄ']), + ('ἅ', &['Ἅ']), + ('ἆ', &['Ἆ']), + ('ἇ', &['Ἇ']), + ('Ἀ', &['ἀ']), + ('Ἁ', &['ἁ']), + ('Ἂ', &['ἂ']), + ('Ἃ', &['ἃ']), + ('Ἄ', &['ἄ']), + ('Ἅ', &['ἅ']), + ('Ἆ', &['ἆ']), + ('Ἇ', &['ἇ']), + ('ἐ', &['Ἐ']), + ('ἑ', &['Ἑ']), + ('ἒ', &['Ἒ']), + ('ἓ', &['Ἓ']), + ('ἔ', &['Ἔ']), + ('ἕ', &['Ἕ']), + ('Ἐ', &['ἐ']), + ('Ἑ', &['ἑ']), + ('Ἒ', &['ἒ']), + ('Ἓ', &['ἓ']), + ('Ἔ', &['ἔ']), + ('Ἕ', &['ἕ']), + ('ἠ', &['Ἠ']), + ('ἡ', &['Ἡ']), + ('ἢ', &['Ἢ']), + ('ἣ', &['Ἣ']), + ('ἤ', &['Ἤ']), + ('ἥ', &['Ἥ']), + ('ἦ', &['Ἦ']), + ('ἧ', &['Ἧ']), + ('Ἠ', &['ἠ']), + ('Ἡ', &['ἡ']), + ('Ἢ', &['ἢ']), + ('Ἣ', &['ἣ']), + ('Ἤ', &['ἤ']), + ('Ἥ', &['ἥ']), + ('Ἦ', &['ἦ']), + ('Ἧ', &['ἧ']), + ('ἰ', &['Ἰ']), + ('ἱ', &['Ἱ']), + ('ἲ', &['Ἲ']), + ('ἳ', &['Ἳ']), + ('ἴ', &['Ἴ']), + ('ἵ', &['Ἵ']), + ('ἶ', &['Ἶ']), + ('ἷ', &['Ἷ']), + ('Ἰ', &['ἰ']), + ('Ἱ', &['ἱ']), + ('Ἲ', &['ἲ']), + ('Ἳ', &['ἳ']), + ('Ἴ', &['ἴ']), + ('Ἵ', &['ἵ']), + ('Ἶ', &['ἶ']), + ('Ἷ', &['ἷ']), + ('ὀ', &['Ὀ']), + ('ὁ', &['Ὁ']), + ('ὂ', &['Ὂ']), + ('ὃ', &['Ὃ']), + ('ὄ', &['Ὄ']), + ('ὅ', &['Ὅ']), + ('Ὀ', &['ὀ']), + ('Ὁ', &['ὁ']), + ('Ὂ', &['ὂ']), + ('Ὃ', &['ὃ']), + ('Ὄ', &['ὄ']), + ('Ὅ', &['ὅ']), + ('ὑ', &['Ὑ']), + ('ὓ', &['Ὓ']), + ('ὕ', &['Ὕ']), + ('ὗ', &['Ὗ']), + ('Ὑ', &['ὑ']), + ('Ὓ', &['ὓ']), + ('Ὕ', &['ὕ']), + ('Ὗ', &['ὗ']), + ('ὠ', &['Ὠ']), + ('ὡ', &['Ὡ']), + ('ὢ', &['Ὢ']), + ('ὣ', &['Ὣ']), + ('ὤ', &['Ὤ']), + ('ὥ', &['Ὥ']), + ('ὦ', &['Ὦ']), + ('ὧ', &['Ὧ']), + ('Ὠ', &['ὠ']), + ('Ὡ', &['ὡ']), + ('Ὢ', &['ὢ']), + ('Ὣ', &['ὣ']), + ('Ὤ', &['ὤ']), + ('Ὥ', &['ὥ']), + ('Ὦ', &['ὦ']), + ('Ὧ', &['ὧ']), + ('ὰ', &['Ὰ']), + ('ά', &['Ά']), + ('ὲ', &['Ὲ']), + ('έ', &['Έ']), + ('ὴ', &['Ὴ']), + ('ή', &['Ή']), + ('ὶ', &['Ὶ']), + ('ί', &['Ί']), + ('ὸ', &['Ὸ']), + ('ό', &['Ό']), + ('ὺ', &['Ὺ']), + ('ύ', &['Ύ']), + ('ὼ', &['Ὼ']), + ('ώ', &['Ώ']), + ('ᾀ', &['ᾈ']), + ('ᾁ', &['ᾉ']), + ('ᾂ', &['ᾊ']), + ('ᾃ', &['ᾋ']), + ('ᾄ', &['ᾌ']), + ('ᾅ', &['ᾍ']), + ('ᾆ', &['ᾎ']), + ('ᾇ', &['ᾏ']), + ('ᾈ', &['ᾀ']), + ('ᾉ', &['ᾁ']), + ('ᾊ', &['ᾂ']), + ('ᾋ', &['ᾃ']), + ('ᾌ', &['ᾄ']), + ('ᾍ', &['ᾅ']), + ('ᾎ', &['ᾆ']), + ('ᾏ', &['ᾇ']), + ('ᾐ', &['ᾘ']), + ('ᾑ', &['ᾙ']), + ('ᾒ', &['ᾚ']), + ('ᾓ', &['ᾛ']), + ('ᾔ', &['ᾜ']), + ('ᾕ', &['ᾝ']), + ('ᾖ', &['ᾞ']), + ('ᾗ', &['ᾟ']), + ('ᾘ', &['ᾐ']), + ('ᾙ', &['ᾑ']), + ('ᾚ', &['ᾒ']), + ('ᾛ', &['ᾓ']), + ('ᾜ', &['ᾔ']), + ('ᾝ', &['ᾕ']), + ('ᾞ', &['ᾖ']), + ('ᾟ', &['ᾗ']), + ('ᾠ', &['ᾨ']), + ('ᾡ', &['ᾩ']), + ('ᾢ', &['ᾪ']), + ('ᾣ', &['ᾫ']), + ('ᾤ', &['ᾬ']), + ('ᾥ', &['ᾭ']), + ('ᾦ', &['ᾮ']), + ('ᾧ', &['ᾯ']), + ('ᾨ', &['ᾠ']), + ('ᾩ', &['ᾡ']), + ('ᾪ', &['ᾢ']), + ('ᾫ', &['ᾣ']), + ('ᾬ', &['ᾤ']), + ('ᾭ', &['ᾥ']), + ('ᾮ', &['ᾦ']), + ('ᾯ', &['ᾧ']), + ('ᾰ', &['Ᾰ']), + ('ᾱ', &['Ᾱ']), + ('ᾳ', &['ᾼ']), + ('Ᾰ', &['ᾰ']), + ('Ᾱ', &['ᾱ']), + ('Ὰ', &['ὰ']), + ('Ά', &['ά']), + ('ᾼ', &['ᾳ']), + ('ι', &['\u{345}', 'Ι', 'ι']), + ('ῃ', &['ῌ']), + ('Ὲ', &['ὲ']), + ('Έ', &['έ']), + ('Ὴ', &['ὴ']), + ('Ή', &['ή']), + ('ῌ', &['ῃ']), + ('ῐ', &['Ῐ']), + ('ῑ', &['Ῑ']), + ('Ῐ', &['ῐ']), + ('Ῑ', &['ῑ']), + ('Ὶ', &['ὶ']), + ('Ί', &['ί']), + ('ῠ', &['Ῠ']), + ('ῡ', &['Ῡ']), + ('ῥ', &['Ῥ']), + ('Ῠ', &['ῠ']), + ('Ῡ', &['ῡ']), + ('Ὺ', &['ὺ']), + ('Ύ', &['ύ']), + ('Ῥ', &['ῥ']), + ('ῳ', &['ῼ']), + ('Ὸ', &['ὸ']), + ('Ό', &['ό']), + ('Ὼ', &['ὼ']), + ('Ώ', &['ώ']), + ('ῼ', &['ῳ']), + ('Ω', &['Ω', 'ω']), + ('K', &['K', 'k']), + ('Å', &['Å', 'å']), + ('Ⅎ', &['ⅎ']), + ('ⅎ', &['Ⅎ']), + ('Ⅰ', &['ⅰ']), + ('Ⅱ', &['ⅱ']), + ('Ⅲ', &['ⅲ']), + ('Ⅳ', &['ⅳ']), + ('Ⅴ', &['ⅴ']), + ('Ⅵ', &['ⅵ']), + ('Ⅶ', &['ⅶ']), + ('Ⅷ', &['ⅷ']), + ('Ⅸ', &['ⅸ']), + ('Ⅹ', &['ⅹ']), + ('Ⅺ', &['ⅺ']), + ('Ⅻ', &['ⅻ']), + ('Ⅼ', &['ⅼ']), + ('Ⅽ', &['ⅽ']), + ('Ⅾ', &['ⅾ']), + ('Ⅿ', &['ⅿ']), + ('ⅰ', &['Ⅰ']), + ('ⅱ', &['Ⅱ']), + ('ⅲ', &['Ⅲ']), + ('ⅳ', &['Ⅳ']), + ('ⅴ', &['Ⅴ']), + ('ⅵ', &['Ⅵ']), + ('ⅶ', &['Ⅶ']), + ('ⅷ', &['Ⅷ']), + ('ⅸ', &['Ⅸ']), + ('ⅹ', &['Ⅹ']), + ('ⅺ', &['Ⅺ']), + ('ⅻ', &['Ⅻ']), + ('ⅼ', &['Ⅼ']), + ('ⅽ', &['Ⅽ']), + ('ⅾ', &['Ⅾ']), + ('ⅿ', &['Ⅿ']), + ('Ↄ', &['ↄ']), + ('ↄ', &['Ↄ']), + ('Ⓐ', &['ⓐ']), + ('Ⓑ', &['ⓑ']), + ('Ⓒ', &['ⓒ']), + ('Ⓓ', &['ⓓ']), + ('Ⓔ', &['ⓔ']), + ('Ⓕ', &['ⓕ']), + ('Ⓖ', &['ⓖ']), + ('Ⓗ', &['ⓗ']), + ('Ⓘ', &['ⓘ']), + ('Ⓙ', &['ⓙ']), + ('Ⓚ', &['ⓚ']), + ('Ⓛ', &['ⓛ']), + ('Ⓜ', &['ⓜ']), + ('Ⓝ', &['ⓝ']), + ('Ⓞ', &['ⓞ']), + ('Ⓟ', &['ⓟ']), + ('Ⓠ', &['ⓠ']), + ('Ⓡ', &['ⓡ']), + ('Ⓢ', &['ⓢ']), + ('Ⓣ', &['ⓣ']), + ('Ⓤ', &['ⓤ']), + ('Ⓥ', &['ⓥ']), + ('Ⓦ', &['ⓦ']), + ('Ⓧ', &['ⓧ']), + ('Ⓨ', &['ⓨ']), + ('Ⓩ', &['ⓩ']), + ('ⓐ', &['Ⓐ']), + ('ⓑ', &['Ⓑ']), + ('ⓒ', &['Ⓒ']), + ('ⓓ', &['Ⓓ']), + ('ⓔ', &['Ⓔ']), + ('ⓕ', &['Ⓕ']), + ('ⓖ', &['Ⓖ']), + ('ⓗ', &['Ⓗ']), + ('ⓘ', &['Ⓘ']), + ('ⓙ', &['Ⓙ']), + ('ⓚ', &['Ⓚ']), + ('ⓛ', &['Ⓛ']), + ('ⓜ', &['Ⓜ']), + ('ⓝ', &['Ⓝ']), + ('ⓞ', &['Ⓞ']), + ('ⓟ', &['Ⓟ']), + ('ⓠ', &['Ⓠ']), + ('ⓡ', &['Ⓡ']), + ('ⓢ', &['Ⓢ']), + ('ⓣ', &['Ⓣ']), + ('ⓤ', &['Ⓤ']), + ('ⓥ', &['Ⓥ']), + ('ⓦ', &['Ⓦ']), + ('ⓧ', &['Ⓧ']), + ('ⓨ', &['Ⓨ']), + ('ⓩ', &['Ⓩ']), + ('Ⰰ', &['ⰰ']), + ('Ⰱ', &['ⰱ']), + ('Ⰲ', &['ⰲ']), + ('Ⰳ', &['ⰳ']), + ('Ⰴ', &['ⰴ']), + ('Ⰵ', &['ⰵ']), + ('Ⰶ', &['ⰶ']), + ('Ⰷ', &['ⰷ']), + ('Ⰸ', &['ⰸ']), + ('Ⰹ', &['ⰹ']), + ('Ⰺ', &['ⰺ']), + ('Ⰻ', &['ⰻ']), + ('Ⰼ', &['ⰼ']), + ('Ⰽ', &['ⰽ']), + ('Ⰾ', &['ⰾ']), + ('Ⰿ', &['ⰿ']), + ('Ⱀ', &['ⱀ']), + ('Ⱁ', &['ⱁ']), + ('Ⱂ', &['ⱂ']), + ('Ⱃ', &['ⱃ']), + ('Ⱄ', &['ⱄ']), + ('Ⱅ', &['ⱅ']), + ('Ⱆ', &['ⱆ']), + ('Ⱇ', &['ⱇ']), + ('Ⱈ', &['ⱈ']), + ('Ⱉ', &['ⱉ']), + ('Ⱊ', &['ⱊ']), + ('Ⱋ', &['ⱋ']), + ('Ⱌ', &['ⱌ']), + ('Ⱍ', &['ⱍ']), + ('Ⱎ', &['ⱎ']), + ('Ⱏ', &['ⱏ']), + ('Ⱐ', &['ⱐ']), + ('Ⱑ', &['ⱑ']), + ('Ⱒ', &['ⱒ']), + ('Ⱓ', &['ⱓ']), + ('Ⱔ', &['ⱔ']), + ('Ⱕ', &['ⱕ']), + ('Ⱖ', &['ⱖ']), + ('Ⱗ', &['ⱗ']), + ('Ⱘ', &['ⱘ']), + ('Ⱙ', &['ⱙ']), + ('Ⱚ', &['ⱚ']), + ('Ⱛ', &['ⱛ']), + ('Ⱜ', &['ⱜ']), + ('Ⱝ', &['ⱝ']), + ('Ⱞ', &['ⱞ']), + ('Ⱟ', &['ⱟ']), + ('ⰰ', &['Ⰰ']), + ('ⰱ', &['Ⰱ']), + ('ⰲ', &['Ⰲ']), + ('ⰳ', &['Ⰳ']), + ('ⰴ', &['Ⰴ']), + ('ⰵ', &['Ⰵ']), + ('ⰶ', &['Ⰶ']), + ('ⰷ', &['Ⰷ']), + ('ⰸ', &['Ⰸ']), + ('ⰹ', &['Ⰹ']), + ('ⰺ', &['Ⰺ']), + ('ⰻ', &['Ⰻ']), + ('ⰼ', &['Ⰼ']), + ('ⰽ', &['Ⰽ']), + ('ⰾ', &['Ⰾ']), + ('ⰿ', &['Ⰿ']), + ('ⱀ', &['Ⱀ']), + ('ⱁ', &['Ⱁ']), + ('ⱂ', &['Ⱂ']), + ('ⱃ', &['Ⱃ']), + ('ⱄ', &['Ⱄ']), + ('ⱅ', &['Ⱅ']), + ('ⱆ', &['Ⱆ']), + ('ⱇ', &['Ⱇ']), + ('ⱈ', &['Ⱈ']), + ('ⱉ', &['Ⱉ']), + ('ⱊ', &['Ⱊ']), + ('ⱋ', &['Ⱋ']), + ('ⱌ', &['Ⱌ']), + ('ⱍ', &['Ⱍ']), + ('ⱎ', &['Ⱎ']), + ('ⱏ', &['Ⱏ']), + ('ⱐ', &['Ⱐ']), + ('ⱑ', &['Ⱑ']), + ('ⱒ', &['Ⱒ']), + ('ⱓ', &['Ⱓ']), + ('ⱔ', &['Ⱔ']), + ('ⱕ', &['Ⱕ']), + ('ⱖ', &['Ⱖ']), + ('ⱗ', &['Ⱗ']), + ('ⱘ', &['Ⱘ']), + ('ⱙ', &['Ⱙ']), + ('ⱚ', &['Ⱚ']), + ('ⱛ', &['Ⱛ']), + ('ⱜ', &['Ⱜ']), + ('ⱝ', &['Ⱝ']), + ('ⱞ', &['Ⱞ']), + ('ⱟ', &['Ⱟ']), + ('Ⱡ', &['ⱡ']), + ('ⱡ', &['Ⱡ']), + ('Ɫ', &['ɫ']), + ('Ᵽ', &['ᵽ']), + ('Ɽ', &['ɽ']), + ('ⱥ', &['Ⱥ']), + ('ⱦ', &['Ⱦ']), + ('Ⱨ', &['ⱨ']), + ('ⱨ', &['Ⱨ']), + ('Ⱪ', &['ⱪ']), + ('ⱪ', &['Ⱪ']), + ('Ⱬ', &['ⱬ']), + ('ⱬ', &['Ⱬ']), + ('Ɑ', &['ɑ']), + ('Ɱ', &['ɱ']), + ('Ɐ', &['ɐ']), + ('Ɒ', &['ɒ']), + ('Ⱳ', &['ⱳ']), + ('ⱳ', &['Ⱳ']), + ('Ⱶ', &['ⱶ']), + ('ⱶ', &['Ⱶ']), + ('Ȿ', &['ȿ']), + ('Ɀ', &['ɀ']), + ('Ⲁ', &['ⲁ']), + ('ⲁ', &['Ⲁ']), + ('Ⲃ', &['ⲃ']), + ('ⲃ', &['Ⲃ']), + ('Ⲅ', &['ⲅ']), + ('ⲅ', &['Ⲅ']), + ('Ⲇ', &['ⲇ']), + ('ⲇ', &['Ⲇ']), + ('Ⲉ', &['ⲉ']), + ('ⲉ', &['Ⲉ']), + ('Ⲋ', &['ⲋ']), + ('ⲋ', &['Ⲋ']), + ('Ⲍ', &['ⲍ']), + ('ⲍ', &['Ⲍ']), + ('Ⲏ', &['ⲏ']), + ('ⲏ', &['Ⲏ']), + ('Ⲑ', &['ⲑ']), + ('ⲑ', &['Ⲑ']), + ('Ⲓ', &['ⲓ']), + ('ⲓ', &['Ⲓ']), + ('Ⲕ', &['ⲕ']), + ('ⲕ', &['Ⲕ']), + ('Ⲗ', &['ⲗ']), + ('ⲗ', &['Ⲗ']), + ('Ⲙ', &['ⲙ']), + ('ⲙ', &['Ⲙ']), + ('Ⲛ', &['ⲛ']), + ('ⲛ', &['Ⲛ']), + ('Ⲝ', &['ⲝ']), + ('ⲝ', &['Ⲝ']), + ('Ⲟ', &['ⲟ']), + ('ⲟ', &['Ⲟ']), + ('Ⲡ', &['ⲡ']), + ('ⲡ', &['Ⲡ']), + ('Ⲣ', &['ⲣ']), + ('ⲣ', &['Ⲣ']), + ('Ⲥ', &['ⲥ']), + ('ⲥ', &['Ⲥ']), + ('Ⲧ', &['ⲧ']), + ('ⲧ', &['Ⲧ']), + ('Ⲩ', &['ⲩ']), + ('ⲩ', &['Ⲩ']), + ('Ⲫ', &['ⲫ']), + ('ⲫ', &['Ⲫ']), + ('Ⲭ', &['ⲭ']), + ('ⲭ', &['Ⲭ']), + ('Ⲯ', &['ⲯ']), + ('ⲯ', &['Ⲯ']), + ('Ⲱ', &['ⲱ']), + ('ⲱ', &['Ⲱ']), + ('Ⲳ', &['ⲳ']), + ('ⲳ', &['Ⲳ']), + ('Ⲵ', &['ⲵ']), + ('ⲵ', &['Ⲵ']), + ('Ⲷ', &['ⲷ']), + ('ⲷ', &['Ⲷ']), + ('Ⲹ', &['ⲹ']), + ('ⲹ', &['Ⲹ']), + ('Ⲻ', &['ⲻ']), + ('ⲻ', &['Ⲻ']), + ('Ⲽ', &['ⲽ']), + ('ⲽ', &['Ⲽ']), + ('Ⲿ', &['ⲿ']), + ('ⲿ', &['Ⲿ']), + ('Ⳁ', &['ⳁ']), + ('ⳁ', &['Ⳁ']), + ('Ⳃ', &['ⳃ']), + ('ⳃ', &['Ⳃ']), + ('Ⳅ', &['ⳅ']), + ('ⳅ', &['Ⳅ']), + ('Ⳇ', &['ⳇ']), + ('ⳇ', &['Ⳇ']), + ('Ⳉ', &['ⳉ']), + ('ⳉ', &['Ⳉ']), + ('Ⳋ', &['ⳋ']), + ('ⳋ', &['Ⳋ']), + ('Ⳍ', &['ⳍ']), + ('ⳍ', &['Ⳍ']), + ('Ⳏ', &['ⳏ']), + ('ⳏ', &['Ⳏ']), + ('Ⳑ', &['ⳑ']), + ('ⳑ', &['Ⳑ']), + ('Ⳓ', &['ⳓ']), + ('ⳓ', &['Ⳓ']), + ('Ⳕ', &['ⳕ']), + ('ⳕ', &['Ⳕ']), + ('Ⳗ', &['ⳗ']), + ('ⳗ', &['Ⳗ']), + ('Ⳙ', &['ⳙ']), + ('ⳙ', &['Ⳙ']), + ('Ⳛ', &['ⳛ']), + ('ⳛ', &['Ⳛ']), + ('Ⳝ', &['ⳝ']), + ('ⳝ', &['Ⳝ']), + ('Ⳟ', &['ⳟ']), + ('ⳟ', &['Ⳟ']), + ('Ⳡ', &['ⳡ']), + ('ⳡ', &['Ⳡ']), + ('Ⳣ', &['ⳣ']), + ('ⳣ', &['Ⳣ']), + ('Ⳬ', &['ⳬ']), + ('ⳬ', &['Ⳬ']), + ('Ⳮ', &['ⳮ']), + ('ⳮ', &['Ⳮ']), + ('Ⳳ', &['ⳳ']), + ('ⳳ', &['Ⳳ']), + ('ⴀ', &['Ⴀ']), + ('ⴁ', &['Ⴁ']), + ('ⴂ', &['Ⴂ']), + ('ⴃ', &['Ⴃ']), + ('ⴄ', &['Ⴄ']), + ('ⴅ', &['Ⴅ']), + ('ⴆ', &['Ⴆ']), + ('ⴇ', &['Ⴇ']), + ('ⴈ', &['Ⴈ']), + ('ⴉ', &['Ⴉ']), + ('ⴊ', &['Ⴊ']), + ('ⴋ', &['Ⴋ']), + ('ⴌ', &['Ⴌ']), + ('ⴍ', &['Ⴍ']), + ('ⴎ', &['Ⴎ']), + ('ⴏ', &['Ⴏ']), + ('ⴐ', &['Ⴐ']), + ('ⴑ', &['Ⴑ']), + ('ⴒ', &['Ⴒ']), + ('ⴓ', &['Ⴓ']), + ('ⴔ', &['Ⴔ']), + ('ⴕ', &['Ⴕ']), + ('ⴖ', &['Ⴖ']), + ('ⴗ', &['Ⴗ']), + ('ⴘ', &['Ⴘ']), + ('ⴙ', &['Ⴙ']), + ('ⴚ', &['Ⴚ']), + ('ⴛ', &['Ⴛ']), + ('ⴜ', &['Ⴜ']), + ('ⴝ', &['Ⴝ']), + ('ⴞ', &['Ⴞ']), + ('ⴟ', &['Ⴟ']), + ('ⴠ', &['Ⴠ']), + ('ⴡ', &['Ⴡ']), + ('ⴢ', &['Ⴢ']), + ('ⴣ', &['Ⴣ']), + ('ⴤ', &['Ⴤ']), + ('ⴥ', &['Ⴥ']), + ('ⴧ', &['Ⴧ']), + ('ⴭ', &['Ⴭ']), + ('Ꙁ', &['ꙁ']), + ('ꙁ', &['Ꙁ']), + ('Ꙃ', &['ꙃ']), + ('ꙃ', &['Ꙃ']), + ('Ꙅ', &['ꙅ']), + ('ꙅ', &['Ꙅ']), + ('Ꙇ', &['ꙇ']), + ('ꙇ', &['Ꙇ']), + ('Ꙉ', &['ꙉ']), + ('ꙉ', &['Ꙉ']), + ('Ꙋ', &['ᲈ', 'ꙋ']), + ('ꙋ', &['ᲈ', 'Ꙋ']), + ('Ꙍ', &['ꙍ']), + ('ꙍ', &['Ꙍ']), + ('Ꙏ', &['ꙏ']), + ('ꙏ', &['Ꙏ']), + ('Ꙑ', &['ꙑ']), + ('ꙑ', &['Ꙑ']), + ('Ꙓ', &['ꙓ']), + ('ꙓ', &['Ꙓ']), + ('Ꙕ', &['ꙕ']), + ('ꙕ', &['Ꙕ']), + ('Ꙗ', &['ꙗ']), + ('ꙗ', &['Ꙗ']), + ('Ꙙ', &['ꙙ']), + ('ꙙ', &['Ꙙ']), + ('Ꙛ', &['ꙛ']), + ('ꙛ', &['Ꙛ']), + ('Ꙝ', &['ꙝ']), + ('ꙝ', &['Ꙝ']), + ('Ꙟ', &['ꙟ']), + ('ꙟ', &['Ꙟ']), + ('Ꙡ', &['ꙡ']), + ('ꙡ', &['Ꙡ']), + ('Ꙣ', &['ꙣ']), + ('ꙣ', &['Ꙣ']), + ('Ꙥ', &['ꙥ']), + ('ꙥ', &['Ꙥ']), + ('Ꙧ', &['ꙧ']), + ('ꙧ', &['Ꙧ']), + ('Ꙩ', &['ꙩ']), + ('ꙩ', &['Ꙩ']), + ('Ꙫ', &['ꙫ']), + ('ꙫ', &['Ꙫ']), + ('Ꙭ', &['ꙭ']), + ('ꙭ', &['Ꙭ']), + ('Ꚁ', &['ꚁ']), + ('ꚁ', &['Ꚁ']), + ('Ꚃ', &['ꚃ']), + ('ꚃ', &['Ꚃ']), + ('Ꚅ', &['ꚅ']), + ('ꚅ', &['Ꚅ']), + ('Ꚇ', &['ꚇ']), + ('ꚇ', &['Ꚇ']), + ('Ꚉ', &['ꚉ']), + ('ꚉ', &['Ꚉ']), + ('Ꚋ', &['ꚋ']), + ('ꚋ', &['Ꚋ']), + ('Ꚍ', &['ꚍ']), + ('ꚍ', &['Ꚍ']), + ('Ꚏ', &['ꚏ']), + ('ꚏ', &['Ꚏ']), + ('Ꚑ', &['ꚑ']), + ('ꚑ', &['Ꚑ']), + ('Ꚓ', &['ꚓ']), + ('ꚓ', &['Ꚓ']), + ('Ꚕ', &['ꚕ']), + ('ꚕ', &['Ꚕ']), + ('Ꚗ', &['ꚗ']), + ('ꚗ', &['Ꚗ']), + ('Ꚙ', &['ꚙ']), + ('ꚙ', &['Ꚙ']), + ('Ꚛ', &['ꚛ']), + ('ꚛ', &['Ꚛ']), + ('Ꜣ', &['ꜣ']), + ('ꜣ', &['Ꜣ']), + ('Ꜥ', &['ꜥ']), + ('ꜥ', &['Ꜥ']), + ('Ꜧ', &['ꜧ']), + ('ꜧ', &['Ꜧ']), + ('Ꜩ', &['ꜩ']), + ('ꜩ', &['Ꜩ']), + ('Ꜫ', &['ꜫ']), + ('ꜫ', &['Ꜫ']), + ('Ꜭ', &['ꜭ']), + ('ꜭ', &['Ꜭ']), + ('Ꜯ', &['ꜯ']), + ('ꜯ', &['Ꜯ']), + ('Ꜳ', &['ꜳ']), + ('ꜳ', &['Ꜳ']), + ('Ꜵ', &['ꜵ']), + ('ꜵ', &['Ꜵ']), + ('Ꜷ', &['ꜷ']), + ('ꜷ', &['Ꜷ']), + ('Ꜹ', &['ꜹ']), + ('ꜹ', &['Ꜹ']), + ('Ꜻ', &['ꜻ']), + ('ꜻ', &['Ꜻ']), + ('Ꜽ', &['ꜽ']), + ('ꜽ', &['Ꜽ']), + ('Ꜿ', &['ꜿ']), + ('ꜿ', &['Ꜿ']), + ('Ꝁ', &['ꝁ']), + ('ꝁ', &['Ꝁ']), + ('Ꝃ', &['ꝃ']), + ('ꝃ', &['Ꝃ']), + ('Ꝅ', &['ꝅ']), + ('ꝅ', &['Ꝅ']), + ('Ꝇ', &['ꝇ']), + ('ꝇ', &['Ꝇ']), + ('Ꝉ', &['ꝉ']), + ('ꝉ', &['Ꝉ']), + ('Ꝋ', &['ꝋ']), + ('ꝋ', &['Ꝋ']), + ('Ꝍ', &['ꝍ']), + ('ꝍ', &['Ꝍ']), + ('Ꝏ', &['ꝏ']), + ('ꝏ', &['Ꝏ']), + ('Ꝑ', &['ꝑ']), + ('ꝑ', &['Ꝑ']), + ('Ꝓ', &['ꝓ']), + ('ꝓ', &['Ꝓ']), + ('Ꝕ', &['ꝕ']), + ('ꝕ', &['Ꝕ']), + ('Ꝗ', &['ꝗ']), + ('ꝗ', &['Ꝗ']), + ('Ꝙ', &['ꝙ']), + ('ꝙ', &['Ꝙ']), + ('Ꝛ', &['ꝛ']), + ('ꝛ', &['Ꝛ']), + ('Ꝝ', &['ꝝ']), + ('ꝝ', &['Ꝝ']), + ('Ꝟ', &['ꝟ']), + ('ꝟ', &['Ꝟ']), + ('Ꝡ', &['ꝡ']), + ('ꝡ', &['Ꝡ']), + ('Ꝣ', &['ꝣ']), + ('ꝣ', &['Ꝣ']), + ('Ꝥ', &['ꝥ']), + ('ꝥ', &['Ꝥ']), + ('Ꝧ', &['ꝧ']), + ('ꝧ', &['Ꝧ']), + ('Ꝩ', &['ꝩ']), + ('ꝩ', &['Ꝩ']), + ('Ꝫ', &['ꝫ']), + ('ꝫ', &['Ꝫ']), + ('Ꝭ', &['ꝭ']), + ('ꝭ', &['Ꝭ']), + ('Ꝯ', &['ꝯ']), + ('ꝯ', &['Ꝯ']), + ('Ꝺ', &['ꝺ']), + ('ꝺ', &['Ꝺ']), + ('Ꝼ', &['ꝼ']), + ('ꝼ', &['Ꝼ']), + ('Ᵹ', &['ᵹ']), + ('Ꝿ', &['ꝿ']), + ('ꝿ', &['Ꝿ']), + ('Ꞁ', &['ꞁ']), + ('ꞁ', &['Ꞁ']), + ('Ꞃ', &['ꞃ']), + ('ꞃ', &['Ꞃ']), + ('Ꞅ', &['ꞅ']), + ('ꞅ', &['Ꞅ']), + ('Ꞇ', &['ꞇ']), + ('ꞇ', &['Ꞇ']), + ('Ꞌ', &['ꞌ']), + ('ꞌ', &['Ꞌ']), + ('Ɥ', &['ɥ']), + ('Ꞑ', &['ꞑ']), + ('ꞑ', &['Ꞑ']), + ('Ꞓ', &['ꞓ']), + ('ꞓ', &['Ꞓ']), + ('ꞔ', &['Ꞔ']), + ('Ꞗ', &['ꞗ']), + ('ꞗ', &['Ꞗ']), + ('Ꞙ', &['ꞙ']), + ('ꞙ', &['Ꞙ']), + ('Ꞛ', &['ꞛ']), + ('ꞛ', &['Ꞛ']), + ('Ꞝ', &['ꞝ']), + ('ꞝ', &['Ꞝ']), + ('Ꞟ', &['ꞟ']), + ('ꞟ', &['Ꞟ']), + ('Ꞡ', &['ꞡ']), + ('ꞡ', &['Ꞡ']), + ('Ꞣ', &['ꞣ']), + ('ꞣ', &['Ꞣ']), + ('Ꞥ', &['ꞥ']), + ('ꞥ', &['Ꞥ']), + ('Ꞧ', &['ꞧ']), + ('ꞧ', &['Ꞧ']), + ('Ꞩ', &['ꞩ']), + ('ꞩ', &['Ꞩ']), + ('Ɦ', &['ɦ']), + ('Ɜ', &['ɜ']), + ('Ɡ', &['ɡ']), + ('Ɬ', &['ɬ']), + ('Ɪ', &['ɪ']), + ('Ʞ', &['ʞ']), + ('Ʇ', &['ʇ']), + ('Ʝ', &['ʝ']), + ('Ꭓ', &['ꭓ']), + ('Ꞵ', &['ꞵ']), + ('ꞵ', &['Ꞵ']), + ('Ꞷ', &['ꞷ']), + ('ꞷ', &['Ꞷ']), + ('Ꞹ', &['ꞹ']), + ('ꞹ', &['Ꞹ']), + ('Ꞻ', &['ꞻ']), + ('ꞻ', &['Ꞻ']), + ('Ꞽ', &['ꞽ']), + ('ꞽ', &['Ꞽ']), + ('Ꞿ', &['ꞿ']), + ('ꞿ', &['Ꞿ']), + ('Ꟁ', &['ꟁ']), + ('ꟁ', &['Ꟁ']), + ('Ꟃ', &['ꟃ']), + ('ꟃ', &['Ꟃ']), + ('Ꞔ', &['ꞔ']), + ('Ʂ', &['ʂ']), + ('Ᶎ', &['ᶎ']), + ('Ꟈ', &['ꟈ']), + ('ꟈ', &['Ꟈ']), + ('Ꟊ', &['ꟊ']), + ('ꟊ', &['Ꟊ']), + ('Ꟑ', &['ꟑ']), + ('ꟑ', &['Ꟑ']), + ('Ꟗ', &['ꟗ']), + ('ꟗ', &['Ꟗ']), + ('Ꟙ', &['ꟙ']), + ('ꟙ', &['Ꟙ']), + ('Ꟶ', &['ꟶ']), + ('ꟶ', &['Ꟶ']), + ('ꭓ', &['Ꭓ']), + ('ꭰ', &['Ꭰ']), + ('ꭱ', &['Ꭱ']), + ('ꭲ', &['Ꭲ']), + ('ꭳ', &['Ꭳ']), + ('ꭴ', &['Ꭴ']), + ('ꭵ', &['Ꭵ']), + ('ꭶ', &['Ꭶ']), + ('ꭷ', &['Ꭷ']), + ('ꭸ', &['Ꭸ']), + ('ꭹ', &['Ꭹ']), + ('ꭺ', &['Ꭺ']), + ('ꭻ', &['Ꭻ']), + ('ꭼ', &['Ꭼ']), + ('ꭽ', &['Ꭽ']), + ('ꭾ', &['Ꭾ']), + ('ꭿ', &['Ꭿ']), + ('ꮀ', &['Ꮀ']), + ('ꮁ', &['Ꮁ']), + ('ꮂ', &['Ꮂ']), + ('ꮃ', &['Ꮃ']), + ('ꮄ', &['Ꮄ']), + ('ꮅ', &['Ꮅ']), + ('ꮆ', &['Ꮆ']), + ('ꮇ', &['Ꮇ']), + ('ꮈ', &['Ꮈ']), + ('ꮉ', &['Ꮉ']), + ('ꮊ', &['Ꮊ']), + ('ꮋ', &['Ꮋ']), + ('ꮌ', &['Ꮌ']), + ('ꮍ', &['Ꮍ']), + ('ꮎ', &['Ꮎ']), + ('ꮏ', &['Ꮏ']), + ('ꮐ', &['Ꮐ']), + ('ꮑ', &['Ꮑ']), + ('ꮒ', &['Ꮒ']), + ('ꮓ', &['Ꮓ']), + ('ꮔ', &['Ꮔ']), + ('ꮕ', &['Ꮕ']), + ('ꮖ', &['Ꮖ']), + ('ꮗ', &['Ꮗ']), + ('ꮘ', &['Ꮘ']), + ('ꮙ', &['Ꮙ']), + ('ꮚ', &['Ꮚ']), + ('ꮛ', &['Ꮛ']), + ('ꮜ', &['Ꮜ']), + ('ꮝ', &['Ꮝ']), + ('ꮞ', &['Ꮞ']), + ('ꮟ', &['Ꮟ']), + ('ꮠ', &['Ꮠ']), + ('ꮡ', &['Ꮡ']), + ('ꮢ', &['Ꮢ']), + ('ꮣ', &['Ꮣ']), + ('ꮤ', &['Ꮤ']), + ('ꮥ', &['Ꮥ']), + ('ꮦ', &['Ꮦ']), + ('ꮧ', &['Ꮧ']), + ('ꮨ', &['Ꮨ']), + ('ꮩ', &['Ꮩ']), + ('ꮪ', &['Ꮪ']), + ('ꮫ', &['Ꮫ']), + ('ꮬ', &['Ꮬ']), + ('ꮭ', &['Ꮭ']), + ('ꮮ', &['Ꮮ']), + ('ꮯ', &['Ꮯ']), + ('ꮰ', &['Ꮰ']), + ('ꮱ', &['Ꮱ']), + ('ꮲ', &['Ꮲ']), + ('ꮳ', &['Ꮳ']), + ('ꮴ', &['Ꮴ']), + ('ꮵ', &['Ꮵ']), + ('ꮶ', &['Ꮶ']), + ('ꮷ', &['Ꮷ']), + ('ꮸ', &['Ꮸ']), + ('ꮹ', &['Ꮹ']), + ('ꮺ', &['Ꮺ']), + ('ꮻ', &['Ꮻ']), + ('ꮼ', &['Ꮼ']), + ('ꮽ', &['Ꮽ']), + ('ꮾ', &['Ꮾ']), + ('ꮿ', &['Ꮿ']), + ('A', &['a']), + ('B', &['b']), + ('C', &['c']), + ('D', &['d']), + ('E', &['e']), + ('F', &['f']), + ('G', &['g']), + ('H', &['h']), + ('I', &['i']), + ('J', &['j']), + ('K', &['k']), + ('L', &['l']), + ('M', &['m']), + ('N', &['n']), + ('O', &['o']), + ('P', &['p']), + ('Q', &['q']), + ('R', &['r']), + ('S', &['s']), + ('T', &['t']), + ('U', &['u']), + ('V', &['v']), + ('W', &['w']), + ('X', &['x']), + ('Y', &['y']), + ('Z', &['z']), + ('a', &['A']), + ('b', &['B']), + ('c', &['C']), + ('d', &['D']), + ('e', &['E']), + ('f', &['F']), + ('g', &['G']), + ('h', &['H']), + ('i', &['I']), + ('j', &['J']), + ('k', &['K']), + ('l', &['L']), + ('m', &['M']), + ('n', &['N']), + ('o', &['O']), + ('p', &['P']), + ('q', &['Q']), + ('r', &['R']), + ('s', &['S']), + ('t', &['T']), + ('u', &['U']), + ('v', &['V']), + ('w', &['W']), + ('x', &['X']), + ('y', &['Y']), + ('z', &['Z']), + ('𐐀', &['𐐨']), + ('𐐁', &['𐐩']), + ('𐐂', &['𐐪']), + ('𐐃', &['𐐫']), + ('𐐄', &['𐐬']), + ('𐐅', &['𐐭']), + ('𐐆', &['𐐮']), + ('𐐇', &['𐐯']), + ('𐐈', &['𐐰']), + ('𐐉', &['𐐱']), + ('𐐊', &['𐐲']), + ('𐐋', &['𐐳']), + ('𐐌', &['𐐴']), + ('𐐍', &['𐐵']), + ('𐐎', &['𐐶']), + ('𐐏', &['𐐷']), + ('𐐐', &['𐐸']), + ('𐐑', &['𐐹']), + ('𐐒', &['𐐺']), + ('𐐓', &['𐐻']), + ('𐐔', &['𐐼']), + ('𐐕', &['𐐽']), + ('𐐖', &['𐐾']), + ('𐐗', &['𐐿']), + ('𐐘', &['𐑀']), + ('𐐙', &['𐑁']), + ('𐐚', &['𐑂']), + ('𐐛', &['𐑃']), + ('𐐜', &['𐑄']), + ('𐐝', &['𐑅']), + ('𐐞', &['𐑆']), + ('𐐟', &['𐑇']), + ('𐐠', &['𐑈']), + ('𐐡', &['𐑉']), + ('𐐢', &['𐑊']), + ('𐐣', &['𐑋']), + ('𐐤', &['𐑌']), + ('𐐥', &['𐑍']), + ('𐐦', &['𐑎']), + ('𐐧', &['𐑏']), + ('𐐨', &['𐐀']), + ('𐐩', &['𐐁']), + ('𐐪', &['𐐂']), + ('𐐫', &['𐐃']), + ('𐐬', &['𐐄']), + ('𐐭', &['𐐅']), + ('𐐮', &['𐐆']), + ('𐐯', &['𐐇']), + ('𐐰', &['𐐈']), + ('𐐱', &['𐐉']), + ('𐐲', &['𐐊']), + ('𐐳', &['𐐋']), + ('𐐴', &['𐐌']), + ('𐐵', &['𐐍']), + ('𐐶', &['𐐎']), + ('𐐷', &['𐐏']), + ('𐐸', &['𐐐']), + ('𐐹', &['𐐑']), + ('𐐺', &['𐐒']), + ('𐐻', &['𐐓']), + ('𐐼', &['𐐔']), + ('𐐽', &['𐐕']), + ('𐐾', &['𐐖']), + ('𐐿', &['𐐗']), + ('𐑀', &['𐐘']), + ('𐑁', &['𐐙']), + ('𐑂', &['𐐚']), + ('𐑃', &['𐐛']), + ('𐑄', &['𐐜']), + ('𐑅', &['𐐝']), + ('𐑆', &['𐐞']), + ('𐑇', &['𐐟']), + ('𐑈', &['𐐠']), + ('𐑉', &['𐐡']), + ('𐑊', &['𐐢']), + ('𐑋', &['𐐣']), + ('𐑌', &['𐐤']), + ('𐑍', &['𐐥']), + ('𐑎', &['𐐦']), + ('𐑏', &['𐐧']), + ('𐒰', &['𐓘']), + ('𐒱', &['𐓙']), + ('𐒲', &['𐓚']), + ('𐒳', &['𐓛']), + ('𐒴', &['𐓜']), + ('𐒵', &['𐓝']), + ('𐒶', &['𐓞']), + ('𐒷', &['𐓟']), + ('𐒸', &['𐓠']), + ('𐒹', &['𐓡']), + ('𐒺', &['𐓢']), + ('𐒻', &['𐓣']), + ('𐒼', &['𐓤']), + ('𐒽', &['𐓥']), + ('𐒾', &['𐓦']), + ('𐒿', &['𐓧']), + ('𐓀', &['𐓨']), + ('𐓁', &['𐓩']), + ('𐓂', &['𐓪']), + ('𐓃', &['𐓫']), + ('𐓄', &['𐓬']), + ('𐓅', &['𐓭']), + ('𐓆', &['𐓮']), + ('𐓇', &['𐓯']), + ('𐓈', &['𐓰']), + ('𐓉', &['𐓱']), + ('𐓊', &['𐓲']), + ('𐓋', &['𐓳']), + ('𐓌', &['𐓴']), + ('𐓍', &['𐓵']), + ('𐓎', &['𐓶']), + ('𐓏', &['𐓷']), + ('𐓐', &['𐓸']), + ('𐓑', &['𐓹']), + ('𐓒', &['𐓺']), + ('𐓓', &['𐓻']), + ('𐓘', &['𐒰']), + ('𐓙', &['𐒱']), + ('𐓚', &['𐒲']), + ('𐓛', &['𐒳']), + ('𐓜', &['𐒴']), + ('𐓝', &['𐒵']), + ('𐓞', &['𐒶']), + ('𐓟', &['𐒷']), + ('𐓠', &['𐒸']), + ('𐓡', &['𐒹']), + ('𐓢', &['𐒺']), + ('𐓣', &['𐒻']), + ('𐓤', &['𐒼']), + ('𐓥', &['𐒽']), + ('𐓦', &['𐒾']), + ('𐓧', &['𐒿']), + ('𐓨', &['𐓀']), + ('𐓩', &['𐓁']), + ('𐓪', &['𐓂']), + ('𐓫', &['𐓃']), + ('𐓬', &['𐓄']), + ('𐓭', &['𐓅']), + ('𐓮', &['𐓆']), + ('𐓯', &['𐓇']), + ('𐓰', &['𐓈']), + ('𐓱', &['𐓉']), + ('𐓲', &['𐓊']), + ('𐓳', &['𐓋']), + ('𐓴', &['𐓌']), + ('𐓵', &['𐓍']), + ('𐓶', &['𐓎']), + ('𐓷', &['𐓏']), + ('𐓸', &['𐓐']), + ('𐓹', &['𐓑']), + ('𐓺', &['𐓒']), + ('𐓻', &['𐓓']), + ('𐕰', &['𐖗']), + ('𐕱', &['𐖘']), + ('𐕲', &['𐖙']), + ('𐕳', &['𐖚']), + ('𐕴', &['𐖛']), + ('𐕵', &['𐖜']), + ('𐕶', &['𐖝']), + ('𐕷', &['𐖞']), + ('𐕸', &['𐖟']), + ('𐕹', &['𐖠']), + ('𐕺', &['𐖡']), + ('𐕼', &['𐖣']), + ('𐕽', &['𐖤']), + ('𐕾', &['𐖥']), + ('𐕿', &['𐖦']), + ('𐖀', &['𐖧']), + ('𐖁', &['𐖨']), + ('𐖂', &['𐖩']), + ('𐖃', &['𐖪']), + ('𐖄', &['𐖫']), + ('𐖅', &['𐖬']), + ('𐖆', &['𐖭']), + ('𐖇', &['𐖮']), + ('𐖈', &['𐖯']), + ('𐖉', &['𐖰']), + ('𐖊', &['𐖱']), + ('𐖌', &['𐖳']), + ('𐖍', &['𐖴']), + ('𐖎', &['𐖵']), + ('𐖏', &['𐖶']), + ('𐖐', &['𐖷']), + ('𐖑', &['𐖸']), + ('𐖒', &['𐖹']), + ('𐖔', &['𐖻']), + ('𐖕', &['𐖼']), + ('𐖗', &['𐕰']), + ('𐖘', &['𐕱']), + ('𐖙', &['𐕲']), + ('𐖚', &['𐕳']), + ('𐖛', &['𐕴']), + ('𐖜', &['𐕵']), + ('𐖝', &['𐕶']), + ('𐖞', &['𐕷']), + ('𐖟', &['𐕸']), + ('𐖠', &['𐕹']), + ('𐖡', &['𐕺']), + ('𐖣', &['𐕼']), + ('𐖤', &['𐕽']), + ('𐖥', &['𐕾']), + ('𐖦', &['𐕿']), + ('𐖧', &['𐖀']), + ('𐖨', &['𐖁']), + ('𐖩', &['𐖂']), + ('𐖪', &['𐖃']), + ('𐖫', &['𐖄']), + ('𐖬', &['𐖅']), + ('𐖭', &['𐖆']), + ('𐖮', &['𐖇']), + ('𐖯', &['𐖈']), + ('𐖰', &['𐖉']), + ('𐖱', &['𐖊']), + ('𐖳', &['𐖌']), + ('𐖴', &['𐖍']), + ('𐖵', &['𐖎']), + ('𐖶', &['𐖏']), + ('𐖷', &['𐖐']), + ('𐖸', &['𐖑']), + ('𐖹', &['𐖒']), + ('𐖻', &['𐖔']), + ('𐖼', &['𐖕']), + ('𐲀', &['𐳀']), + ('𐲁', &['𐳁']), + ('𐲂', &['𐳂']), + ('𐲃', &['𐳃']), + ('𐲄', &['𐳄']), + ('𐲅', &['𐳅']), + ('𐲆', &['𐳆']), + ('𐲇', &['𐳇']), + ('𐲈', &['𐳈']), + ('𐲉', &['𐳉']), + ('𐲊', &['𐳊']), + ('𐲋', &['𐳋']), + ('𐲌', &['𐳌']), + ('𐲍', &['𐳍']), + ('𐲎', &['𐳎']), + ('𐲏', &['𐳏']), + ('𐲐', &['𐳐']), + ('𐲑', &['𐳑']), + ('𐲒', &['𐳒']), + ('𐲓', &['𐳓']), + ('𐲔', &['𐳔']), + ('𐲕', &['𐳕']), + ('𐲖', &['𐳖']), + ('𐲗', &['𐳗']), + ('𐲘', &['𐳘']), + ('𐲙', &['𐳙']), + ('𐲚', &['𐳚']), + ('𐲛', &['𐳛']), + ('𐲜', &['𐳜']), + ('𐲝', &['𐳝']), + ('𐲞', &['𐳞']), + ('𐲟', &['𐳟']), + ('𐲠', &['𐳠']), + ('𐲡', &['𐳡']), + ('𐲢', &['𐳢']), + ('𐲣', &['𐳣']), + ('𐲤', &['𐳤']), + ('𐲥', &['𐳥']), + ('𐲦', &['𐳦']), + ('𐲧', &['𐳧']), + ('𐲨', &['𐳨']), + ('𐲩', &['𐳩']), + ('𐲪', &['𐳪']), + ('𐲫', &['𐳫']), + ('𐲬', &['𐳬']), + ('𐲭', &['𐳭']), + ('𐲮', &['𐳮']), + ('𐲯', &['𐳯']), + ('𐲰', &['𐳰']), + ('𐲱', &['𐳱']), + ('𐲲', &['𐳲']), + ('𐳀', &['𐲀']), + ('𐳁', &['𐲁']), + ('𐳂', &['𐲂']), + ('𐳃', &['𐲃']), + ('𐳄', &['𐲄']), + ('𐳅', &['𐲅']), + ('𐳆', &['𐲆']), + ('𐳇', &['𐲇']), + ('𐳈', &['𐲈']), + ('𐳉', &['𐲉']), + ('𐳊', &['𐲊']), + ('𐳋', &['𐲋']), + ('𐳌', &['𐲌']), + ('𐳍', &['𐲍']), + ('𐳎', &['𐲎']), + ('𐳏', &['𐲏']), + ('𐳐', &['𐲐']), + ('𐳑', &['𐲑']), + ('𐳒', &['𐲒']), + ('𐳓', &['𐲓']), + ('𐳔', &['𐲔']), + ('𐳕', &['𐲕']), + ('𐳖', &['𐲖']), + ('𐳗', &['𐲗']), + ('𐳘', &['𐲘']), + ('𐳙', &['𐲙']), + ('𐳚', &['𐲚']), + ('𐳛', &['𐲛']), + ('𐳜', &['𐲜']), + ('𐳝', &['𐲝']), + ('𐳞', &['𐲞']), + ('𐳟', &['𐲟']), + ('𐳠', &['𐲠']), + ('𐳡', &['𐲡']), + ('𐳢', &['𐲢']), + ('𐳣', &['𐲣']), + ('𐳤', &['𐲤']), + ('𐳥', &['𐲥']), + ('𐳦', &['𐲦']), + ('𐳧', &['𐲧']), + ('𐳨', &['𐲨']), + ('𐳩', &['𐲩']), + ('𐳪', &['𐲪']), + ('𐳫', &['𐲫']), + ('𐳬', &['𐲬']), + ('𐳭', &['𐲭']), + ('𐳮', &['𐲮']), + ('𐳯', &['𐲯']), + ('𐳰', &['𐲰']), + ('𐳱', &['𐲱']), + ('𐳲', &['𐲲']), + ('𑢠', &['𑣀']), + ('𑢡', &['𑣁']), + ('𑢢', &['𑣂']), + ('𑢣', &['𑣃']), + ('𑢤', &['𑣄']), + ('𑢥', &['𑣅']), + ('𑢦', &['𑣆']), + ('𑢧', &['𑣇']), + ('𑢨', &['𑣈']), + ('𑢩', &['𑣉']), + ('𑢪', &['𑣊']), + ('𑢫', &['𑣋']), + ('𑢬', &['𑣌']), + ('𑢭', &['𑣍']), + ('𑢮', &['𑣎']), + ('𑢯', &['𑣏']), + ('𑢰', &['𑣐']), + ('𑢱', &['𑣑']), + ('𑢲', &['𑣒']), + ('𑢳', &['𑣓']), + ('𑢴', &['𑣔']), + ('𑢵', &['𑣕']), + ('𑢶', &['𑣖']), + ('𑢷', &['𑣗']), + ('𑢸', &['𑣘']), + ('𑢹', &['𑣙']), + ('𑢺', &['𑣚']), + ('𑢻', &['𑣛']), + ('𑢼', &['𑣜']), + ('𑢽', &['𑣝']), + ('𑢾', &['𑣞']), + ('𑢿', &['𑣟']), + ('𑣀', &['𑢠']), + ('𑣁', &['𑢡']), + ('𑣂', &['𑢢']), + ('𑣃', &['𑢣']), + ('𑣄', &['𑢤']), + ('𑣅', &['𑢥']), + ('𑣆', &['𑢦']), + ('𑣇', &['𑢧']), + ('𑣈', &['𑢨']), + ('𑣉', &['𑢩']), + ('𑣊', &['𑢪']), + ('𑣋', &['𑢫']), + ('𑣌', &['𑢬']), + ('𑣍', &['𑢭']), + ('𑣎', &['𑢮']), + ('𑣏', &['𑢯']), + ('𑣐', &['𑢰']), + ('𑣑', &['𑢱']), + ('𑣒', &['𑢲']), + ('𑣓', &['𑢳']), + ('𑣔', &['𑢴']), + ('𑣕', &['𑢵']), + ('𑣖', &['𑢶']), + ('𑣗', &['𑢷']), + ('𑣘', &['𑢸']), + ('𑣙', &['𑢹']), + ('𑣚', &['𑢺']), + ('𑣛', &['𑢻']), + ('𑣜', &['𑢼']), + ('𑣝', &['𑢽']), + ('𑣞', &['𑢾']), + ('𑣟', &['𑢿']), + ('𖹀', &['𖹠']), + ('𖹁', &['𖹡']), + ('𖹂', &['𖹢']), + ('𖹃', &['𖹣']), + ('𖹄', &['𖹤']), + ('𖹅', &['𖹥']), + ('𖹆', &['𖹦']), + ('𖹇', &['𖹧']), + ('𖹈', &['𖹨']), + ('𖹉', &['𖹩']), + ('𖹊', &['𖹪']), + ('𖹋', &['𖹫']), + ('𖹌', &['𖹬']), + ('𖹍', &['𖹭']), + ('𖹎', &['𖹮']), + ('𖹏', &['𖹯']), + ('𖹐', &['𖹰']), + ('𖹑', &['𖹱']), + ('𖹒', &['𖹲']), + ('𖹓', &['𖹳']), + ('𖹔', &['𖹴']), + ('𖹕', &['𖹵']), + ('𖹖', &['𖹶']), + ('𖹗', &['𖹷']), + ('𖹘', &['𖹸']), + ('𖹙', &['𖹹']), + ('𖹚', &['𖹺']), + ('𖹛', &['𖹻']), + ('𖹜', &['𖹼']), + ('𖹝', &['𖹽']), + ('𖹞', &['𖹾']), + ('𖹟', &['𖹿']), + ('𖹠', &['𖹀']), + ('𖹡', &['𖹁']), + ('𖹢', &['𖹂']), + ('𖹣', &['𖹃']), + ('𖹤', &['𖹄']), + ('𖹥', &['𖹅']), + ('𖹦', &['𖹆']), + ('𖹧', &['𖹇']), + ('𖹨', &['𖹈']), + ('𖹩', &['𖹉']), + ('𖹪', &['𖹊']), + ('𖹫', &['𖹋']), + ('𖹬', &['𖹌']), + ('𖹭', &['𖹍']), + ('𖹮', &['𖹎']), + ('𖹯', &['𖹏']), + ('𖹰', &['𖹐']), + ('𖹱', &['𖹑']), + ('𖹲', &['𖹒']), + ('𖹳', &['𖹓']), + ('𖹴', &['𖹔']), + ('𖹵', &['𖹕']), + ('𖹶', &['𖹖']), + ('𖹷', &['𖹗']), + ('𖹸', &['𖹘']), + ('𖹹', &['𖹙']), + ('𖹺', &['𖹚']), + ('𖹻', &['𖹛']), + ('𖹼', &['𖹜']), + ('𖹽', &['𖹝']), + ('𖹾', &['𖹞']), + ('𖹿', &['𖹟']), + ('𞤀', &['𞤢']), + ('𞤁', &['𞤣']), + ('𞤂', &['𞤤']), + ('𞤃', &['𞤥']), + ('𞤄', &['𞤦']), + ('𞤅', &['𞤧']), + ('𞤆', &['𞤨']), + ('𞤇', &['𞤩']), + ('𞤈', &['𞤪']), + ('𞤉', &['𞤫']), + ('𞤊', &['𞤬']), + ('𞤋', &['𞤭']), + ('𞤌', &['𞤮']), + ('𞤍', &['𞤯']), + ('𞤎', &['𞤰']), + ('𞤏', &['𞤱']), + ('𞤐', &['𞤲']), + ('𞤑', &['𞤳']), + ('𞤒', &['𞤴']), + ('𞤓', &['𞤵']), + ('𞤔', &['𞤶']), + ('𞤕', &['𞤷']), + ('𞤖', &['𞤸']), + ('𞤗', &['𞤹']), + ('𞤘', &['𞤺']), + ('𞤙', &['𞤻']), + ('𞤚', &['𞤼']), + ('𞤛', &['𞤽']), + ('𞤜', &['𞤾']), + ('𞤝', &['𞤿']), + ('𞤞', &['𞥀']), + ('𞤟', &['𞥁']), + ('𞤠', &['𞥂']), + ('𞤡', &['𞥃']), + ('𞤢', &['𞤀']), + ('𞤣', &['𞤁']), + ('𞤤', &['𞤂']), + ('𞤥', &['𞤃']), + ('𞤦', &['𞤄']), + ('𞤧', &['𞤅']), + ('𞤨', &['𞤆']), + ('𞤩', &['𞤇']), + ('𞤪', &['𞤈']), + ('𞤫', &['𞤉']), + ('𞤬', &['𞤊']), + ('𞤭', &['𞤋']), + ('𞤮', &['𞤌']), + ('𞤯', &['𞤍']), + ('𞤰', &['𞤎']), + ('𞤱', &['𞤏']), + ('𞤲', &['𞤐']), + ('𞤳', &['𞤑']), + ('𞤴', &['𞤒']), + ('𞤵', &['𞤓']), + ('𞤶', &['𞤔']), + ('𞤷', &['𞤕']), + ('𞤸', &['𞤖']), + ('𞤹', &['𞤗']), + ('𞤺', &['𞤘']), + ('𞤻', &['𞤙']), + ('𞤼', &['𞤚']), + ('𞤽', &['𞤛']), + ('𞤾', &['𞤜']), + ('𞤿', &['𞤝']), + ('𞥀', &['𞤞']), + ('𞥁', &['𞤟']), + ('𞥂', &['𞤠']), + ('𞥃', &['𞤡']), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/general_category.rs b/vendor/regex-syntax/src/unicode_tables/general_category.rs new file mode 100644 index 0000000000..8fc9289127 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/general_category.rs @@ -0,0 +1,6552 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category ucd-15.0.0 --chars --exclude surrogate +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Cased_Letter", CASED_LETTER), + ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), + ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), + ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), + ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), + ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), + ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), + ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), + ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), + ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), + ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), + ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), + ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), + ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), + ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), + ("Spacing_Mark", SPACING_MARK), + ("Symbol", SYMBOL), + ("Titlecase_Letter", TITLECASE_LETTER), + ("Unassigned", UNASSIGNED), + ("Uppercase_Letter", UPPERCASE_LETTER), +]; + +pub const CASED_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('µ', 'µ'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ƺ'), + ('Ƽ', 'ƿ'), + ('DŽ', 'ʓ'), + ('ʕ', 'ʯ'), + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՠ', 'ֈ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᴀ', 'ᴫ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶚ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ↄ', 'ↄ'), + ('Ⰰ', 'ⱻ'), + ('Ȿ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚛ'), + ('Ꜣ', 'ꝯ'), + ('ꝱ', 'ꞇ'), + ('Ꞌ', 'ꞎ'), + ('Ꞑ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('Ꟶ', 'ꟶ'), + ('ꟺ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭠ', 'ꭨ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('a', 'z'), + ('𐐀', '𐑏'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𑢠', '𑣟'), + ('𖹀', '𖹿'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞤀', '𞥃'), +]; + +pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[ + (')', ')'), + (']', ']'), + ('}', '}'), + ('༻', '༻'), + ('༽', '༽'), + ('᚜', '᚜'), + ('⁆', '⁆'), + ('⁾', '⁾'), + ('₎', '₎'), + ('⌉', '⌉'), + ('⌋', '⌋'), + ('〉', '〉'), + ('❩', '❩'), + ('❫', '❫'), + ('❭', '❭'), + ('❯', '❯'), + ('❱', '❱'), + ('❳', '❳'), + ('❵', '❵'), + ('⟆', '⟆'), + ('⟧', '⟧'), + ('⟩', '⟩'), + ('⟫', '⟫'), + ('⟭', '⟭'), + ('⟯', '⟯'), + ('⦄', '⦄'), + ('⦆', '⦆'), + ('⦈', '⦈'), + ('⦊', '⦊'), + ('⦌', '⦌'), + ('⦎', '⦎'), + ('⦐', '⦐'), + ('⦒', '⦒'), + ('⦔', '⦔'), + ('⦖', '⦖'), + ('⦘', '⦘'), + ('⧙', '⧙'), + ('⧛', '⧛'), + ('⧽', '⧽'), + ('⸣', '⸣'), + ('⸥', '⸥'), + ('⸧', '⸧'), + ('⸩', '⸩'), + ('⹖', '⹖'), + ('⹘', '⹘'), + ('⹚', '⹚'), + ('⹜', '⹜'), + ('〉', '〉'), + ('》', '》'), + ('」', '」'), + ('』', '』'), + ('】', '】'), + ('〕', '〕'), + ('〗', '〗'), + ('〙', '〙'), + ('〛', '〛'), + ('〞', '〟'), + ('﴾', '﴾'), + ('︘', '︘'), + ('︶', '︶'), + ('︸', '︸'), + ('︺', '︺'), + ('︼', '︼'), + ('︾', '︾'), + ('﹀', '﹀'), + ('﹂', '﹂'), + ('﹄', '﹄'), + ('﹈', '﹈'), + ('﹚', '﹚'), + ('﹜', '﹜'), + ('﹞', '﹞'), + (')', ')'), + (']', ']'), + ('}', '}'), + ('⦆', '⦆'), + ('」', '」'), +]; + +pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[ + ('_', '_'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('︳', '︴'), + ('﹍', '﹏'), + ('_', '_'), +]; + +pub const CONTROL: &'static [(char, char)] = + &[('\0', '\u{1f}'), ('\u{7f}', '\u{9f}')]; + +pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), + ('¢', '¥'), + ('֏', '֏'), + ('؋', '؋'), + ('߾', '߿'), + ('৲', '৳'), + ('৻', '৻'), + ('૱', '૱'), + ('௹', '௹'), + ('฿', '฿'), + ('៛', '៛'), + ('₠', '⃀'), + ('꠸', '꠸'), + ('﷼', '﷼'), + ('﹩', '﹩'), + ('$', '$'), + ('¢', '£'), + ('¥', '₩'), + ('𑿝', '𑿠'), + ('𞋿', '𞋿'), + ('𞲰', '𞲰'), +]; + +pub const DASH_PUNCTUATION: &'static [(char, char)] = &[ + ('-', '-'), + ('֊', '֊'), + ('־', '־'), + ('᐀', '᐀'), + ('᠆', '᠆'), + ('‐', '―'), + ('⸗', '⸗'), + ('⸚', '⸚'), + ('⸺', '⸻'), + ('⹀', '⹀'), + ('⹝', '⹝'), + ('〜', '〜'), + ('〰', '〰'), + ('゠', '゠'), + ('︱', '︲'), + ('﹘', '﹘'), + ('﹣', '﹣'), + ('-', '-'), + ('𐺭', '𐺭'), +]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; + +pub const ENCLOSING_MARK: &'static [(char, char)] = &[ + ('\u{488}', '\u{489}'), + ('\u{1abe}', '\u{1abe}'), + ('\u{20dd}', '\u{20e0}'), + ('\u{20e2}', '\u{20e4}'), + ('\u{a670}', '\u{a672}'), +]; + +pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('»', '»'), + ('’', '’'), + ('”', '”'), + ('›', '›'), + ('⸃', '⸃'), + ('⸅', '⸅'), + ('⸊', '⸊'), + ('⸍', '⸍'), + ('⸝', '⸝'), + ('⸡', '⸡'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[ + ('«', '«'), + ('‘', '‘'), + ('‛', '“'), + ('‟', '‟'), + ('‹', '‹'), + ('⸂', '⸂'), + ('⸄', '⸄'), + ('⸉', '⸉'), + ('⸌', '⸌'), + ('⸜', '⸜'), + ('⸠', '⸠'), +]; + +pub const LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛱ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ↄ', 'ↄ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ⸯ', 'ⸯ'), + ('々', '〆'), + ('〱', '〵'), + ('〻', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛥ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍀'), + ('𐍂', '𐍉'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const LETTER_NUMBER: &'static [(char, char)] = &[ + ('ᛮ', 'ᛰ'), + ('Ⅰ', 'ↂ'), + ('ↅ', 'ↈ'), + ('〇', '〇'), + ('〡', '〩'), + ('〸', '〺'), + ('ꛦ', 'ꛯ'), + ('𐅀', '𐅴'), + ('𐍁', '𐍁'), + ('𐍊', '𐍊'), + ('𐏑', '𐏕'), + ('𒐀', '𒑮'), +]; + +pub const LINE_SEPARATOR: &'static [(char, char)] = + &[('\u{2028}', '\u{2028}')]; + +pub const LOWERCASE_LETTER: &'static [(char, char)] = &[ + ('a', 'z'), + ('µ', 'µ'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ĸ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), + ('ƽ', 'ƿ'), + ('dž', 'dž'), + ('lj', 'lj'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȹ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ʓ'), + ('ʕ', 'ʯ'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϼ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ՠ', 'ֈ'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᴀ', 'ᴫ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶚ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ℊ', 'ℊ'), + ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), + ('ℴ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('ↄ', 'ↄ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), + ('ⱳ', 'ⱴ'), + ('ⱶ', 'ⱻ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝯ'), + ('ꝱ', 'ꝸ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞯ', 'ꞯ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟕ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟶ', 'ꟶ'), + ('ꟺ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭠ', 'ꭨ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𝐚', '𝐳'), + ('𝑎', '𝑔'), + ('𝑖', '𝑧'), + ('𝒂', '𝒛'), + ('𝒶', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝓏'), + ('𝓪', '𝔃'), + ('𝔞', '𝔷'), + ('𝕒', '𝕫'), + ('𝖆', '𝖟'), + ('𝖺', '𝗓'), + ('𝗮', '𝘇'), + ('𝘢', '𝘻'), + ('𝙖', '𝙯'), + ('𝚊', '𝚥'), + ('𝛂', '𝛚'), + ('𝛜', '𝛡'), + ('𝛼', '𝜔'), + ('𝜖', '𝜛'), + ('𝜶', '𝝎'), + ('𝝐', '𝝕'), + ('𝝰', '𝞈'), + ('𝞊', '𝞏'), + ('𝞪', '𝟂'), + ('𝟄', '𝟉'), + ('𝟋', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞤢', '𞥃'), +]; + +pub const MARK: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', 'ः'), + ('\u{93a}', '\u{93c}'), + ('ा', 'ॏ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('\u{abc}', '\u{abc}'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('\u{cbc}', '\u{cbc}'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('ါ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '᜕'), + ('\u{1732}', '᜴'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b34}', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1bad}'), + ('\u{1be6}', '᯳'), + ('ᰤ', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꢀ', 'ꢁ'), + ('ꢴ', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '꥓'), + ('\u{a980}', 'ꦃ'), + ('\u{a9b3}', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', '\u{aaf6}'), + ('ꯣ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '𑂂'), + ('𑂰', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('𑅅', '𑅆'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑇀'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐵', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('𑠬', '\u{1183a}'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193e}'), + ('𑥀', '𑥀'), + ('𑥂', '\u{11943}'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '\u{11d97}'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const MATH_SYMBOL: &'static [(char, char)] = &[ + ('+', '+'), + ('<', '>'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('±', '±'), + ('×', '×'), + ('÷', '÷'), + ('϶', '϶'), + ('؆', '؈'), + ('⁄', '⁄'), + ('⁒', '⁒'), + ('⁺', '⁼'), + ('₊', '₌'), + ('℘', '℘'), + ('⅀', '⅄'), + ('⅋', '⅋'), + ('←', '↔'), + ('↚', '↛'), + ('↠', '↠'), + ('↣', '↣'), + ('↦', '↦'), + ('↮', '↮'), + ('⇎', '⇏'), + ('⇒', '⇒'), + ('⇔', '⇔'), + ('⇴', '⋿'), + ('⌠', '⌡'), + ('⍼', '⍼'), + ('⎛', '⎳'), + ('⏜', '⏡'), + ('▷', '▷'), + ('◁', '◁'), + ('◸', '◿'), + ('♯', '♯'), + ('⟀', '⟄'), + ('⟇', '⟥'), + ('⟰', '⟿'), + ('⤀', '⦂'), + ('⦙', '⧗'), + ('⧜', '⧻'), + ('⧾', '⫿'), + ('⬰', '⭄'), + ('⭇', '⭌'), + ('﬩', '﬩'), + ('﹢', '﹢'), + ('﹤', '﹦'), + ('+', '+'), + ('<', '>'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('←', '↓'), + ('𝛁', '𝛁'), + ('𝛛', '𝛛'), + ('𝛻', '𝛻'), + ('𝜕', '𝜕'), + ('𝜵', '𝜵'), + ('𝝏', '𝝏'), + ('𝝯', '𝝯'), + ('𝞉', '𝞉'), + ('𝞩', '𝞩'), + ('𝟃', '𝟃'), + ('𞻰', '𞻱'), +]; + +pub const MODIFIER_LETTER: &'static [(char, char)] = &[ + ('ʰ', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('ʹ', 'ʹ'), + ('ͺ', 'ͺ'), + ('ՙ', 'ՙ'), + ('ـ', 'ـ'), + ('ۥ', 'ۦ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࣉ', 'ࣉ'), + ('ॱ', 'ॱ'), + ('ๆ', 'ๆ'), + ('ໆ', 'ໆ'), + ('ჼ', 'ჼ'), + ('ៗ', 'ៗ'), + ('ᡃ', 'ᡃ'), + ('ᪧ', 'ᪧ'), + ('ᱸ', 'ᱽ'), + ('ᴬ', 'ᵪ'), + ('ᵸ', 'ᵸ'), + ('ᶛ', 'ᶿ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ⱼ', 'ⱽ'), + ('ⵯ', 'ⵯ'), + ('ⸯ', 'ⸯ'), + ('々', '々'), + ('〱', '〵'), + ('〻', '〻'), + ('ゝ', 'ゞ'), + ('ー', 'ヾ'), + ('ꀕ', 'ꀕ'), + ('ꓸ', 'ꓽ'), + ('ꘌ', 'ꘌ'), + ('ꙿ', 'ꙿ'), + ('ꚜ', 'ꚝ'), + ('ꜗ', 'ꜟ'), + ('ꝰ', 'ꝰ'), + ('ꞈ', 'ꞈ'), + ('ꟲ', 'ꟴ'), + ('ꟸ', 'ꟹ'), + ('ꧏ', 'ꧏ'), + ('ꧦ', 'ꧦ'), + ('ꩰ', 'ꩰ'), + ('ꫝ', 'ꫝ'), + ('ꫳ', 'ꫴ'), + ('ꭜ', 'ꭟ'), + ('ꭩ', 'ꭩ'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𖭀', '𖭃'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𞀰', '𞁭'), + ('𞄷', '𞄽'), + ('𞓫', '𞓫'), + ('𞥋', '𞥋'), +]; + +pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[ + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('¯', '¯'), + ('´', '´'), + ('¸', '¸'), + ('˂', '˅'), + ('˒', '˟'), + ('˥', '˫'), + ('˭', '˭'), + ('˯', '˿'), + ('͵', '͵'), + ('΄', '΅'), + ('࢈', '࢈'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('゛', '゜'), + ('꜀', '꜖'), + ('꜠', '꜡'), + ('꞉', '꞊'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('﮲', '﯂'), + ('^', '^'), + ('`', '`'), + (' ̄', ' ̄'), + ('🏻', '🏿'), +]; + +pub const NONSPACING_MARK: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{487}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3f}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b56}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b34}'), + ('\u{1b36}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302d}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{11340}', '\u{11340}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('²', '³'), + ('¹', '¹'), + ('¼', '¾'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('৴', '৹'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('୲', '୷'), + ('௦', '௲'), + ('౦', '౯'), + ('౸', '౾'), + ('೦', '೯'), + ('൘', '൞'), + ('൦', '൸'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༳'), + ('၀', '၉'), + ('႐', '႙'), + ('፩', '፼'), + ('ᛮ', 'ᛰ'), + ('០', '៩'), + ('៰', '៹'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧚'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('⁰', '⁰'), + ('⁴', '⁹'), + ('₀', '₉'), + ('⅐', 'ↂ'), + ('ↅ', '↉'), + ('①', '⒛'), + ('⓪', '⓿'), + ('❶', '➓'), + ('⳽', '⳽'), + ('〇', '〇'), + ('〡', '〩'), + ('〸', '〺'), + ('㆒', '㆕'), + ('㈠', '㈩'), + ('㉈', '㉏'), + ('㉑', '㉟'), + ('㊀', '㊉'), + ('㊱', '㊿'), + ('꘠', '꘩'), + ('ꛦ', 'ꛯ'), + ('꠰', '꠵'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐄇', '𐄳'), + ('𐅀', '𐅸'), + ('𐆊', '𐆋'), + ('𐋡', '𐋻'), + ('𐌠', '𐌣'), + ('𐍁', '𐍁'), + ('𐍊', '𐍊'), + ('𐏑', '𐏕'), + ('𐒠', '𐒩'), + ('𐡘', '𐡟'), + ('𐡹', '𐡿'), + ('𐢧', '𐢯'), + ('𐣻', '𐣿'), + ('𐤖', '𐤛'), + ('𐦼', '𐦽'), + ('𐧀', '𐧏'), + ('𐧒', '𐧿'), + ('𐩀', '𐩈'), + ('𐩽', '𐩾'), + ('𐪝', '𐪟'), + ('𐫫', '𐫯'), + ('𐭘', '𐭟'), + ('𐭸', '𐭿'), + ('𐮩', '𐮯'), + ('𐳺', '𐳿'), + ('𐴰', '𐴹'), + ('𐹠', '𐹾'), + ('𐼝', '𐼦'), + ('𐽑', '𐽔'), + ('𐿅', '𐿋'), + ('𑁒', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑇡', '𑇴'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜻'), + ('𑣠', '𑣲'), + ('𑥐', '𑥙'), + ('𑱐', '𑱬'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𑿀', '𑿔'), + ('𒐀', '𒑮'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𖭛', '𖭡'), + ('𖺀', '𖺖'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝍠', '𝍸'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞣇', '𞣏'), + ('𞥐', '𞥙'), + ('𞱱', '𞲫'), + ('𞲭', '𞲯'), + ('𞲱', '𞲴'), + ('𞴁', '𞴭'), + ('𞴯', '𞴽'), + ('🄀', '🄌'), + ('🯰', '🯹'), +]; + +pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[ + ('(', '('), + ('[', '['), + ('{', '{'), + ('༺', '༺'), + ('༼', '༼'), + ('᚛', '᚛'), + ('‚', '‚'), + ('„', '„'), + ('⁅', '⁅'), + ('⁽', '⁽'), + ('₍', '₍'), + ('⌈', '⌈'), + ('⌊', '⌊'), + ('〈', '〈'), + ('❨', '❨'), + ('❪', '❪'), + ('❬', '❬'), + ('❮', '❮'), + ('❰', '❰'), + ('❲', '❲'), + ('❴', '❴'), + ('⟅', '⟅'), + ('⟦', '⟦'), + ('⟨', '⟨'), + ('⟪', '⟪'), + ('⟬', '⟬'), + ('⟮', '⟮'), + ('⦃', '⦃'), + ('⦅', '⦅'), + ('⦇', '⦇'), + ('⦉', '⦉'), + ('⦋', '⦋'), + ('⦍', '⦍'), + ('⦏', '⦏'), + ('⦑', '⦑'), + ('⦓', '⦓'), + ('⦕', '⦕'), + ('⦗', '⦗'), + ('⧘', '⧘'), + ('⧚', '⧚'), + ('⧼', '⧼'), + ('⸢', '⸢'), + ('⸤', '⸤'), + ('⸦', '⸦'), + ('⸨', '⸨'), + ('⹂', '⹂'), + ('⹕', '⹕'), + ('⹗', '⹗'), + ('⹙', '⹙'), + ('⹛', '⹛'), + ('〈', '〈'), + ('《', '《'), + ('「', '「'), + ('『', '『'), + ('【', '【'), + ('〔', '〔'), + ('〖', '〖'), + ('〘', '〘'), + ('〚', '〚'), + ('〝', '〝'), + ('﴿', '﴿'), + ('︗', '︗'), + ('︵', '︵'), + ('︷', '︷'), + ('︹', '︹'), + ('︻', '︻'), + ('︽', '︽'), + ('︿', '︿'), + ('﹁', '﹁'), + ('﹃', '﹃'), + ('﹇', '﹇'), + ('﹙', '﹙'), + ('﹛', '﹛'), + ('﹝', '﹝'), + ('(', '('), + ('[', '['), + ('{', '{'), + ('⦅', '⦅'), + ('「', '「'), +]; + +pub const OTHER: &'static [(char, char)] = &[ + ('\0', '\u{1f}'), + ('\u{7f}', '\u{9f}'), + ('\u{ad}', '\u{ad}'), + ('\u{378}', '\u{379}'), + ('\u{380}', '\u{383}'), + ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), + ('\u{3a2}', '\u{3a2}'), + ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), + ('\u{58b}', '\u{58c}'), + ('\u{590}', '\u{590}'), + ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ee}'), + ('\u{5f5}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70e}', '\u{70f}'), + ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7fc}'), + ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), + ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{86f}'), + ('\u{88f}', '\u{897}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), + ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), + ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), + ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), + ('\u{9e4}', '\u{9e5}'), + ('\u{9ff}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), + ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), + ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), + ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), + ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), + ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), + ('\u{a77}', '\u{a80}'), + ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), + ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), + ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), + ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), + ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), + ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), + ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), + ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b54}'), + ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), + ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), + ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), + ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), + ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), + ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), + ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), + ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), + ('\u{c3a}', '\u{c3b}'), + ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), + ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5c}'), + ('\u{c5e}', '\u{c5f}'), + ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c76}'), + ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), + ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), + ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdc}'), + ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), + ('\u{cf4}', '\u{cff}'), + ('\u{d0d}', '\u{d0d}'), + ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), + ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d80}'), + ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), + ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), + ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), + ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), + ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), + ('\u{e85}', '\u{e85}'), + ('\u{e8b}', '\u{e8b}'), + ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), + ('\u{ebe}', '\u{ebf}'), + ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), + ('\u{ecf}', '\u{ecf}'), + ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), + ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), + ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), + ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), + ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), + ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), + ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), + ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), + ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), + ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), + ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), + ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), + ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), + ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), + ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), + ('\u{1716}', '\u{171e}'), + ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), + ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), + ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), + ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), + ('\u{180e}', '\u{180e}'), + ('\u{181a}', '\u{181f}'), + ('\u{1879}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), + ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), + ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), + ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), + ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), + ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), + ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), + ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), + ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), + ('\u{1acf}', '\u{1aff}'), + ('\u{1b4d}', '\u{1b4f}'), + ('\u{1b7f}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), + ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), + ('\u{1c89}', '\u{1c8f}'), + ('\u{1cbb}', '\u{1cbc}'), + ('\u{1cc8}', '\u{1ccf}'), + ('\u{1cfb}', '\u{1cff}'), + ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), + ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), + ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), + ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), + ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), + ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), + ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), + ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('\u{2072}', '\u{2073}'), + ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), + ('\u{20c1}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), + ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), + ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), + ('\u{2b96}', '\u{2b96}'), + ('\u{2cf4}', '\u{2cf8}'), + ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), + ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), + ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), + ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), + ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), + ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), + ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), + ('\u{2e5e}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), + ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), + ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), + ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), + ('\u{3130}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), + ('\u{31e4}', '\u{31ef}'), + ('\u{321f}', '\u{321f}'), + ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), + ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), + ('\u{a7cb}', '\u{a7cf}'), + ('\u{a7d2}', '\u{a7d2}'), + ('\u{a7d4}', '\u{a7d4}'), + ('\u{a7da}', '\u{a7f1}'), + ('\u{a82d}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), + ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), + ('\u{a8da}', '\u{a8df}'), + ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), + ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), + ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), + ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), + ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), + ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), + ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), + ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab6c}', '\u{ab6f}'), + ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), + ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), + ('\u{d7fc}', '\u{f8ff}'), + ('\u{fa6e}', '\u{fa6f}'), + ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), + ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), + ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), + ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), + ('\u{fbc3}', '\u{fbd2}'), + ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdce}'), + ('\u{fdd0}', '\u{fdef}'), + ('\u{fe1a}', '\u{fe1f}'), + ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), + ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), + ('\u{fefd}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fffb}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), + ('\u{1019d}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), + ('\u{1057b}', '\u{1057b}'), + ('\u{1058b}', '\u{1058b}'), + ('\u{10593}', '\u{10593}'), + ('\u{10596}', '\u{10596}'), + ('\u{105a2}', '\u{105a2}'), + ('\u{105b2}', '\u{105b2}'), + ('\u{105ba}', '\u{105ba}'), + ('\u{105bd}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{1077f}'), + ('\u{10786}', '\u{10786}'), + ('\u{107b1}', '\u{107b1}'), + ('\u{107bb}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), + ('\u{10a36}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a49}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d28}', '\u{10d2f}'), + ('\u{10d3a}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10e7f}'), + ('\u{10eaa}', '\u{10eaa}'), + ('\u{10eae}', '\u{10eaf}'), + ('\u{10eb2}', '\u{10efc}'), + ('\u{10f28}', '\u{10f2f}'), + ('\u{10f5a}', '\u{10f6f}'), + ('\u{10f8a}', '\u{10faf}'), + ('\u{10fcc}', '\u{10fdf}'), + ('\u{10ff7}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), + ('\u{11076}', '\u{1107e}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110c3}', '\u{110cf}'), + ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), + ('\u{11135}', '\u{11135}'), + ('\u{11148}', '\u{1114f}'), + ('\u{11177}', '\u{1117f}'), + ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), + ('\u{11212}', '\u{11212}'), + ('\u{11242}', '\u{1127f}'), + ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), + ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), + ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), + ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), + ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), + ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), + ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133a}'), + ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), + ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), + ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), + ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), + ('\u{1145c}', '\u{1145c}'), + ('\u{11462}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), + ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), + ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), + ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), + ('\u{116ba}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), + ('\u{1171b}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), + ('\u{11747}', '\u{117ff}'), + ('\u{1183c}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), + ('\u{11907}', '\u{11908}'), + ('\u{1190a}', '\u{1190b}'), + ('\u{11914}', '\u{11914}'), + ('\u{11917}', '\u{11917}'), + ('\u{11936}', '\u{11936}'), + ('\u{11939}', '\u{1193a}'), + ('\u{11947}', '\u{1194f}'), + ('\u{1195a}', '\u{1199f}'), + ('\u{119a8}', '\u{119a9}'), + ('\u{119d8}', '\u{119d9}'), + ('\u{119e5}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), + ('\u{11aa3}', '\u{11aaf}'), + ('\u{11af9}', '\u{11aff}'), + ('\u{11b0a}', '\u{11bff}'), + ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), + ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), + ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), + ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), + ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), + ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), + ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11d5f}'), + ('\u{11d66}', '\u{11d66}'), + ('\u{11d69}', '\u{11d69}'), + ('\u{11d8f}', '\u{11d8f}'), + ('\u{11d92}', '\u{11d92}'), + ('\u{11d99}', '\u{11d9f}'), + ('\u{11daa}', '\u{11edf}'), + ('\u{11ef9}', '\u{11eff}'), + ('\u{11f11}', '\u{11f11}'), + ('\u{11f3b}', '\u{11f3d}'), + ('\u{11f5a}', '\u{11faf}'), + ('\u{11fb1}', '\u{11fbf}'), + ('\u{11ff2}', '\u{11ffe}'), + ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), + ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12f8f}'), + ('\u{12ff3}', '\u{12fff}'), + ('\u{13430}', '\u{1343f}'), + ('\u{13456}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), + ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), + ('\u{16a6a}', '\u{16a6d}'), + ('\u{16abf}', '\u{16abf}'), + ('\u{16aca}', '\u{16acf}'), + ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), + ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), + ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), + ('\u{16b90}', '\u{16e3f}'), + ('\u{16e9b}', '\u{16eff}'), + ('\u{16f4b}', '\u{16f4e}'), + ('\u{16f88}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), + ('\u{16fe5}', '\u{16fef}'), + ('\u{16ff2}', '\u{16fff}'), + ('\u{187f8}', '\u{187ff}'), + ('\u{18cd6}', '\u{18cff}'), + ('\u{18d09}', '\u{1afef}'), + ('\u{1aff4}', '\u{1aff4}'), + ('\u{1affc}', '\u{1affc}'), + ('\u{1afff}', '\u{1afff}'), + ('\u{1b123}', '\u{1b131}'), + ('\u{1b133}', '\u{1b14f}'), + ('\u{1b153}', '\u{1b154}'), + ('\u{1b156}', '\u{1b163}'), + ('\u{1b168}', '\u{1b16f}'), + ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), + ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), + ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca0}', '\u{1ceff}'), + ('\u{1cf2e}', '\u{1cf2f}'), + ('\u{1cf47}', '\u{1cf4f}'), + ('\u{1cfc4}', '\u{1cfff}'), + ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{1d1eb}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2bf}'), + ('\u{1d2d4}', '\u{1d2df}'), + ('\u{1d2f4}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), + ('\u{1d379}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1deff}'), + ('\u{1df1f}', '\u{1df24}'), + ('\u{1df2b}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e02f}'), + ('\u{1e06e}', '\u{1e08e}'), + ('\u{1e090}', '\u{1e0ff}'), + ('\u{1e12d}', '\u{1e12f}'), + ('\u{1e13e}', '\u{1e13f}'), + ('\u{1e14a}', '\u{1e14d}'), + ('\u{1e150}', '\u{1e28f}'), + ('\u{1e2af}', '\u{1e2bf}'), + ('\u{1e2fa}', '\u{1e2fe}'), + ('\u{1e300}', '\u{1e4cf}'), + ('\u{1e4fa}', '\u{1e7df}'), + ('\u{1e7e7}', '\u{1e7e7}'), + ('\u{1e7ec}', '\u{1e7ec}'), + ('\u{1e7ef}', '\u{1e7ef}'), + ('\u{1e7ff}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94c}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1ec70}'), + ('\u{1ecb5}', '\u{1ed00}'), + ('\u{1ed3e}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f1ae}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d8}', '\u{1f6db}'), + ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6fd}', '\u{1f6ff}'), + ('\u{1f777}', '\u{1f77a}'), + ('\u{1f7da}', '\u{1f7df}'), + ('\u{1f7ec}', '\u{1f7ef}'), + ('\u{1f7f1}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8af}'), + ('\u{1f8b2}', '\u{1f8ff}'), + ('\u{1fa54}', '\u{1fa5f}'), + ('\u{1fa6e}', '\u{1fa6f}'), + ('\u{1fa7d}', '\u{1fa7f}'), + ('\u{1fa89}', '\u{1fa8f}'), + ('\u{1fabe}', '\u{1fabe}'), + ('\u{1fac6}', '\u{1facd}'), + ('\u{1fadc}', '\u{1fadf}'), + ('\u{1fae9}', '\u{1faef}'), + ('\u{1faf9}', '\u{1faff}'), + ('\u{1fb93}', '\u{1fb93}'), + ('\u{1fbcb}', '\u{1fbef}'), + ('\u{1fbfa}', '\u{1ffff}'), + ('\u{2a6e0}', '\u{2a6ff}'), + ('\u{2b73a}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{2ffff}'), + ('\u{3134b}', '\u{3134f}'), + ('\u{323b0}', '\u{e00ff}'), + ('\u{e01f0}', '\u{10ffff}'), +]; + +pub const OTHER_LETTER: &'static [(char, char)] = &[ + ('ª', 'ª'), + ('º', 'º'), + ('ƻ', 'ƻ'), + ('ǀ', 'ǃ'), + ('ʔ', 'ʔ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ؿ'), + ('ف', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ࠀ', 'ࠕ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣈ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॲ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๅ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('ᄀ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛱ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡂ'), + ('ᡄ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱷ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ℵ', 'ℸ'), + ('ⴰ', 'ⵧ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('〆', '〆'), + ('〼', '〼'), + ('ぁ', 'ゖ'), + ('ゟ', 'ゟ'), + ('ァ', 'ヺ'), + ('ヿ', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꀔ'), + ('ꀖ', 'ꒌ'), + ('ꓐ', 'ꓷ'), + ('ꔀ', 'ꘋ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('ꙮ', 'ꙮ'), + ('ꚠ', 'ꛥ'), + ('ꞏ', 'ꞏ'), + ('ꟷ', 'ꟷ'), + ('ꟻ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧠ', 'ꧤ'), + ('ꧧ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩯ'), + ('ꩱ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫜ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫲ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꯀ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('ヲ', 'ッ'), + ('ア', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍀'), + ('𐍂', '𐍉'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐑐', '𐒝'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝼊', '𝼊'), + ('𞄀', '𞄬'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓪'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const OTHER_NUMBER: &'static [(char, char)] = &[ + ('²', '³'), + ('¹', '¹'), + ('¼', '¾'), + ('৴', '৹'), + ('୲', '୷'), + ('௰', '௲'), + ('౸', '౾'), + ('൘', '൞'), + ('൰', '൸'), + ('༪', '༳'), + ('፩', '፼'), + ('៰', '៹'), + ('᧚', '᧚'), + ('⁰', '⁰'), + ('⁴', '⁹'), + ('₀', '₉'), + ('⅐', '⅟'), + ('↉', '↉'), + ('①', '⒛'), + ('⓪', '⓿'), + ('❶', '➓'), + ('⳽', '⳽'), + ('㆒', '㆕'), + ('㈠', '㈩'), + ('㉈', '㉏'), + ('㉑', '㉟'), + ('㊀', '㊉'), + ('㊱', '㊿'), + ('꠰', '꠵'), + ('𐄇', '𐄳'), + ('𐅵', '𐅸'), + ('𐆊', '𐆋'), + ('𐋡', '𐋻'), + ('𐌠', '𐌣'), + ('𐡘', '𐡟'), + ('𐡹', '𐡿'), + ('𐢧', '𐢯'), + ('𐣻', '𐣿'), + ('𐤖', '𐤛'), + ('𐦼', '𐦽'), + ('𐧀', '𐧏'), + ('𐧒', '𐧿'), + ('𐩀', '𐩈'), + ('𐩽', '𐩾'), + ('𐪝', '𐪟'), + ('𐫫', '𐫯'), + ('𐭘', '𐭟'), + ('𐭸', '𐭿'), + ('𐮩', '𐮯'), + ('𐳺', '𐳿'), + ('𐹠', '𐹾'), + ('𐼝', '𐼦'), + ('𐽑', '𐽔'), + ('𐿅', '𐿋'), + ('𑁒', '𑁥'), + ('𑇡', '𑇴'), + ('𑜺', '𑜻'), + ('𑣪', '𑣲'), + ('𑱚', '𑱬'), + ('𑿀', '𑿔'), + ('𖭛', '𖭡'), + ('𖺀', '𖺖'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝍠', '𝍸'), + ('𞣇', '𞣏'), + ('𞱱', '𞲫'), + ('𞲭', '𞲯'), + ('𞲱', '𞲴'), + ('𞴁', '𞴭'), + ('𞴯', '𞴽'), + ('🄀', '🄌'), +]; + +pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), + ('%', '\''), + ('*', '*'), + (',', ','), + ('.', '/'), + (':', ';'), + ('?', '@'), + ('\\', '\\'), + ('¡', '¡'), + ('§', '§'), + ('¶', '·'), + ('¿', '¿'), + (';', ';'), + ('·', '·'), + ('՚', '՟'), + ('։', '։'), + ('׀', '׀'), + ('׃', '׃'), + ('׆', '׆'), + ('׳', '״'), + ('؉', '؊'), + ('،', '؍'), + ('؛', '؛'), + ('؝', '؟'), + ('٪', '٭'), + ('۔', '۔'), + ('܀', '܍'), + ('߷', '߹'), + ('࠰', '࠾'), + ('࡞', '࡞'), + ('।', '॥'), + ('॰', '॰'), + ('৽', '৽'), + ('੶', '੶'), + ('૰', '૰'), + ('౷', '౷'), + ('಄', '಄'), + ('෴', '෴'), + ('๏', '๏'), + ('๚', '๛'), + ('༄', '༒'), + ('༔', '༔'), + ('྅', '྅'), + ('࿐', '࿔'), + ('࿙', '࿚'), + ('၊', '၏'), + ('჻', '჻'), + ('፠', '፨'), + ('᙮', '᙮'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('។', '៖'), + ('៘', '៚'), + ('᠀', '᠅'), + ('᠇', '᠊'), + ('᥄', '᥅'), + ('᨞', '᨟'), + ('᪠', '᪦'), + ('᪨', '᪭'), + ('᭚', '᭠'), + ('᭽', '᭾'), + ('᯼', '᯿'), + ('᰻', '᰿'), + ('᱾', '᱿'), + ('᳀', '᳇'), + ('᳓', '᳓'), + ('‖', '‗'), + ('†', '‧'), + ('‰', '‸'), + ('※', '‾'), + ('⁁', '⁃'), + ('⁇', '⁑'), + ('⁓', '⁓'), + ('⁕', '⁞'), + ('⳹', '⳼'), + ('⳾', '⳿'), + ('⵰', '⵰'), + ('⸀', '⸁'), + ('⸆', '⸈'), + ('⸋', '⸋'), + ('⸎', '⸖'), + ('⸘', '⸙'), + ('⸛', '⸛'), + ('⸞', '⸟'), + ('⸪', '⸮'), + ('⸰', '⸹'), + ('⸼', '⸿'), + ('⹁', '⹁'), + ('⹃', '⹏'), + ('⹒', '⹔'), + ('、', '〃'), + ('〽', '〽'), + ('・', '・'), + ('꓾', '꓿'), + ('꘍', '꘏'), + ('꙳', '꙳'), + ('꙾', '꙾'), + ('꛲', '꛷'), + ('꡴', '꡷'), + ('꣎', '꣏'), + ('꣸', '꣺'), + ('꣼', '꣼'), + ('꤮', '꤯'), + ('꥟', '꥟'), + ('꧁', '꧍'), + ('꧞', '꧟'), + ('꩜', '꩟'), + ('꫞', '꫟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('︐', '︖'), + ('︙', '︙'), + ('︰', '︰'), + ('﹅', '﹆'), + ('﹉', '﹌'), + ('﹐', '﹒'), + ('﹔', '﹗'), + ('﹟', '﹡'), + ('﹨', '﹨'), + ('﹪', '﹫'), + ('!', '#'), + ('%', '''), + ('*', '*'), + (',', ','), + ('.', '/'), + (':', ';'), + ('?', '@'), + ('\', '\'), + ('。', '。'), + ('、', '・'), + ('𐄀', '𐄂'), + ('𐎟', '𐎟'), + ('𐏐', '𐏐'), + ('𐕯', '𐕯'), + ('𐡗', '𐡗'), + ('𐤟', '𐤟'), + ('𐤿', '𐤿'), + ('𐩐', '𐩘'), + ('𐩿', '𐩿'), + ('𐫰', '𐫶'), + ('𐬹', '𐬿'), + ('𐮙', '𐮜'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁍'), + ('𑂻', '𑂼'), + ('𑂾', '𑃁'), + ('𑅀', '𑅃'), + ('𑅴', '𑅵'), + ('𑇅', '𑇈'), + ('𑇍', '𑇍'), + ('𑇛', '𑇛'), + ('𑇝', '𑇟'), + ('𑈸', '𑈽'), + ('𑊩', '𑊩'), + ('𑑋', '𑑏'), + ('𑑚', '𑑛'), + ('𑑝', '𑑝'), + ('𑓆', '𑓆'), + ('𑗁', '𑗗'), + ('𑙁', '𑙃'), + ('𑙠', '𑙬'), + ('𑚹', '𑚹'), + ('𑜼', '𑜾'), + ('𑠻', '𑠻'), + ('𑥄', '𑥆'), + ('𑧢', '𑧢'), + ('𑨿', '𑩆'), + ('𑪚', '𑪜'), + ('𑪞', '𑪢'), + ('𑬀', '𑬉'), + ('𑱁', '𑱅'), + ('𑱰', '𑱱'), + ('𑻷', '𑻸'), + ('𑽃', '𑽏'), + ('𑿿', '𑿿'), + ('𒑰', '𒑴'), + ('𒿱', '𒿲'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬻'), + ('𖭄', '𖭄'), + ('𖺗', '𖺚'), + ('𖿢', '𖿢'), + ('𛲟', '𛲟'), + ('𝪇', '𝪋'), + ('𞥞', '𞥟'), +]; + +pub const OTHER_SYMBOL: &'static [(char, char)] = &[ + ('¦', '¦'), + ('©', '©'), + ('®', '®'), + ('°', '°'), + ('҂', '҂'), + ('֍', '֎'), + ('؎', '؏'), + ('۞', '۞'), + ('۩', '۩'), + ('۽', '۾'), + ('߶', '߶'), + ('৺', '৺'), + ('୰', '୰'), + ('௳', '௸'), + ('௺', '௺'), + ('౿', '౿'), + ('൏', '൏'), + ('൹', '൹'), + ('༁', '༃'), + ('༓', '༓'), + ('༕', '༗'), + ('༚', '༟'), + ('༴', '༴'), + ('༶', '༶'), + ('༸', '༸'), + ('྾', '࿅'), + ('࿇', '࿌'), + ('࿎', '࿏'), + ('࿕', '࿘'), + ('႞', '႟'), + ('᎐', '᎙'), + ('᙭', '᙭'), + ('᥀', '᥀'), + ('᧞', '᧿'), + ('᭡', '᭪'), + ('᭴', '᭼'), + ('℀', '℁'), + ('℃', '℆'), + ('℈', '℉'), + ('℔', '℔'), + ('№', '℗'), + ('℞', '℣'), + ('℥', '℥'), + ('℧', '℧'), + ('℩', '℩'), + ('℮', '℮'), + ('℺', '℻'), + ('⅊', '⅊'), + ('⅌', '⅍'), + ('⅏', '⅏'), + ('↊', '↋'), + ('↕', '↙'), + ('↜', '↟'), + ('↡', '↢'), + ('↤', '↥'), + ('↧', '↭'), + ('↯', '⇍'), + ('⇐', '⇑'), + ('⇓', '⇓'), + ('⇕', '⇳'), + ('⌀', '⌇'), + ('⌌', '⌟'), + ('⌢', '⌨'), + ('⌫', '⍻'), + ('⍽', '⎚'), + ('⎴', '⏛'), + ('⏢', '␦'), + ('⑀', '⑊'), + ('⒜', 'ⓩ'), + ('─', '▶'), + ('▸', '◀'), + ('◂', '◷'), + ('☀', '♮'), + ('♰', '❧'), + ('➔', '➿'), + ('⠀', '⣿'), + ('⬀', '⬯'), + ('⭅', '⭆'), + ('⭍', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⳥', '⳪'), + ('⹐', '⹑'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('〄', '〄'), + ('〒', '〓'), + ('〠', '〠'), + ('〶', '〷'), + ('〾', '〿'), + ('㆐', '㆑'), + ('㆖', '㆟'), + ('㇀', '㇣'), + ('㈀', '㈞'), + ('㈪', '㉇'), + ('㉐', '㉐'), + ('㉠', '㉿'), + ('㊊', '㊰'), + ('㋀', '㏿'), + ('䷀', '䷿'), + ('꒐', '꓆'), + ('꠨', '꠫'), + ('꠶', '꠷'), + ('꠹', '꠹'), + ('꩷', '꩹'), + ('﵀', '﵏'), + ('﷏', '﷏'), + ('﷽', '﷿'), + ('¦', '¦'), + ('│', '│'), + ('■', '○'), + ('', '�'), + ('𐄷', '𐄿'), + ('𐅹', '𐆉'), + ('𐆌', '𐆎'), + ('𐆐', '𐆜'), + ('𐆠', '𐆠'), + ('𐇐', '𐇼'), + ('𐡷', '𐡸'), + ('𐫈', '𐫈'), + ('𑜿', '𑜿'), + ('𑿕', '𑿜'), + ('𑿡', '𑿱'), + ('𖬼', '𖬿'), + ('𖭅', '𖭅'), + ('𛲜', '𛲜'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅘𝅥𝅲'), + ('𝅪', '𝅬'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝈀', '𝉁'), + ('𝉅', '𝉅'), + ('𝌀', '𝍖'), + ('𝠀', '𝧿'), + ('𝨷', '𝨺'), + ('𝩭', '𝩴'), + ('𝩶', '𝪃'), + ('𝪅', '𝪆'), + ('𞅏', '𞅏'), + ('𞲬', '𞲬'), + ('𞴮', '𞴮'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄍', '🆭'), + ('🇦', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🏺'), + ('🐀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), +]; + +pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] = + &[('\u{2029}', '\u{2029}')]; + +pub const PRIVATE_USE: &'static [(char, char)] = &[ + ('\u{e000}', '\u{f8ff}'), + ('\u{f0000}', '\u{ffffd}'), + ('\u{100000}', '\u{10fffd}'), +]; + +pub const PUNCTUATION: &'static [(char, char)] = &[ + ('!', '#'), + ('%', '*'), + (',', '/'), + (':', ';'), + ('?', '@'), + ('[', ']'), + ('_', '_'), + ('{', '{'), + ('}', '}'), + ('¡', '¡'), + ('§', '§'), + ('«', '«'), + ('¶', '·'), + ('»', '»'), + ('¿', '¿'), + (';', ';'), + ('·', '·'), + ('՚', '՟'), + ('։', '֊'), + ('־', '־'), + ('׀', '׀'), + ('׃', '׃'), + ('׆', '׆'), + ('׳', '״'), + ('؉', '؊'), + ('،', '؍'), + ('؛', '؛'), + ('؝', '؟'), + ('٪', '٭'), + ('۔', '۔'), + ('܀', '܍'), + ('߷', '߹'), + ('࠰', '࠾'), + ('࡞', '࡞'), + ('।', '॥'), + ('॰', '॰'), + ('৽', '৽'), + ('੶', '੶'), + ('૰', '૰'), + ('౷', '౷'), + ('಄', '಄'), + ('෴', '෴'), + ('๏', '๏'), + ('๚', '๛'), + ('༄', '༒'), + ('༔', '༔'), + ('༺', '༽'), + ('྅', '྅'), + ('࿐', '࿔'), + ('࿙', '࿚'), + ('၊', '၏'), + ('჻', '჻'), + ('፠', '፨'), + ('᐀', '᐀'), + ('᙮', '᙮'), + ('᚛', '᚜'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('។', '៖'), + ('៘', '៚'), + ('᠀', '᠊'), + ('᥄', '᥅'), + ('᨞', '᨟'), + ('᪠', '᪦'), + ('᪨', '᪭'), + ('᭚', '᭠'), + ('᭽', '᭾'), + ('᯼', '᯿'), + ('᰻', '᰿'), + ('᱾', '᱿'), + ('᳀', '᳇'), + ('᳓', '᳓'), + ('‐', '‧'), + ('‰', '⁃'), + ('⁅', '⁑'), + ('⁓', '⁞'), + ('⁽', '⁾'), + ('₍', '₎'), + ('⌈', '⌋'), + ('〈', '〉'), + ('❨', '❵'), + ('⟅', '⟆'), + ('⟦', '⟯'), + ('⦃', '⦘'), + ('⧘', '⧛'), + ('⧼', '⧽'), + ('⳹', '⳼'), + ('⳾', '⳿'), + ('⵰', '⵰'), + ('⸀', '⸮'), + ('⸰', '⹏'), + ('⹒', '⹝'), + ('、', '〃'), + ('〈', '】'), + ('〔', '〟'), + ('〰', '〰'), + ('〽', '〽'), + ('゠', '゠'), + ('・', '・'), + ('꓾', '꓿'), + ('꘍', '꘏'), + ('꙳', '꙳'), + ('꙾', '꙾'), + ('꛲', '꛷'), + ('꡴', '꡷'), + ('꣎', '꣏'), + ('꣸', '꣺'), + ('꣼', '꣼'), + ('꤮', '꤯'), + ('꥟', '꥟'), + ('꧁', '꧍'), + ('꧞', '꧟'), + ('꩜', '꩟'), + ('꫞', '꫟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﴾', '﴿'), + ('︐', '︙'), + ('︰', '﹒'), + ('﹔', '﹡'), + ('﹣', '﹣'), + ('﹨', '﹨'), + ('﹪', '﹫'), + ('!', '#'), + ('%', '*'), + (',', '/'), + (':', ';'), + ('?', '@'), + ('[', ']'), + ('_', '_'), + ('{', '{'), + ('}', '}'), + ('⦅', '・'), + ('𐄀', '𐄂'), + ('𐎟', '𐎟'), + ('𐏐', '𐏐'), + ('𐕯', '𐕯'), + ('𐡗', '𐡗'), + ('𐤟', '𐤟'), + ('𐤿', '𐤿'), + ('𐩐', '𐩘'), + ('𐩿', '𐩿'), + ('𐫰', '𐫶'), + ('𐬹', '𐬿'), + ('𐮙', '𐮜'), + ('𐺭', '𐺭'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁍'), + ('𑂻', '𑂼'), + ('𑂾', '𑃁'), + ('𑅀', '𑅃'), + ('𑅴', '𑅵'), + ('𑇅', '𑇈'), + ('𑇍', '𑇍'), + ('𑇛', '𑇛'), + ('𑇝', '𑇟'), + ('𑈸', '𑈽'), + ('𑊩', '𑊩'), + ('𑑋', '𑑏'), + ('𑑚', '𑑛'), + ('𑑝', '𑑝'), + ('𑓆', '𑓆'), + ('𑗁', '𑗗'), + ('𑙁', '𑙃'), + ('𑙠', '𑙬'), + ('𑚹', '𑚹'), + ('𑜼', '𑜾'), + ('𑠻', '𑠻'), + ('𑥄', '𑥆'), + ('𑧢', '𑧢'), + ('𑨿', '𑩆'), + ('𑪚', '𑪜'), + ('𑪞', '𑪢'), + ('𑬀', '𑬉'), + ('𑱁', '𑱅'), + ('𑱰', '𑱱'), + ('𑻷', '𑻸'), + ('𑽃', '𑽏'), + ('𑿿', '𑿿'), + ('𒑰', '𒑴'), + ('𒿱', '𒿲'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬻'), + ('𖭄', '𖭄'), + ('𖺗', '𖺚'), + ('𖿢', '𖿢'), + ('𛲟', '𛲟'), + ('𝪇', '𝪋'), + ('𞥞', '𞥟'), +]; + +pub const SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACE_SEPARATOR: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const SPACING_MARK: &'static [(char, char)] = &[ + ('ः', 'ः'), + ('ऻ', 'ऻ'), + ('ा', 'ी'), + ('ॉ', 'ौ'), + ('ॎ', 'ॏ'), + ('ং', 'ঃ'), + ('\u{9be}', 'ী'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('\u{9d7}', '\u{9d7}'), + ('ਃ', 'ਃ'), + ('ਾ', 'ੀ'), + ('ઃ', 'ઃ'), + ('ા', 'ી'), + ('ૉ', 'ૉ'), + ('ો', 'ૌ'), + ('ଂ', 'ଃ'), + ('\u{b3e}', '\u{b3e}'), + ('ୀ', 'ୀ'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('\u{b57}', '\u{b57}'), + ('\u{bbe}', 'ி'), + ('ு', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('\u{bd7}', '\u{bd7}'), + ('ఁ', 'ః'), + ('ు', 'ౄ'), + ('ಂ', 'ಃ'), + ('ಾ', 'ಾ'), + ('ೀ', 'ೄ'), + ('ೇ', 'ೈ'), + ('ೊ', 'ೋ'), + ('\u{cd5}', '\u{cd6}'), + ('ೳ', 'ೳ'), + ('ം', 'ഃ'), + ('\u{d3e}', 'ീ'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('\u{d57}', '\u{d57}'), + ('ං', 'ඃ'), + ('\u{dcf}', 'ෑ'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('༾', '༿'), + ('ཿ', 'ཿ'), + ('ါ', 'ာ'), + ('ေ', 'ေ'), + ('း', 'း'), + ('ျ', 'ြ'), + ('ၖ', 'ၗ'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('ႃ', 'ႄ'), + ('ႇ', 'ႌ'), + ('ႏ', 'ႏ'), + ('ႚ', 'ႜ'), + ('᜕', '᜕'), + ('᜴', '᜴'), + ('ា', 'ា'), + ('ើ', 'ៅ'), + ('ះ', 'ៈ'), + ('ᤣ', 'ᤦ'), + ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), + ('ᤳ', 'ᤸ'), + ('ᨙ', 'ᨚ'), + ('ᩕ', 'ᩕ'), + ('ᩗ', 'ᩗ'), + ('ᩡ', 'ᩡ'), + ('ᩣ', 'ᩤ'), + ('ᩭ', 'ᩲ'), + ('ᬄ', 'ᬄ'), + ('\u{1b35}', '\u{1b35}'), + ('ᬻ', 'ᬻ'), + ('ᬽ', 'ᭁ'), + ('ᭃ', '᭄'), + ('ᮂ', 'ᮂ'), + ('ᮡ', 'ᮡ'), + ('ᮦ', 'ᮧ'), + ('᮪', '᮪'), + ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), + ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), + ('ᰤ', 'ᰫ'), + ('ᰴ', 'ᰵ'), + ('᳡', '᳡'), + ('᳷', '᳷'), + ('\u{302e}', '\u{302f}'), + ('ꠣ', 'ꠤ'), + ('ꠧ', 'ꠧ'), + ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), + ('ꥒ', '꥓'), + ('ꦃ', 'ꦃ'), + ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), + ('ꦾ', '꧀'), + ('ꨯ', 'ꨰ'), + ('ꨳ', 'ꨴ'), + ('ꩍ', 'ꩍ'), + ('ꩻ', 'ꩻ'), + ('ꩽ', 'ꩽ'), + ('ꫫ', 'ꫫ'), + ('ꫮ', 'ꫯ'), + ('ꫵ', 'ꫵ'), + ('ꯣ', 'ꯤ'), + ('ꯦ', 'ꯧ'), + ('ꯩ', 'ꯪ'), + ('꯬', '꯬'), + ('𑀀', '𑀀'), + ('𑀂', '𑀂'), + ('𑂂', '𑂂'), + ('𑂰', '𑂲'), + ('𑂷', '𑂸'), + ('𑄬', '𑄬'), + ('𑅅', '𑅆'), + ('𑆂', '𑆂'), + ('𑆳', '𑆵'), + ('𑆿', '𑇀'), + ('𑇎', '𑇎'), + ('𑈬', '𑈮'), + ('𑈲', '𑈳'), + ('𑈵', '𑈵'), + ('𑋠', '𑋢'), + ('𑌂', '𑌃'), + ('\u{1133e}', '𑌿'), + ('𑍁', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('𑐵', '𑐷'), + ('𑑀', '𑑁'), + ('𑑅', '𑑅'), + ('\u{114b0}', '𑒲'), + ('𑒹', '𑒹'), + ('𑒻', '𑒾'), + ('𑓁', '𑓁'), + ('\u{115af}', '𑖱'), + ('𑖸', '𑖻'), + ('𑖾', '𑖾'), + ('𑘰', '𑘲'), + ('𑘻', '𑘼'), + ('𑘾', '𑘾'), + ('𑚬', '𑚬'), + ('𑚮', '𑚯'), + ('𑚶', '𑚶'), + ('𑜠', '𑜡'), + ('𑜦', '𑜦'), + ('𑠬', '𑠮'), + ('𑠸', '𑠸'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('𑤽', '𑤽'), + ('𑥀', '𑥀'), + ('𑥂', '𑥂'), + ('𑧑', '𑧓'), + ('𑧜', '𑧟'), + ('𑧤', '𑧤'), + ('𑨹', '𑨹'), + ('𑩗', '𑩘'), + ('𑪗', '𑪗'), + ('𑰯', '𑰯'), + ('𑰾', '𑰾'), + ('𑲩', '𑲩'), + ('𑲱', '𑲱'), + ('𑲴', '𑲴'), + ('𑶊', '𑶎'), + ('𑶓', '𑶔'), + ('𑶖', '𑶖'), + ('𑻵', '𑻶'), + ('𑼃', '𑼃'), + ('𑼴', '𑼵'), + ('𑼾', '𑼿'), + ('𑽁', '𑽁'), + ('𖽑', '𖾇'), + ('𖿰', '𖿱'), + ('\u{1d165}', '𝅦'), + ('𝅭', '\u{1d172}'), +]; + +pub const SYMBOL: &'static [(char, char)] = &[ + ('$', '$'), + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('`', '`'), + ('|', '|'), + ('~', '~'), + ('¢', '¦'), + ('¨', '©'), + ('¬', '¬'), + ('®', '±'), + ('´', '´'), + ('¸', '¸'), + ('×', '×'), + ('÷', '÷'), + ('˂', '˅'), + ('˒', '˟'), + ('˥', '˫'), + ('˭', '˭'), + ('˯', '˿'), + ('͵', '͵'), + ('΄', '΅'), + ('϶', '϶'), + ('҂', '҂'), + ('֍', '֏'), + ('؆', '؈'), + ('؋', '؋'), + ('؎', '؏'), + ('۞', '۞'), + ('۩', '۩'), + ('۽', '۾'), + ('߶', '߶'), + ('߾', '߿'), + ('࢈', '࢈'), + ('৲', '৳'), + ('৺', '৻'), + ('૱', '૱'), + ('୰', '୰'), + ('௳', '௺'), + ('౿', '౿'), + ('൏', '൏'), + ('൹', '൹'), + ('฿', '฿'), + ('༁', '༃'), + ('༓', '༓'), + ('༕', '༗'), + ('༚', '༟'), + ('༴', '༴'), + ('༶', '༶'), + ('༸', '༸'), + ('྾', '࿅'), + ('࿇', '࿌'), + ('࿎', '࿏'), + ('࿕', '࿘'), + ('႞', '႟'), + ('᎐', '᎙'), + ('᙭', '᙭'), + ('៛', '៛'), + ('᥀', '᥀'), + ('᧞', '᧿'), + ('᭡', '᭪'), + ('᭴', '᭼'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('⁄', '⁄'), + ('⁒', '⁒'), + ('⁺', '⁼'), + ('₊', '₌'), + ('₠', '⃀'), + ('℀', '℁'), + ('℃', '℆'), + ('℈', '℉'), + ('℔', '℔'), + ('№', '℘'), + ('℞', '℣'), + ('℥', '℥'), + ('℧', '℧'), + ('℩', '℩'), + ('℮', '℮'), + ('℺', '℻'), + ('⅀', '⅄'), + ('⅊', '⅍'), + ('⅏', '⅏'), + ('↊', '↋'), + ('←', '⌇'), + ('⌌', '⌨'), + ('⌫', '␦'), + ('⑀', '⑊'), + ('⒜', 'ⓩ'), + ('─', '❧'), + ('➔', '⟄'), + ('⟇', '⟥'), + ('⟰', '⦂'), + ('⦙', '⧗'), + ('⧜', '⧻'), + ('⧾', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⳥', '⳪'), + ('⹐', '⹑'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('〄', '〄'), + ('〒', '〓'), + ('〠', '〠'), + ('〶', '〷'), + ('〾', '〿'), + ('゛', '゜'), + ('㆐', '㆑'), + ('㆖', '㆟'), + ('㇀', '㇣'), + ('㈀', '㈞'), + ('㈪', '㉇'), + ('㉐', '㉐'), + ('㉠', '㉿'), + ('㊊', '㊰'), + ('㋀', '㏿'), + ('䷀', '䷿'), + ('꒐', '꓆'), + ('꜀', '꜖'), + ('꜠', '꜡'), + ('꞉', '꞊'), + ('꠨', '꠫'), + ('꠶', '꠹'), + ('꩷', '꩹'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('﬩', '﬩'), + ('﮲', '﯂'), + ('﵀', '﵏'), + ('﷏', '﷏'), + ('﷼', '﷿'), + ('﹢', '﹢'), + ('﹤', '﹦'), + ('﹩', '﹩'), + ('$', '$'), + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('`', '`'), + ('|', '|'), + ('~', '~'), + ('¢', '₩'), + ('│', '○'), + ('', '�'), + ('𐄷', '𐄿'), + ('𐅹', '𐆉'), + ('𐆌', '𐆎'), + ('𐆐', '𐆜'), + ('𐆠', '𐆠'), + ('𐇐', '𐇼'), + ('𐡷', '𐡸'), + ('𐫈', '𐫈'), + ('𑜿', '𑜿'), + ('𑿕', '𑿱'), + ('𖬼', '𖬿'), + ('𖭅', '𖭅'), + ('𛲜', '𛲜'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅘𝅥𝅲'), + ('𝅪', '𝅬'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝈀', '𝉁'), + ('𝉅', '𝉅'), + ('𝌀', '𝍖'), + ('𝛁', '𝛁'), + ('𝛛', '𝛛'), + ('𝛻', '𝛻'), + ('𝜕', '𝜕'), + ('𝜵', '𝜵'), + ('𝝏', '𝝏'), + ('𝝯', '𝝯'), + ('𝞉', '𝞉'), + ('𝞩', '𝞩'), + ('𝟃', '𝟃'), + ('𝠀', '𝧿'), + ('𝨷', '𝨺'), + ('𝩭', '𝩴'), + ('𝩶', '𝪃'), + ('𝪅', '𝪆'), + ('𞅏', '𞅏'), + ('𞋿', '𞋿'), + ('𞲬', '𞲬'), + ('𞲰', '𞲰'), + ('𞴮', '𞴮'), + ('𞻰', '𞻱'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄍', '🆭'), + ('🇦', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), +]; + +pub const TITLECASE_LETTER: &'static [(char, char)] = &[ + ('Dž', 'Dž'), + ('Lj', 'Lj'), + ('Nj', 'Nj'), + ('Dz', 'Dz'), + ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), + ('ᾨ', 'ᾯ'), + ('ᾼ', 'ᾼ'), + ('ῌ', 'ῌ'), + ('ῼ', 'ῼ'), +]; + +pub const UNASSIGNED: &'static [(char, char)] = &[ + ('\u{378}', '\u{379}'), + ('\u{380}', '\u{383}'), + ('\u{38b}', '\u{38b}'), + ('\u{38d}', '\u{38d}'), + ('\u{3a2}', '\u{3a2}'), + ('\u{530}', '\u{530}'), + ('\u{557}', '\u{558}'), + ('\u{58b}', '\u{58c}'), + ('\u{590}', '\u{590}'), + ('\u{5c8}', '\u{5cf}'), + ('\u{5eb}', '\u{5ee}'), + ('\u{5f5}', '\u{5ff}'), + ('\u{70e}', '\u{70e}'), + ('\u{74b}', '\u{74c}'), + ('\u{7b2}', '\u{7bf}'), + ('\u{7fb}', '\u{7fc}'), + ('\u{82e}', '\u{82f}'), + ('\u{83f}', '\u{83f}'), + ('\u{85c}', '\u{85d}'), + ('\u{85f}', '\u{85f}'), + ('\u{86b}', '\u{86f}'), + ('\u{88f}', '\u{88f}'), + ('\u{892}', '\u{897}'), + ('\u{984}', '\u{984}'), + ('\u{98d}', '\u{98e}'), + ('\u{991}', '\u{992}'), + ('\u{9a9}', '\u{9a9}'), + ('\u{9b1}', '\u{9b1}'), + ('\u{9b3}', '\u{9b5}'), + ('\u{9ba}', '\u{9bb}'), + ('\u{9c5}', '\u{9c6}'), + ('\u{9c9}', '\u{9ca}'), + ('\u{9cf}', '\u{9d6}'), + ('\u{9d8}', '\u{9db}'), + ('\u{9de}', '\u{9de}'), + ('\u{9e4}', '\u{9e5}'), + ('\u{9ff}', '\u{a00}'), + ('\u{a04}', '\u{a04}'), + ('\u{a0b}', '\u{a0e}'), + ('\u{a11}', '\u{a12}'), + ('\u{a29}', '\u{a29}'), + ('\u{a31}', '\u{a31}'), + ('\u{a34}', '\u{a34}'), + ('\u{a37}', '\u{a37}'), + ('\u{a3a}', '\u{a3b}'), + ('\u{a3d}', '\u{a3d}'), + ('\u{a43}', '\u{a46}'), + ('\u{a49}', '\u{a4a}'), + ('\u{a4e}', '\u{a50}'), + ('\u{a52}', '\u{a58}'), + ('\u{a5d}', '\u{a5d}'), + ('\u{a5f}', '\u{a65}'), + ('\u{a77}', '\u{a80}'), + ('\u{a84}', '\u{a84}'), + ('\u{a8e}', '\u{a8e}'), + ('\u{a92}', '\u{a92}'), + ('\u{aa9}', '\u{aa9}'), + ('\u{ab1}', '\u{ab1}'), + ('\u{ab4}', '\u{ab4}'), + ('\u{aba}', '\u{abb}'), + ('\u{ac6}', '\u{ac6}'), + ('\u{aca}', '\u{aca}'), + ('\u{ace}', '\u{acf}'), + ('\u{ad1}', '\u{adf}'), + ('\u{ae4}', '\u{ae5}'), + ('\u{af2}', '\u{af8}'), + ('\u{b00}', '\u{b00}'), + ('\u{b04}', '\u{b04}'), + ('\u{b0d}', '\u{b0e}'), + ('\u{b11}', '\u{b12}'), + ('\u{b29}', '\u{b29}'), + ('\u{b31}', '\u{b31}'), + ('\u{b34}', '\u{b34}'), + ('\u{b3a}', '\u{b3b}'), + ('\u{b45}', '\u{b46}'), + ('\u{b49}', '\u{b4a}'), + ('\u{b4e}', '\u{b54}'), + ('\u{b58}', '\u{b5b}'), + ('\u{b5e}', '\u{b5e}'), + ('\u{b64}', '\u{b65}'), + ('\u{b78}', '\u{b81}'), + ('\u{b84}', '\u{b84}'), + ('\u{b8b}', '\u{b8d}'), + ('\u{b91}', '\u{b91}'), + ('\u{b96}', '\u{b98}'), + ('\u{b9b}', '\u{b9b}'), + ('\u{b9d}', '\u{b9d}'), + ('\u{ba0}', '\u{ba2}'), + ('\u{ba5}', '\u{ba7}'), + ('\u{bab}', '\u{bad}'), + ('\u{bba}', '\u{bbd}'), + ('\u{bc3}', '\u{bc5}'), + ('\u{bc9}', '\u{bc9}'), + ('\u{bce}', '\u{bcf}'), + ('\u{bd1}', '\u{bd6}'), + ('\u{bd8}', '\u{be5}'), + ('\u{bfb}', '\u{bff}'), + ('\u{c0d}', '\u{c0d}'), + ('\u{c11}', '\u{c11}'), + ('\u{c29}', '\u{c29}'), + ('\u{c3a}', '\u{c3b}'), + ('\u{c45}', '\u{c45}'), + ('\u{c49}', '\u{c49}'), + ('\u{c4e}', '\u{c54}'), + ('\u{c57}', '\u{c57}'), + ('\u{c5b}', '\u{c5c}'), + ('\u{c5e}', '\u{c5f}'), + ('\u{c64}', '\u{c65}'), + ('\u{c70}', '\u{c76}'), + ('\u{c8d}', '\u{c8d}'), + ('\u{c91}', '\u{c91}'), + ('\u{ca9}', '\u{ca9}'), + ('\u{cb4}', '\u{cb4}'), + ('\u{cba}', '\u{cbb}'), + ('\u{cc5}', '\u{cc5}'), + ('\u{cc9}', '\u{cc9}'), + ('\u{cce}', '\u{cd4}'), + ('\u{cd7}', '\u{cdc}'), + ('\u{cdf}', '\u{cdf}'), + ('\u{ce4}', '\u{ce5}'), + ('\u{cf0}', '\u{cf0}'), + ('\u{cf4}', '\u{cff}'), + ('\u{d0d}', '\u{d0d}'), + ('\u{d11}', '\u{d11}'), + ('\u{d45}', '\u{d45}'), + ('\u{d49}', '\u{d49}'), + ('\u{d50}', '\u{d53}'), + ('\u{d64}', '\u{d65}'), + ('\u{d80}', '\u{d80}'), + ('\u{d84}', '\u{d84}'), + ('\u{d97}', '\u{d99}'), + ('\u{db2}', '\u{db2}'), + ('\u{dbc}', '\u{dbc}'), + ('\u{dbe}', '\u{dbf}'), + ('\u{dc7}', '\u{dc9}'), + ('\u{dcb}', '\u{dce}'), + ('\u{dd5}', '\u{dd5}'), + ('\u{dd7}', '\u{dd7}'), + ('\u{de0}', '\u{de5}'), + ('\u{df0}', '\u{df1}'), + ('\u{df5}', '\u{e00}'), + ('\u{e3b}', '\u{e3e}'), + ('\u{e5c}', '\u{e80}'), + ('\u{e83}', '\u{e83}'), + ('\u{e85}', '\u{e85}'), + ('\u{e8b}', '\u{e8b}'), + ('\u{ea4}', '\u{ea4}'), + ('\u{ea6}', '\u{ea6}'), + ('\u{ebe}', '\u{ebf}'), + ('\u{ec5}', '\u{ec5}'), + ('\u{ec7}', '\u{ec7}'), + ('\u{ecf}', '\u{ecf}'), + ('\u{eda}', '\u{edb}'), + ('\u{ee0}', '\u{eff}'), + ('\u{f48}', '\u{f48}'), + ('\u{f6d}', '\u{f70}'), + ('\u{f98}', '\u{f98}'), + ('\u{fbd}', '\u{fbd}'), + ('\u{fcd}', '\u{fcd}'), + ('\u{fdb}', '\u{fff}'), + ('\u{10c6}', '\u{10c6}'), + ('\u{10c8}', '\u{10cc}'), + ('\u{10ce}', '\u{10cf}'), + ('\u{1249}', '\u{1249}'), + ('\u{124e}', '\u{124f}'), + ('\u{1257}', '\u{1257}'), + ('\u{1259}', '\u{1259}'), + ('\u{125e}', '\u{125f}'), + ('\u{1289}', '\u{1289}'), + ('\u{128e}', '\u{128f}'), + ('\u{12b1}', '\u{12b1}'), + ('\u{12b6}', '\u{12b7}'), + ('\u{12bf}', '\u{12bf}'), + ('\u{12c1}', '\u{12c1}'), + ('\u{12c6}', '\u{12c7}'), + ('\u{12d7}', '\u{12d7}'), + ('\u{1311}', '\u{1311}'), + ('\u{1316}', '\u{1317}'), + ('\u{135b}', '\u{135c}'), + ('\u{137d}', '\u{137f}'), + ('\u{139a}', '\u{139f}'), + ('\u{13f6}', '\u{13f7}'), + ('\u{13fe}', '\u{13ff}'), + ('\u{169d}', '\u{169f}'), + ('\u{16f9}', '\u{16ff}'), + ('\u{1716}', '\u{171e}'), + ('\u{1737}', '\u{173f}'), + ('\u{1754}', '\u{175f}'), + ('\u{176d}', '\u{176d}'), + ('\u{1771}', '\u{1771}'), + ('\u{1774}', '\u{177f}'), + ('\u{17de}', '\u{17df}'), + ('\u{17ea}', '\u{17ef}'), + ('\u{17fa}', '\u{17ff}'), + ('\u{181a}', '\u{181f}'), + ('\u{1879}', '\u{187f}'), + ('\u{18ab}', '\u{18af}'), + ('\u{18f6}', '\u{18ff}'), + ('\u{191f}', '\u{191f}'), + ('\u{192c}', '\u{192f}'), + ('\u{193c}', '\u{193f}'), + ('\u{1941}', '\u{1943}'), + ('\u{196e}', '\u{196f}'), + ('\u{1975}', '\u{197f}'), + ('\u{19ac}', '\u{19af}'), + ('\u{19ca}', '\u{19cf}'), + ('\u{19db}', '\u{19dd}'), + ('\u{1a1c}', '\u{1a1d}'), + ('\u{1a5f}', '\u{1a5f}'), + ('\u{1a7d}', '\u{1a7e}'), + ('\u{1a8a}', '\u{1a8f}'), + ('\u{1a9a}', '\u{1a9f}'), + ('\u{1aae}', '\u{1aaf}'), + ('\u{1acf}', '\u{1aff}'), + ('\u{1b4d}', '\u{1b4f}'), + ('\u{1b7f}', '\u{1b7f}'), + ('\u{1bf4}', '\u{1bfb}'), + ('\u{1c38}', '\u{1c3a}'), + ('\u{1c4a}', '\u{1c4c}'), + ('\u{1c89}', '\u{1c8f}'), + ('\u{1cbb}', '\u{1cbc}'), + ('\u{1cc8}', '\u{1ccf}'), + ('\u{1cfb}', '\u{1cff}'), + ('\u{1f16}', '\u{1f17}'), + ('\u{1f1e}', '\u{1f1f}'), + ('\u{1f46}', '\u{1f47}'), + ('\u{1f4e}', '\u{1f4f}'), + ('\u{1f58}', '\u{1f58}'), + ('\u{1f5a}', '\u{1f5a}'), + ('\u{1f5c}', '\u{1f5c}'), + ('\u{1f5e}', '\u{1f5e}'), + ('\u{1f7e}', '\u{1f7f}'), + ('\u{1fb5}', '\u{1fb5}'), + ('\u{1fc5}', '\u{1fc5}'), + ('\u{1fd4}', '\u{1fd5}'), + ('\u{1fdc}', '\u{1fdc}'), + ('\u{1ff0}', '\u{1ff1}'), + ('\u{1ff5}', '\u{1ff5}'), + ('\u{1fff}', '\u{1fff}'), + ('\u{2065}', '\u{2065}'), + ('\u{2072}', '\u{2073}'), + ('\u{208f}', '\u{208f}'), + ('\u{209d}', '\u{209f}'), + ('\u{20c1}', '\u{20cf}'), + ('\u{20f1}', '\u{20ff}'), + ('\u{218c}', '\u{218f}'), + ('\u{2427}', '\u{243f}'), + ('\u{244b}', '\u{245f}'), + ('\u{2b74}', '\u{2b75}'), + ('\u{2b96}', '\u{2b96}'), + ('\u{2cf4}', '\u{2cf8}'), + ('\u{2d26}', '\u{2d26}'), + ('\u{2d28}', '\u{2d2c}'), + ('\u{2d2e}', '\u{2d2f}'), + ('\u{2d68}', '\u{2d6e}'), + ('\u{2d71}', '\u{2d7e}'), + ('\u{2d97}', '\u{2d9f}'), + ('\u{2da7}', '\u{2da7}'), + ('\u{2daf}', '\u{2daf}'), + ('\u{2db7}', '\u{2db7}'), + ('\u{2dbf}', '\u{2dbf}'), + ('\u{2dc7}', '\u{2dc7}'), + ('\u{2dcf}', '\u{2dcf}'), + ('\u{2dd7}', '\u{2dd7}'), + ('\u{2ddf}', '\u{2ddf}'), + ('\u{2e5e}', '\u{2e7f}'), + ('\u{2e9a}', '\u{2e9a}'), + ('\u{2ef4}', '\u{2eff}'), + ('\u{2fd6}', '\u{2fef}'), + ('\u{2ffc}', '\u{2fff}'), + ('\u{3040}', '\u{3040}'), + ('\u{3097}', '\u{3098}'), + ('\u{3100}', '\u{3104}'), + ('\u{3130}', '\u{3130}'), + ('\u{318f}', '\u{318f}'), + ('\u{31e4}', '\u{31ef}'), + ('\u{321f}', '\u{321f}'), + ('\u{a48d}', '\u{a48f}'), + ('\u{a4c7}', '\u{a4cf}'), + ('\u{a62c}', '\u{a63f}'), + ('\u{a6f8}', '\u{a6ff}'), + ('\u{a7cb}', '\u{a7cf}'), + ('\u{a7d2}', '\u{a7d2}'), + ('\u{a7d4}', '\u{a7d4}'), + ('\u{a7da}', '\u{a7f1}'), + ('\u{a82d}', '\u{a82f}'), + ('\u{a83a}', '\u{a83f}'), + ('\u{a878}', '\u{a87f}'), + ('\u{a8c6}', '\u{a8cd}'), + ('\u{a8da}', '\u{a8df}'), + ('\u{a954}', '\u{a95e}'), + ('\u{a97d}', '\u{a97f}'), + ('\u{a9ce}', '\u{a9ce}'), + ('\u{a9da}', '\u{a9dd}'), + ('\u{a9ff}', '\u{a9ff}'), + ('\u{aa37}', '\u{aa3f}'), + ('\u{aa4e}', '\u{aa4f}'), + ('\u{aa5a}', '\u{aa5b}'), + ('\u{aac3}', '\u{aada}'), + ('\u{aaf7}', '\u{ab00}'), + ('\u{ab07}', '\u{ab08}'), + ('\u{ab0f}', '\u{ab10}'), + ('\u{ab17}', '\u{ab1f}'), + ('\u{ab27}', '\u{ab27}'), + ('\u{ab2f}', '\u{ab2f}'), + ('\u{ab6c}', '\u{ab6f}'), + ('\u{abee}', '\u{abef}'), + ('\u{abfa}', '\u{abff}'), + ('\u{d7a4}', '\u{d7af}'), + ('\u{d7c7}', '\u{d7ca}'), + ('\u{d7fc}', '\u{d7ff}'), + ('\u{fa6e}', '\u{fa6f}'), + ('\u{fada}', '\u{faff}'), + ('\u{fb07}', '\u{fb12}'), + ('\u{fb18}', '\u{fb1c}'), + ('\u{fb37}', '\u{fb37}'), + ('\u{fb3d}', '\u{fb3d}'), + ('\u{fb3f}', '\u{fb3f}'), + ('\u{fb42}', '\u{fb42}'), + ('\u{fb45}', '\u{fb45}'), + ('\u{fbc3}', '\u{fbd2}'), + ('\u{fd90}', '\u{fd91}'), + ('\u{fdc8}', '\u{fdce}'), + ('\u{fdd0}', '\u{fdef}'), + ('\u{fe1a}', '\u{fe1f}'), + ('\u{fe53}', '\u{fe53}'), + ('\u{fe67}', '\u{fe67}'), + ('\u{fe6c}', '\u{fe6f}'), + ('\u{fe75}', '\u{fe75}'), + ('\u{fefd}', '\u{fefe}'), + ('\u{ff00}', '\u{ff00}'), + ('\u{ffbf}', '\u{ffc1}'), + ('\u{ffc8}', '\u{ffc9}'), + ('\u{ffd0}', '\u{ffd1}'), + ('\u{ffd8}', '\u{ffd9}'), + ('\u{ffdd}', '\u{ffdf}'), + ('\u{ffe7}', '\u{ffe7}'), + ('\u{ffef}', '\u{fff8}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1000c}', '\u{1000c}'), + ('\u{10027}', '\u{10027}'), + ('\u{1003b}', '\u{1003b}'), + ('\u{1003e}', '\u{1003e}'), + ('\u{1004e}', '\u{1004f}'), + ('\u{1005e}', '\u{1007f}'), + ('\u{100fb}', '\u{100ff}'), + ('\u{10103}', '\u{10106}'), + ('\u{10134}', '\u{10136}'), + ('\u{1018f}', '\u{1018f}'), + ('\u{1019d}', '\u{1019f}'), + ('\u{101a1}', '\u{101cf}'), + ('\u{101fe}', '\u{1027f}'), + ('\u{1029d}', '\u{1029f}'), + ('\u{102d1}', '\u{102df}'), + ('\u{102fc}', '\u{102ff}'), + ('\u{10324}', '\u{1032c}'), + ('\u{1034b}', '\u{1034f}'), + ('\u{1037b}', '\u{1037f}'), + ('\u{1039e}', '\u{1039e}'), + ('\u{103c4}', '\u{103c7}'), + ('\u{103d6}', '\u{103ff}'), + ('\u{1049e}', '\u{1049f}'), + ('\u{104aa}', '\u{104af}'), + ('\u{104d4}', '\u{104d7}'), + ('\u{104fc}', '\u{104ff}'), + ('\u{10528}', '\u{1052f}'), + ('\u{10564}', '\u{1056e}'), + ('\u{1057b}', '\u{1057b}'), + ('\u{1058b}', '\u{1058b}'), + ('\u{10593}', '\u{10593}'), + ('\u{10596}', '\u{10596}'), + ('\u{105a2}', '\u{105a2}'), + ('\u{105b2}', '\u{105b2}'), + ('\u{105ba}', '\u{105ba}'), + ('\u{105bd}', '\u{105ff}'), + ('\u{10737}', '\u{1073f}'), + ('\u{10756}', '\u{1075f}'), + ('\u{10768}', '\u{1077f}'), + ('\u{10786}', '\u{10786}'), + ('\u{107b1}', '\u{107b1}'), + ('\u{107bb}', '\u{107ff}'), + ('\u{10806}', '\u{10807}'), + ('\u{10809}', '\u{10809}'), + ('\u{10836}', '\u{10836}'), + ('\u{10839}', '\u{1083b}'), + ('\u{1083d}', '\u{1083e}'), + ('\u{10856}', '\u{10856}'), + ('\u{1089f}', '\u{108a6}'), + ('\u{108b0}', '\u{108df}'), + ('\u{108f3}', '\u{108f3}'), + ('\u{108f6}', '\u{108fa}'), + ('\u{1091c}', '\u{1091e}'), + ('\u{1093a}', '\u{1093e}'), + ('\u{10940}', '\u{1097f}'), + ('\u{109b8}', '\u{109bb}'), + ('\u{109d0}', '\u{109d1}'), + ('\u{10a04}', '\u{10a04}'), + ('\u{10a07}', '\u{10a0b}'), + ('\u{10a14}', '\u{10a14}'), + ('\u{10a18}', '\u{10a18}'), + ('\u{10a36}', '\u{10a37}'), + ('\u{10a3b}', '\u{10a3e}'), + ('\u{10a49}', '\u{10a4f}'), + ('\u{10a59}', '\u{10a5f}'), + ('\u{10aa0}', '\u{10abf}'), + ('\u{10ae7}', '\u{10aea}'), + ('\u{10af7}', '\u{10aff}'), + ('\u{10b36}', '\u{10b38}'), + ('\u{10b56}', '\u{10b57}'), + ('\u{10b73}', '\u{10b77}'), + ('\u{10b92}', '\u{10b98}'), + ('\u{10b9d}', '\u{10ba8}'), + ('\u{10bb0}', '\u{10bff}'), + ('\u{10c49}', '\u{10c7f}'), + ('\u{10cb3}', '\u{10cbf}'), + ('\u{10cf3}', '\u{10cf9}'), + ('\u{10d28}', '\u{10d2f}'), + ('\u{10d3a}', '\u{10e5f}'), + ('\u{10e7f}', '\u{10e7f}'), + ('\u{10eaa}', '\u{10eaa}'), + ('\u{10eae}', '\u{10eaf}'), + ('\u{10eb2}', '\u{10efc}'), + ('\u{10f28}', '\u{10f2f}'), + ('\u{10f5a}', '\u{10f6f}'), + ('\u{10f8a}', '\u{10faf}'), + ('\u{10fcc}', '\u{10fdf}'), + ('\u{10ff7}', '\u{10fff}'), + ('\u{1104e}', '\u{11051}'), + ('\u{11076}', '\u{1107e}'), + ('\u{110c3}', '\u{110cc}'), + ('\u{110ce}', '\u{110cf}'), + ('\u{110e9}', '\u{110ef}'), + ('\u{110fa}', '\u{110ff}'), + ('\u{11135}', '\u{11135}'), + ('\u{11148}', '\u{1114f}'), + ('\u{11177}', '\u{1117f}'), + ('\u{111e0}', '\u{111e0}'), + ('\u{111f5}', '\u{111ff}'), + ('\u{11212}', '\u{11212}'), + ('\u{11242}', '\u{1127f}'), + ('\u{11287}', '\u{11287}'), + ('\u{11289}', '\u{11289}'), + ('\u{1128e}', '\u{1128e}'), + ('\u{1129e}', '\u{1129e}'), + ('\u{112aa}', '\u{112af}'), + ('\u{112eb}', '\u{112ef}'), + ('\u{112fa}', '\u{112ff}'), + ('\u{11304}', '\u{11304}'), + ('\u{1130d}', '\u{1130e}'), + ('\u{11311}', '\u{11312}'), + ('\u{11329}', '\u{11329}'), + ('\u{11331}', '\u{11331}'), + ('\u{11334}', '\u{11334}'), + ('\u{1133a}', '\u{1133a}'), + ('\u{11345}', '\u{11346}'), + ('\u{11349}', '\u{1134a}'), + ('\u{1134e}', '\u{1134f}'), + ('\u{11351}', '\u{11356}'), + ('\u{11358}', '\u{1135c}'), + ('\u{11364}', '\u{11365}'), + ('\u{1136d}', '\u{1136f}'), + ('\u{11375}', '\u{113ff}'), + ('\u{1145c}', '\u{1145c}'), + ('\u{11462}', '\u{1147f}'), + ('\u{114c8}', '\u{114cf}'), + ('\u{114da}', '\u{1157f}'), + ('\u{115b6}', '\u{115b7}'), + ('\u{115de}', '\u{115ff}'), + ('\u{11645}', '\u{1164f}'), + ('\u{1165a}', '\u{1165f}'), + ('\u{1166d}', '\u{1167f}'), + ('\u{116ba}', '\u{116bf}'), + ('\u{116ca}', '\u{116ff}'), + ('\u{1171b}', '\u{1171c}'), + ('\u{1172c}', '\u{1172f}'), + ('\u{11747}', '\u{117ff}'), + ('\u{1183c}', '\u{1189f}'), + ('\u{118f3}', '\u{118fe}'), + ('\u{11907}', '\u{11908}'), + ('\u{1190a}', '\u{1190b}'), + ('\u{11914}', '\u{11914}'), + ('\u{11917}', '\u{11917}'), + ('\u{11936}', '\u{11936}'), + ('\u{11939}', '\u{1193a}'), + ('\u{11947}', '\u{1194f}'), + ('\u{1195a}', '\u{1199f}'), + ('\u{119a8}', '\u{119a9}'), + ('\u{119d8}', '\u{119d9}'), + ('\u{119e5}', '\u{119ff}'), + ('\u{11a48}', '\u{11a4f}'), + ('\u{11aa3}', '\u{11aaf}'), + ('\u{11af9}', '\u{11aff}'), + ('\u{11b0a}', '\u{11bff}'), + ('\u{11c09}', '\u{11c09}'), + ('\u{11c37}', '\u{11c37}'), + ('\u{11c46}', '\u{11c4f}'), + ('\u{11c6d}', '\u{11c6f}'), + ('\u{11c90}', '\u{11c91}'), + ('\u{11ca8}', '\u{11ca8}'), + ('\u{11cb7}', '\u{11cff}'), + ('\u{11d07}', '\u{11d07}'), + ('\u{11d0a}', '\u{11d0a}'), + ('\u{11d37}', '\u{11d39}'), + ('\u{11d3b}', '\u{11d3b}'), + ('\u{11d3e}', '\u{11d3e}'), + ('\u{11d48}', '\u{11d4f}'), + ('\u{11d5a}', '\u{11d5f}'), + ('\u{11d66}', '\u{11d66}'), + ('\u{11d69}', '\u{11d69}'), + ('\u{11d8f}', '\u{11d8f}'), + ('\u{11d92}', '\u{11d92}'), + ('\u{11d99}', '\u{11d9f}'), + ('\u{11daa}', '\u{11edf}'), + ('\u{11ef9}', '\u{11eff}'), + ('\u{11f11}', '\u{11f11}'), + ('\u{11f3b}', '\u{11f3d}'), + ('\u{11f5a}', '\u{11faf}'), + ('\u{11fb1}', '\u{11fbf}'), + ('\u{11ff2}', '\u{11ffe}'), + ('\u{1239a}', '\u{123ff}'), + ('\u{1246f}', '\u{1246f}'), + ('\u{12475}', '\u{1247f}'), + ('\u{12544}', '\u{12f8f}'), + ('\u{12ff3}', '\u{12fff}'), + ('\u{13456}', '\u{143ff}'), + ('\u{14647}', '\u{167ff}'), + ('\u{16a39}', '\u{16a3f}'), + ('\u{16a5f}', '\u{16a5f}'), + ('\u{16a6a}', '\u{16a6d}'), + ('\u{16abf}', '\u{16abf}'), + ('\u{16aca}', '\u{16acf}'), + ('\u{16aee}', '\u{16aef}'), + ('\u{16af6}', '\u{16aff}'), + ('\u{16b46}', '\u{16b4f}'), + ('\u{16b5a}', '\u{16b5a}'), + ('\u{16b62}', '\u{16b62}'), + ('\u{16b78}', '\u{16b7c}'), + ('\u{16b90}', '\u{16e3f}'), + ('\u{16e9b}', '\u{16eff}'), + ('\u{16f4b}', '\u{16f4e}'), + ('\u{16f88}', '\u{16f8e}'), + ('\u{16fa0}', '\u{16fdf}'), + ('\u{16fe5}', '\u{16fef}'), + ('\u{16ff2}', '\u{16fff}'), + ('\u{187f8}', '\u{187ff}'), + ('\u{18cd6}', '\u{18cff}'), + ('\u{18d09}', '\u{1afef}'), + ('\u{1aff4}', '\u{1aff4}'), + ('\u{1affc}', '\u{1affc}'), + ('\u{1afff}', '\u{1afff}'), + ('\u{1b123}', '\u{1b131}'), + ('\u{1b133}', '\u{1b14f}'), + ('\u{1b153}', '\u{1b154}'), + ('\u{1b156}', '\u{1b163}'), + ('\u{1b168}', '\u{1b16f}'), + ('\u{1b2fc}', '\u{1bbff}'), + ('\u{1bc6b}', '\u{1bc6f}'), + ('\u{1bc7d}', '\u{1bc7f}'), + ('\u{1bc89}', '\u{1bc8f}'), + ('\u{1bc9a}', '\u{1bc9b}'), + ('\u{1bca4}', '\u{1ceff}'), + ('\u{1cf2e}', '\u{1cf2f}'), + ('\u{1cf47}', '\u{1cf4f}'), + ('\u{1cfc4}', '\u{1cfff}'), + ('\u{1d0f6}', '\u{1d0ff}'), + ('\u{1d127}', '\u{1d128}'), + ('\u{1d1eb}', '\u{1d1ff}'), + ('\u{1d246}', '\u{1d2bf}'), + ('\u{1d2d4}', '\u{1d2df}'), + ('\u{1d2f4}', '\u{1d2ff}'), + ('\u{1d357}', '\u{1d35f}'), + ('\u{1d379}', '\u{1d3ff}'), + ('\u{1d455}', '\u{1d455}'), + ('\u{1d49d}', '\u{1d49d}'), + ('\u{1d4a0}', '\u{1d4a1}'), + ('\u{1d4a3}', '\u{1d4a4}'), + ('\u{1d4a7}', '\u{1d4a8}'), + ('\u{1d4ad}', '\u{1d4ad}'), + ('\u{1d4ba}', '\u{1d4ba}'), + ('\u{1d4bc}', '\u{1d4bc}'), + ('\u{1d4c4}', '\u{1d4c4}'), + ('\u{1d506}', '\u{1d506}'), + ('\u{1d50b}', '\u{1d50c}'), + ('\u{1d515}', '\u{1d515}'), + ('\u{1d51d}', '\u{1d51d}'), + ('\u{1d53a}', '\u{1d53a}'), + ('\u{1d53f}', '\u{1d53f}'), + ('\u{1d545}', '\u{1d545}'), + ('\u{1d547}', '\u{1d549}'), + ('\u{1d551}', '\u{1d551}'), + ('\u{1d6a6}', '\u{1d6a7}'), + ('\u{1d7cc}', '\u{1d7cd}'), + ('\u{1da8c}', '\u{1da9a}'), + ('\u{1daa0}', '\u{1daa0}'), + ('\u{1dab0}', '\u{1deff}'), + ('\u{1df1f}', '\u{1df24}'), + ('\u{1df2b}', '\u{1dfff}'), + ('\u{1e007}', '\u{1e007}'), + ('\u{1e019}', '\u{1e01a}'), + ('\u{1e022}', '\u{1e022}'), + ('\u{1e025}', '\u{1e025}'), + ('\u{1e02b}', '\u{1e02f}'), + ('\u{1e06e}', '\u{1e08e}'), + ('\u{1e090}', '\u{1e0ff}'), + ('\u{1e12d}', '\u{1e12f}'), + ('\u{1e13e}', '\u{1e13f}'), + ('\u{1e14a}', '\u{1e14d}'), + ('\u{1e150}', '\u{1e28f}'), + ('\u{1e2af}', '\u{1e2bf}'), + ('\u{1e2fa}', '\u{1e2fe}'), + ('\u{1e300}', '\u{1e4cf}'), + ('\u{1e4fa}', '\u{1e7df}'), + ('\u{1e7e7}', '\u{1e7e7}'), + ('\u{1e7ec}', '\u{1e7ec}'), + ('\u{1e7ef}', '\u{1e7ef}'), + ('\u{1e7ff}', '\u{1e7ff}'), + ('\u{1e8c5}', '\u{1e8c6}'), + ('\u{1e8d7}', '\u{1e8ff}'), + ('\u{1e94c}', '\u{1e94f}'), + ('\u{1e95a}', '\u{1e95d}'), + ('\u{1e960}', '\u{1ec70}'), + ('\u{1ecb5}', '\u{1ed00}'), + ('\u{1ed3e}', '\u{1edff}'), + ('\u{1ee04}', '\u{1ee04}'), + ('\u{1ee20}', '\u{1ee20}'), + ('\u{1ee23}', '\u{1ee23}'), + ('\u{1ee25}', '\u{1ee26}'), + ('\u{1ee28}', '\u{1ee28}'), + ('\u{1ee33}', '\u{1ee33}'), + ('\u{1ee38}', '\u{1ee38}'), + ('\u{1ee3a}', '\u{1ee3a}'), + ('\u{1ee3c}', '\u{1ee41}'), + ('\u{1ee43}', '\u{1ee46}'), + ('\u{1ee48}', '\u{1ee48}'), + ('\u{1ee4a}', '\u{1ee4a}'), + ('\u{1ee4c}', '\u{1ee4c}'), + ('\u{1ee50}', '\u{1ee50}'), + ('\u{1ee53}', '\u{1ee53}'), + ('\u{1ee55}', '\u{1ee56}'), + ('\u{1ee58}', '\u{1ee58}'), + ('\u{1ee5a}', '\u{1ee5a}'), + ('\u{1ee5c}', '\u{1ee5c}'), + ('\u{1ee5e}', '\u{1ee5e}'), + ('\u{1ee60}', '\u{1ee60}'), + ('\u{1ee63}', '\u{1ee63}'), + ('\u{1ee65}', '\u{1ee66}'), + ('\u{1ee6b}', '\u{1ee6b}'), + ('\u{1ee73}', '\u{1ee73}'), + ('\u{1ee78}', '\u{1ee78}'), + ('\u{1ee7d}', '\u{1ee7d}'), + ('\u{1ee7f}', '\u{1ee7f}'), + ('\u{1ee8a}', '\u{1ee8a}'), + ('\u{1ee9c}', '\u{1eea0}'), + ('\u{1eea4}', '\u{1eea4}'), + ('\u{1eeaa}', '\u{1eeaa}'), + ('\u{1eebc}', '\u{1eeef}'), + ('\u{1eef2}', '\u{1efff}'), + ('\u{1f02c}', '\u{1f02f}'), + ('\u{1f094}', '\u{1f09f}'), + ('\u{1f0af}', '\u{1f0b0}'), + ('\u{1f0c0}', '\u{1f0c0}'), + ('\u{1f0d0}', '\u{1f0d0}'), + ('\u{1f0f6}', '\u{1f0ff}'), + ('\u{1f1ae}', '\u{1f1e5}'), + ('\u{1f203}', '\u{1f20f}'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '\u{1f24f}'), + ('\u{1f252}', '\u{1f25f}'), + ('\u{1f266}', '\u{1f2ff}'), + ('\u{1f6d8}', '\u{1f6db}'), + ('\u{1f6ed}', '\u{1f6ef}'), + ('\u{1f6fd}', '\u{1f6ff}'), + ('\u{1f777}', '\u{1f77a}'), + ('\u{1f7da}', '\u{1f7df}'), + ('\u{1f7ec}', '\u{1f7ef}'), + ('\u{1f7f1}', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8af}'), + ('\u{1f8b2}', '\u{1f8ff}'), + ('\u{1fa54}', '\u{1fa5f}'), + ('\u{1fa6e}', '\u{1fa6f}'), + ('\u{1fa7d}', '\u{1fa7f}'), + ('\u{1fa89}', '\u{1fa8f}'), + ('\u{1fabe}', '\u{1fabe}'), + ('\u{1fac6}', '\u{1facd}'), + ('\u{1fadc}', '\u{1fadf}'), + ('\u{1fae9}', '\u{1faef}'), + ('\u{1faf9}', '\u{1faff}'), + ('\u{1fb93}', '\u{1fb93}'), + ('\u{1fbcb}', '\u{1fbef}'), + ('\u{1fbfa}', '\u{1ffff}'), + ('\u{2a6e0}', '\u{2a6ff}'), + ('\u{2b73a}', '\u{2b73f}'), + ('\u{2b81e}', '\u{2b81f}'), + ('\u{2cea2}', '\u{2ceaf}'), + ('\u{2ebe1}', '\u{2f7ff}'), + ('\u{2fa1e}', '\u{2ffff}'), + ('\u{3134b}', '\u{3134f}'), + ('\u{323b0}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const UPPERCASE_LETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'DŽ'), + ('LJ', 'LJ'), + ('NJ', 'NJ'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℋ', 'ℍ'), + ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℰ', 'ℳ'), + ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), + ('Ↄ', 'Ↄ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𝐀', '𝐙'), + ('𝐴', '𝑍'), + ('𝑨', '𝒁'), + ('𝒜', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒵'), + ('𝓐', '𝓩'), + ('𝔄', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔸', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕬', '𝖅'), + ('𝖠', '𝖹'), + ('𝗔', '𝗭'), + ('𝘈', '𝘡'), + ('𝘼', '𝙕'), + ('𝙰', '𝚉'), + ('𝚨', '𝛀'), + ('𝛢', '𝛺'), + ('𝜜', '𝜴'), + ('𝝖', '𝝮'), + ('𝞐', '𝞨'), + ('𝟊', '𝟊'), + ('𞤀', '𞤡'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs b/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs new file mode 100644 index 0000000000..294dfbdcc0 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs @@ -0,0 +1,1416 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate grapheme-cluster-break ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("CR", CR), + ("Control", CONTROL), + ("Extend", EXTEND), + ("L", L), + ("LF", LF), + ("LV", LV), + ("LVT", LVT), + ("Prepend", PREPEND), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("SpacingMark", SPACINGMARK), + ("T", T), + ("V", V), + ("ZWJ", ZWJ), +]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const CONTROL: &'static [(char, char)] = &[ + ('\0', '\t'), + ('\u{b}', '\u{c}'), + ('\u{e}', '\u{1f}'), + ('\u{7f}', '\u{9f}'), + ('\u{ad}', '\u{ad}'), + ('\u{61c}', '\u{61c}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200b}'), + ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff0}', '\u{fffb}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0000}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9be}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11340}', '\u{11340}'), + ('\u{11357}', '\u{11357}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115af}', '\u{115af}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{11930}', '\u{11930}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('🏻', '🏿'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const L: &'static [(char, char)] = &[('ᄀ', 'ᅟ'), ('ꥠ', 'ꥼ')]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const LV: &'static [(char, char)] = &[ + ('가', '가'), + ('개', '개'), + ('갸', '갸'), + ('걔', '걔'), + ('거', '거'), + ('게', '게'), + ('겨', '겨'), + ('계', '계'), + ('고', '고'), + ('과', '과'), + ('괘', '괘'), + ('괴', '괴'), + ('교', '교'), + ('구', '구'), + ('궈', '궈'), + ('궤', '궤'), + ('귀', '귀'), + ('규', '규'), + ('그', '그'), + ('긔', '긔'), + ('기', '기'), + ('까', '까'), + ('깨', '깨'), + ('꺄', '꺄'), + ('꺠', '꺠'), + ('꺼', '꺼'), + ('께', '께'), + ('껴', '껴'), + ('꼐', '꼐'), + ('꼬', '꼬'), + ('꽈', '꽈'), + ('꽤', '꽤'), + ('꾀', '꾀'), + ('꾜', '꾜'), + ('꾸', '꾸'), + ('꿔', '꿔'), + ('꿰', '꿰'), + ('뀌', '뀌'), + ('뀨', '뀨'), + ('끄', '끄'), + ('끠', '끠'), + ('끼', '끼'), + ('나', '나'), + ('내', '내'), + ('냐', '냐'), + ('냬', '냬'), + ('너', '너'), + ('네', '네'), + ('녀', '녀'), + ('녜', '녜'), + ('노', '노'), + ('놔', '놔'), + ('놰', '놰'), + ('뇌', '뇌'), + ('뇨', '뇨'), + ('누', '누'), + ('눠', '눠'), + ('눼', '눼'), + ('뉘', '뉘'), + ('뉴', '뉴'), + ('느', '느'), + ('늬', '늬'), + ('니', '니'), + ('다', '다'), + ('대', '대'), + ('댜', '댜'), + ('댸', '댸'), + ('더', '더'), + ('데', '데'), + ('뎌', '뎌'), + ('뎨', '뎨'), + ('도', '도'), + ('돠', '돠'), + ('돼', '돼'), + ('되', '되'), + ('됴', '됴'), + ('두', '두'), + ('둬', '둬'), + ('뒈', '뒈'), + ('뒤', '뒤'), + ('듀', '듀'), + ('드', '드'), + ('듸', '듸'), + ('디', '디'), + ('따', '따'), + ('때', '때'), + ('땨', '땨'), + ('떄', '떄'), + ('떠', '떠'), + ('떼', '떼'), + ('뗘', '뗘'), + ('뗴', '뗴'), + ('또', '또'), + ('똬', '똬'), + ('뙈', '뙈'), + ('뙤', '뙤'), + ('뚀', '뚀'), + ('뚜', '뚜'), + ('뚸', '뚸'), + ('뛔', '뛔'), + ('뛰', '뛰'), + ('뜌', '뜌'), + ('뜨', '뜨'), + ('띄', '띄'), + ('띠', '띠'), + ('라', '라'), + ('래', '래'), + ('랴', '랴'), + ('럐', '럐'), + ('러', '러'), + ('레', '레'), + ('려', '려'), + ('례', '례'), + ('로', '로'), + ('롸', '롸'), + ('뢔', '뢔'), + ('뢰', '뢰'), + ('료', '료'), + ('루', '루'), + ('뤄', '뤄'), + ('뤠', '뤠'), + ('뤼', '뤼'), + ('류', '류'), + ('르', '르'), + ('릐', '릐'), + ('리', '리'), + ('마', '마'), + ('매', '매'), + ('먀', '먀'), + ('먜', '먜'), + ('머', '머'), + ('메', '메'), + ('며', '며'), + ('몌', '몌'), + ('모', '모'), + ('뫄', '뫄'), + ('뫠', '뫠'), + ('뫼', '뫼'), + ('묘', '묘'), + ('무', '무'), + ('뭐', '뭐'), + ('뭬', '뭬'), + ('뮈', '뮈'), + ('뮤', '뮤'), + ('므', '므'), + ('믜', '믜'), + ('미', '미'), + ('바', '바'), + ('배', '배'), + ('뱌', '뱌'), + ('뱨', '뱨'), + ('버', '버'), + ('베', '베'), + ('벼', '벼'), + ('볘', '볘'), + ('보', '보'), + ('봐', '봐'), + ('봬', '봬'), + ('뵈', '뵈'), + ('뵤', '뵤'), + ('부', '부'), + ('붜', '붜'), + ('붸', '붸'), + ('뷔', '뷔'), + ('뷰', '뷰'), + ('브', '브'), + ('븨', '븨'), + ('비', '비'), + ('빠', '빠'), + ('빼', '빼'), + ('뺘', '뺘'), + ('뺴', '뺴'), + ('뻐', '뻐'), + ('뻬', '뻬'), + ('뼈', '뼈'), + ('뼤', '뼤'), + ('뽀', '뽀'), + ('뽜', '뽜'), + ('뽸', '뽸'), + ('뾔', '뾔'), + ('뾰', '뾰'), + ('뿌', '뿌'), + ('뿨', '뿨'), + ('쀄', '쀄'), + ('쀠', '쀠'), + ('쀼', '쀼'), + ('쁘', '쁘'), + ('쁴', '쁴'), + ('삐', '삐'), + ('사', '사'), + ('새', '새'), + ('샤', '샤'), + ('섀', '섀'), + ('서', '서'), + ('세', '세'), + ('셔', '셔'), + ('셰', '셰'), + ('소', '소'), + ('솨', '솨'), + ('쇄', '쇄'), + ('쇠', '쇠'), + ('쇼', '쇼'), + ('수', '수'), + ('숴', '숴'), + ('쉐', '쉐'), + ('쉬', '쉬'), + ('슈', '슈'), + ('스', '스'), + ('싀', '싀'), + ('시', '시'), + ('싸', '싸'), + ('쌔', '쌔'), + ('쌰', '쌰'), + ('썌', '썌'), + ('써', '써'), + ('쎄', '쎄'), + ('쎠', '쎠'), + ('쎼', '쎼'), + ('쏘', '쏘'), + ('쏴', '쏴'), + ('쐐', '쐐'), + ('쐬', '쐬'), + ('쑈', '쑈'), + ('쑤', '쑤'), + ('쒀', '쒀'), + ('쒜', '쒜'), + ('쒸', '쒸'), + ('쓔', '쓔'), + ('쓰', '쓰'), + ('씌', '씌'), + ('씨', '씨'), + ('아', '아'), + ('애', '애'), + ('야', '야'), + ('얘', '얘'), + ('어', '어'), + ('에', '에'), + ('여', '여'), + ('예', '예'), + ('오', '오'), + ('와', '와'), + ('왜', '왜'), + ('외', '외'), + ('요', '요'), + ('우', '우'), + ('워', '워'), + ('웨', '웨'), + ('위', '위'), + ('유', '유'), + ('으', '으'), + ('의', '의'), + ('이', '이'), + ('자', '자'), + ('재', '재'), + ('쟈', '쟈'), + ('쟤', '쟤'), + ('저', '저'), + ('제', '제'), + ('져', '져'), + ('졔', '졔'), + ('조', '조'), + ('좌', '좌'), + ('좨', '좨'), + ('죄', '죄'), + ('죠', '죠'), + ('주', '주'), + ('줘', '줘'), + ('줴', '줴'), + ('쥐', '쥐'), + ('쥬', '쥬'), + ('즈', '즈'), + ('즤', '즤'), + ('지', '지'), + ('짜', '짜'), + ('째', '째'), + ('쨔', '쨔'), + ('쨰', '쨰'), + ('쩌', '쩌'), + ('쩨', '쩨'), + ('쪄', '쪄'), + ('쪠', '쪠'), + ('쪼', '쪼'), + ('쫘', '쫘'), + ('쫴', '쫴'), + ('쬐', '쬐'), + ('쬬', '쬬'), + ('쭈', '쭈'), + ('쭤', '쭤'), + ('쮀', '쮀'), + ('쮜', '쮜'), + ('쮸', '쮸'), + ('쯔', '쯔'), + ('쯰', '쯰'), + ('찌', '찌'), + ('차', '차'), + ('채', '채'), + ('챠', '챠'), + ('챼', '챼'), + ('처', '처'), + ('체', '체'), + ('쳐', '쳐'), + ('쳬', '쳬'), + ('초', '초'), + ('촤', '촤'), + ('쵀', '쵀'), + ('최', '최'), + ('쵸', '쵸'), + ('추', '추'), + ('춰', '춰'), + ('췌', '췌'), + ('취', '취'), + ('츄', '츄'), + ('츠', '츠'), + ('츼', '츼'), + ('치', '치'), + ('카', '카'), + ('캐', '캐'), + ('캬', '캬'), + ('컈', '컈'), + ('커', '커'), + ('케', '케'), + ('켜', '켜'), + ('켸', '켸'), + ('코', '코'), + ('콰', '콰'), + ('쾌', '쾌'), + ('쾨', '쾨'), + ('쿄', '쿄'), + ('쿠', '쿠'), + ('쿼', '쿼'), + ('퀘', '퀘'), + ('퀴', '퀴'), + ('큐', '큐'), + ('크', '크'), + ('킈', '킈'), + ('키', '키'), + ('타', '타'), + ('태', '태'), + ('탸', '탸'), + ('턔', '턔'), + ('터', '터'), + ('테', '테'), + ('텨', '텨'), + ('톄', '톄'), + ('토', '토'), + ('톼', '톼'), + ('퇘', '퇘'), + ('퇴', '퇴'), + ('툐', '툐'), + ('투', '투'), + ('퉈', '퉈'), + ('퉤', '퉤'), + ('튀', '튀'), + ('튜', '튜'), + ('트', '트'), + ('틔', '틔'), + ('티', '티'), + ('파', '파'), + ('패', '패'), + ('퍄', '퍄'), + ('퍠', '퍠'), + ('퍼', '퍼'), + ('페', '페'), + ('펴', '펴'), + ('폐', '폐'), + ('포', '포'), + ('퐈', '퐈'), + ('퐤', '퐤'), + ('푀', '푀'), + ('표', '표'), + ('푸', '푸'), + ('풔', '풔'), + ('풰', '풰'), + ('퓌', '퓌'), + ('퓨', '퓨'), + ('프', '프'), + ('픠', '픠'), + ('피', '피'), + ('하', '하'), + ('해', '해'), + ('햐', '햐'), + ('햬', '햬'), + ('허', '허'), + ('헤', '헤'), + ('혀', '혀'), + ('혜', '혜'), + ('호', '호'), + ('화', '화'), + ('홰', '홰'), + ('회', '회'), + ('효', '효'), + ('후', '후'), + ('훠', '훠'), + ('훼', '훼'), + ('휘', '휘'), + ('휴', '휴'), + ('흐', '흐'), + ('희', '희'), + ('히', '히'), +]; + +pub const LVT: &'static [(char, char)] = &[ + ('각', '갛'), + ('객', '갷'), + ('갹', '걓'), + ('걕', '걯'), + ('걱', '겋'), + ('겍', '겧'), + ('격', '곃'), + ('곅', '곟'), + ('곡', '곻'), + ('곽', '괗'), + ('괙', '괳'), + ('괵', '굏'), + ('굑', '굫'), + ('국', '궇'), + ('궉', '궣'), + ('궥', '궿'), + ('귁', '귛'), + ('귝', '귷'), + ('극', '긓'), + ('긕', '긯'), + ('긱', '깋'), + ('깍', '깧'), + ('깩', '꺃'), + ('꺅', '꺟'), + ('꺡', '꺻'), + ('꺽', '껗'), + ('껙', '껳'), + ('껵', '꼏'), + ('꼑', '꼫'), + ('꼭', '꽇'), + ('꽉', '꽣'), + ('꽥', '꽿'), + ('꾁', '꾛'), + ('꾝', '꾷'), + ('꾹', '꿓'), + ('꿕', '꿯'), + ('꿱', '뀋'), + ('뀍', '뀧'), + ('뀩', '끃'), + ('끅', '끟'), + ('끡', '끻'), + ('끽', '낗'), + ('낙', '낳'), + ('낵', '냏'), + ('냑', '냫'), + ('냭', '넇'), + ('넉', '넣'), + ('넥', '넿'), + ('녁', '녛'), + ('녝', '녷'), + ('녹', '놓'), + ('놕', '놯'), + ('놱', '뇋'), + ('뇍', '뇧'), + ('뇩', '눃'), + ('눅', '눟'), + ('눡', '눻'), + ('눽', '뉗'), + ('뉙', '뉳'), + ('뉵', '늏'), + ('늑', '늫'), + ('늭', '닇'), + ('닉', '닣'), + ('닥', '닿'), + ('댁', '댛'), + ('댝', '댷'), + ('댹', '덓'), + ('덕', '덯'), + ('덱', '뎋'), + ('뎍', '뎧'), + ('뎩', '돃'), + ('독', '돟'), + ('돡', '돻'), + ('돽', '됗'), + ('됙', '됳'), + ('됵', '둏'), + ('둑', '둫'), + ('둭', '뒇'), + ('뒉', '뒣'), + ('뒥', '뒿'), + ('듁', '듛'), + ('득', '듷'), + ('듹', '딓'), + ('딕', '딯'), + ('딱', '땋'), + ('땍', '땧'), + ('땩', '떃'), + ('떅', '떟'), + ('떡', '떻'), + ('떽', '뗗'), + ('뗙', '뗳'), + ('뗵', '똏'), + ('똑', '똫'), + ('똭', '뙇'), + ('뙉', '뙣'), + ('뙥', '뙿'), + ('뚁', '뚛'), + ('뚝', '뚷'), + ('뚹', '뛓'), + ('뛕', '뛯'), + ('뛱', '뜋'), + ('뜍', '뜧'), + ('뜩', '띃'), + ('띅', '띟'), + ('띡', '띻'), + ('락', '랗'), + ('랙', '랳'), + ('략', '럏'), + ('럑', '럫'), + ('럭', '렇'), + ('렉', '렣'), + ('력', '렿'), + ('롁', '롛'), + ('록', '롷'), + ('롹', '뢓'), + ('뢕', '뢯'), + ('뢱', '룋'), + ('룍', '룧'), + ('룩', '뤃'), + ('뤅', '뤟'), + ('뤡', '뤻'), + ('뤽', '륗'), + ('륙', '륳'), + ('륵', '릏'), + ('릑', '릫'), + ('릭', '맇'), + ('막', '맣'), + ('맥', '맿'), + ('먁', '먛'), + ('먝', '먷'), + ('먹', '멓'), + ('멕', '멯'), + ('멱', '몋'), + ('몍', '몧'), + ('목', '뫃'), + ('뫅', '뫟'), + ('뫡', '뫻'), + ('뫽', '묗'), + ('묙', '묳'), + ('묵', '뭏'), + ('뭑', '뭫'), + ('뭭', '뮇'), + ('뮉', '뮣'), + ('뮥', '뮿'), + ('믁', '믛'), + ('믝', '믷'), + ('믹', '밓'), + ('박', '밯'), + ('백', '뱋'), + ('뱍', '뱧'), + ('뱩', '벃'), + ('벅', '벟'), + ('벡', '벻'), + ('벽', '볗'), + ('볙', '볳'), + ('복', '봏'), + ('봑', '봫'), + ('봭', '뵇'), + ('뵉', '뵣'), + ('뵥', '뵿'), + ('북', '붛'), + ('붝', '붷'), + ('붹', '뷓'), + ('뷕', '뷯'), + ('뷱', '븋'), + ('븍', '븧'), + ('븩', '빃'), + ('빅', '빟'), + ('빡', '빻'), + ('빽', '뺗'), + ('뺙', '뺳'), + ('뺵', '뻏'), + ('뻑', '뻫'), + ('뻭', '뼇'), + ('뼉', '뼣'), + ('뼥', '뼿'), + ('뽁', '뽛'), + ('뽝', '뽷'), + ('뽹', '뾓'), + ('뾕', '뾯'), + ('뾱', '뿋'), + ('뿍', '뿧'), + ('뿩', '쀃'), + ('쀅', '쀟'), + ('쀡', '쀻'), + ('쀽', '쁗'), + ('쁙', '쁳'), + ('쁵', '삏'), + ('삑', '삫'), + ('삭', '샇'), + ('색', '샣'), + ('샥', '샿'), + ('섁', '섛'), + ('석', '섷'), + ('섹', '셓'), + ('셕', '셯'), + ('셱', '솋'), + ('속', '솧'), + ('솩', '쇃'), + ('쇅', '쇟'), + ('쇡', '쇻'), + ('쇽', '숗'), + ('숙', '숳'), + ('숵', '쉏'), + ('쉑', '쉫'), + ('쉭', '슇'), + ('슉', '슣'), + ('슥', '슿'), + ('싁', '싛'), + ('식', '싷'), + ('싹', '쌓'), + ('쌕', '쌯'), + ('쌱', '썋'), + ('썍', '썧'), + ('썩', '쎃'), + ('쎅', '쎟'), + ('쎡', '쎻'), + ('쎽', '쏗'), + ('쏙', '쏳'), + ('쏵', '쐏'), + ('쐑', '쐫'), + ('쐭', '쑇'), + ('쑉', '쑣'), + ('쑥', '쑿'), + ('쒁', '쒛'), + ('쒝', '쒷'), + ('쒹', '쓓'), + ('쓕', '쓯'), + ('쓱', '씋'), + ('씍', '씧'), + ('씩', '앃'), + ('악', '앟'), + ('액', '앻'), + ('약', '얗'), + ('얙', '얳'), + ('억', '엏'), + ('엑', '엫'), + ('역', '옇'), + ('옉', '옣'), + ('옥', '옿'), + ('왁', '왛'), + ('왝', '왷'), + ('왹', '욓'), + ('욕', '욯'), + ('욱', '웋'), + ('웍', '웧'), + ('웩', '윃'), + ('윅', '윟'), + ('육', '윻'), + ('윽', '읗'), + ('읙', '읳'), + ('익', '잏'), + ('작', '잫'), + ('잭', '쟇'), + ('쟉', '쟣'), + ('쟥', '쟿'), + ('적', '젛'), + ('젝', '젷'), + ('젹', '졓'), + ('졕', '졯'), + ('족', '좋'), + ('좍', '좧'), + ('좩', '죃'), + ('죅', '죟'), + ('죡', '죻'), + ('죽', '줗'), + ('줙', '줳'), + ('줵', '쥏'), + ('쥑', '쥫'), + ('쥭', '즇'), + ('즉', '즣'), + ('즥', '즿'), + ('직', '짛'), + ('짝', '짷'), + ('짹', '쨓'), + ('쨕', '쨯'), + ('쨱', '쩋'), + ('쩍', '쩧'), + ('쩩', '쪃'), + ('쪅', '쪟'), + ('쪡', '쪻'), + ('쪽', '쫗'), + ('쫙', '쫳'), + ('쫵', '쬏'), + ('쬑', '쬫'), + ('쬭', '쭇'), + ('쭉', '쭣'), + ('쭥', '쭿'), + ('쮁', '쮛'), + ('쮝', '쮷'), + ('쮹', '쯓'), + ('쯕', '쯯'), + ('쯱', '찋'), + ('찍', '찧'), + ('착', '챃'), + ('책', '챟'), + ('챡', '챻'), + ('챽', '첗'), + ('척', '첳'), + ('첵', '쳏'), + ('쳑', '쳫'), + ('쳭', '촇'), + ('촉', '촣'), + ('촥', '촿'), + ('쵁', '쵛'), + ('쵝', '쵷'), + ('쵹', '춓'), + ('축', '춯'), + ('춱', '췋'), + ('췍', '췧'), + ('췩', '츃'), + ('츅', '츟'), + ('측', '츻'), + ('츽', '칗'), + ('칙', '칳'), + ('칵', '캏'), + ('캑', '캫'), + ('캭', '컇'), + ('컉', '컣'), + ('컥', '컿'), + ('켁', '켛'), + ('켝', '켷'), + ('켹', '콓'), + ('콕', '콯'), + ('콱', '쾋'), + ('쾍', '쾧'), + ('쾩', '쿃'), + ('쿅', '쿟'), + ('쿡', '쿻'), + ('쿽', '퀗'), + ('퀙', '퀳'), + ('퀵', '큏'), + ('큑', '큫'), + ('큭', '킇'), + ('킉', '킣'), + ('킥', '킿'), + ('탁', '탛'), + ('택', '탷'), + ('탹', '턓'), + ('턕', '턯'), + ('턱', '텋'), + ('텍', '텧'), + ('텩', '톃'), + ('톅', '톟'), + ('톡', '톻'), + ('톽', '퇗'), + ('퇙', '퇳'), + ('퇵', '툏'), + ('툑', '툫'), + ('툭', '퉇'), + ('퉉', '퉣'), + ('퉥', '퉿'), + ('튁', '튛'), + ('튝', '튷'), + ('특', '틓'), + ('틕', '틯'), + ('틱', '팋'), + ('팍', '팧'), + ('팩', '퍃'), + ('퍅', '퍟'), + ('퍡', '퍻'), + ('퍽', '펗'), + ('펙', '펳'), + ('펵', '폏'), + ('폑', '폫'), + ('폭', '퐇'), + ('퐉', '퐣'), + ('퐥', '퐿'), + ('푁', '푛'), + ('푝', '푷'), + ('푹', '풓'), + ('풕', '풯'), + ('풱', '퓋'), + ('퓍', '퓧'), + ('퓩', '픃'), + ('픅', '픟'), + ('픡', '픻'), + ('픽', '핗'), + ('학', '핳'), + ('핵', '햏'), + ('햑', '햫'), + ('햭', '헇'), + ('헉', '헣'), + ('헥', '헿'), + ('혁', '혛'), + ('혝', '혷'), + ('혹', '홓'), + ('확', '홯'), + ('홱', '횋'), + ('획', '횧'), + ('횩', '훃'), + ('훅', '훟'), + ('훡', '훻'), + ('훽', '휗'), + ('휙', '휳'), + ('휵', '흏'), + ('흑', '흫'), + ('흭', '힇'), + ('힉', '힣'), +]; + +pub const PREPEND: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('ൎ', 'ൎ'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('𑇂', '𑇃'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑨺', '𑨺'), + ('𑪄', '𑪉'), + ('𑵆', '𑵆'), + ('𑼂', '𑼂'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; + +pub const SPACINGMARK: &'static [(char, char)] = &[ + ('ः', 'ः'), + ('ऻ', 'ऻ'), + ('ा', 'ी'), + ('ॉ', 'ौ'), + ('ॎ', 'ॏ'), + ('ং', 'ঃ'), + ('ি', 'ী'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('ਃ', 'ਃ'), + ('ਾ', 'ੀ'), + ('ઃ', 'ઃ'), + ('ા', 'ી'), + ('ૉ', 'ૉ'), + ('ો', 'ૌ'), + ('ଂ', 'ଃ'), + ('ୀ', 'ୀ'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('ி', 'ி'), + ('ு', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('ఁ', 'ః'), + ('ు', 'ౄ'), + ('ಂ', 'ಃ'), + ('ಾ', 'ಾ'), + ('ೀ', 'ು'), + ('ೃ', 'ೄ'), + ('ೇ', 'ೈ'), + ('ೊ', 'ೋ'), + ('ೳ', 'ೳ'), + ('ം', 'ഃ'), + ('ി', 'ീ'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('ං', 'ඃ'), + ('ැ', 'ෑ'), + ('ෘ', 'ෞ'), + ('ෲ', 'ෳ'), + ('ำ', 'ำ'), + ('ຳ', 'ຳ'), + ('༾', '༿'), + ('ཿ', 'ཿ'), + ('ေ', 'ေ'), + ('ျ', 'ြ'), + ('ၖ', 'ၗ'), + ('ႄ', 'ႄ'), + ('᜕', '᜕'), + ('᜴', '᜴'), + ('ា', 'ា'), + ('ើ', 'ៅ'), + ('ះ', 'ៈ'), + ('ᤣ', 'ᤦ'), + ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), + ('ᤳ', 'ᤸ'), + ('ᨙ', 'ᨚ'), + ('ᩕ', 'ᩕ'), + ('ᩗ', 'ᩗ'), + ('ᩭ', 'ᩲ'), + ('ᬄ', 'ᬄ'), + ('ᬻ', 'ᬻ'), + ('ᬽ', 'ᭁ'), + ('ᭃ', '᭄'), + ('ᮂ', 'ᮂ'), + ('ᮡ', 'ᮡ'), + ('ᮦ', 'ᮧ'), + ('᮪', '᮪'), + ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), + ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), + ('ᰤ', 'ᰫ'), + ('ᰴ', 'ᰵ'), + ('᳡', '᳡'), + ('᳷', '᳷'), + ('ꠣ', 'ꠤ'), + ('ꠧ', 'ꠧ'), + ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), + ('ꥒ', '꥓'), + ('ꦃ', 'ꦃ'), + ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), + ('ꦾ', '꧀'), + ('ꨯ', 'ꨰ'), + ('ꨳ', 'ꨴ'), + ('ꩍ', 'ꩍ'), + ('ꫫ', 'ꫫ'), + ('ꫮ', 'ꫯ'), + ('ꫵ', 'ꫵ'), + ('ꯣ', 'ꯤ'), + ('ꯦ', 'ꯧ'), + ('ꯩ', 'ꯪ'), + ('꯬', '꯬'), + ('𑀀', '𑀀'), + ('𑀂', '𑀂'), + ('𑂂', '𑂂'), + ('𑂰', '𑂲'), + ('𑂷', '𑂸'), + ('𑄬', '𑄬'), + ('𑅅', '𑅆'), + ('𑆂', '𑆂'), + ('𑆳', '𑆵'), + ('𑆿', '𑇀'), + ('𑇎', '𑇎'), + ('𑈬', '𑈮'), + ('𑈲', '𑈳'), + ('𑈵', '𑈵'), + ('𑋠', '𑋢'), + ('𑌂', '𑌃'), + ('𑌿', '𑌿'), + ('𑍁', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍢', '𑍣'), + ('𑐵', '𑐷'), + ('𑑀', '𑑁'), + ('𑑅', '𑑅'), + ('𑒱', '𑒲'), + ('𑒹', '𑒹'), + ('𑒻', '𑒼'), + ('𑒾', '𑒾'), + ('𑓁', '𑓁'), + ('𑖰', '𑖱'), + ('𑖸', '𑖻'), + ('𑖾', '𑖾'), + ('𑘰', '𑘲'), + ('𑘻', '𑘼'), + ('𑘾', '𑘾'), + ('𑚬', '𑚬'), + ('𑚮', '𑚯'), + ('𑚶', '𑚶'), + ('𑜦', '𑜦'), + ('𑠬', '𑠮'), + ('𑠸', '𑠸'), + ('𑤱', '𑤵'), + ('𑤷', '𑤸'), + ('𑤽', '𑤽'), + ('𑥀', '𑥀'), + ('𑥂', '𑥂'), + ('𑧑', '𑧓'), + ('𑧜', '𑧟'), + ('𑧤', '𑧤'), + ('𑨹', '𑨹'), + ('𑩗', '𑩘'), + ('𑪗', '𑪗'), + ('𑰯', '𑰯'), + ('𑰾', '𑰾'), + ('𑲩', '𑲩'), + ('𑲱', '𑲱'), + ('𑲴', '𑲴'), + ('𑶊', '𑶎'), + ('𑶓', '𑶔'), + ('𑶖', '𑶖'), + ('𑻵', '𑻶'), + ('𑼃', '𑼃'), + ('𑼴', '𑼵'), + ('𑼾', '𑼿'), + ('𑽁', '𑽁'), + ('𖽑', '𖾇'), + ('𖿰', '𖿱'), + ('𝅦', '𝅦'), + ('𝅭', '𝅭'), +]; + +pub const T: &'static [(char, char)] = &[('ᆨ', 'ᇿ'), ('ퟋ', 'ퟻ')]; + +pub const V: &'static [(char, char)] = &[('ᅠ', 'ᆧ'), ('ힰ', 'ퟆ')]; + +pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/mod.rs b/vendor/regex-syntax/src/unicode_tables/mod.rs new file mode 100644 index 0000000000..20736c7ac8 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/mod.rs @@ -0,0 +1,57 @@ +#[cfg(feature = "unicode-age")] +pub mod age; + +#[cfg(feature = "unicode-case")] +pub mod case_folding_simple; + +#[cfg(feature = "unicode-gencat")] +pub mod general_category; + +#[cfg(feature = "unicode-segment")] +pub mod grapheme_cluster_break; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] +#[allow(dead_code)] +pub mod perl_decimal; + +#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] +#[allow(dead_code)] +pub mod perl_space; + +#[cfg(feature = "unicode-perl")] +pub mod perl_word; + +#[cfg(feature = "unicode-bool")] +pub mod property_bool; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] +pub mod property_names; + +#[cfg(any( + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", +))] +pub mod property_values; + +#[cfg(feature = "unicode-script")] +pub mod script; + +#[cfg(feature = "unicode-script")] +pub mod script_extension; + +#[cfg(feature = "unicode-segment")] +pub mod sentence_break; + +#[cfg(feature = "unicode-segment")] +pub mod word_break; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs b/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs new file mode 100644 index 0000000000..4f4c08a128 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs @@ -0,0 +1,77 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("Decimal_Number", DECIMAL_NUMBER)]; + +pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_space.rs b/vendor/regex-syntax/src/unicode_tables/perl_space.rs new file mode 100644 index 0000000000..1741695795 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_space.rs @@ -0,0 +1,23 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool ucd-15.0.0 --chars --include whitespace +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = + &[("White_Space", WHITE_SPACE)]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/perl_word.rs b/vendor/regex-syntax/src/unicode_tables/perl_word.rs new file mode 100644 index 0000000000..c1b66bd9ab --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/perl_word.rs @@ -0,0 +1,781 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate perl-word ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const PERL_WORD: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧙'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('\u{200c}', '\u{200d}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_bool.rs b/vendor/regex-syntax/src/unicode_tables/property_bool.rs new file mode 100644 index 0000000000..a3e84b519c --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_bool.rs @@ -0,0 +1,11367 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-bool ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ASCII_Hex_Digit", ASCII_HEX_DIGIT), + ("Alphabetic", ALPHABETIC), + ("Bidi_Control", BIDI_CONTROL), + ("Bidi_Mirrored", BIDI_MIRRORED), + ("Case_Ignorable", CASE_IGNORABLE), + ("Cased", CASED), + ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED), + ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED), + ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED), + ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED), + ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED), + ("Dash", DASH), + ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT), + ("Deprecated", DEPRECATED), + ("Diacritic", DIACRITIC), + ("Emoji", EMOJI), + ("Emoji_Component", EMOJI_COMPONENT), + ("Emoji_Modifier", EMOJI_MODIFIER), + ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE), + ("Emoji_Presentation", EMOJI_PRESENTATION), + ("Extended_Pictographic", EXTENDED_PICTOGRAPHIC), + ("Extender", EXTENDER), + ("Grapheme_Base", GRAPHEME_BASE), + ("Grapheme_Extend", GRAPHEME_EXTEND), + ("Grapheme_Link", GRAPHEME_LINK), + ("Hex_Digit", HEX_DIGIT), + ("Hyphen", HYPHEN), + ("IDS_Binary_Operator", IDS_BINARY_OPERATOR), + ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR), + ("ID_Continue", ID_CONTINUE), + ("ID_Start", ID_START), + ("Ideographic", IDEOGRAPHIC), + ("Join_Control", JOIN_CONTROL), + ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION), + ("Lowercase", LOWERCASE), + ("Math", MATH), + ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT), + ("Other_Alphabetic", OTHER_ALPHABETIC), + ("Other_Default_Ignorable_Code_Point", OTHER_DEFAULT_IGNORABLE_CODE_POINT), + ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND), + ("Other_ID_Continue", OTHER_ID_CONTINUE), + ("Other_ID_Start", OTHER_ID_START), + ("Other_Lowercase", OTHER_LOWERCASE), + ("Other_Math", OTHER_MATH), + ("Other_Uppercase", OTHER_UPPERCASE), + ("Pattern_Syntax", PATTERN_SYNTAX), + ("Pattern_White_Space", PATTERN_WHITE_SPACE), + ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK), + ("Quotation_Mark", QUOTATION_MARK), + ("Radical", RADICAL), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Sentence_Terminal", SENTENCE_TERMINAL), + ("Soft_Dotted", SOFT_DOTTED), + ("Terminal_Punctuation", TERMINAL_PUNCTUATION), + ("Unified_Ideograph", UNIFIED_IDEOGRAPH), + ("Uppercase", UPPERCASE), + ("Variation_Selector", VARIATION_SELECTOR), + ("White_Space", WHITE_SPACE), + ("XID_Continue", XID_CONTINUE), + ("XID_Start", XID_START), +]; + +pub const ASCII_HEX_DIGIT: &'static [(char, char)] = + &[('0', '9'), ('A', 'F'), ('a', 'f')]; + +pub const ALPHABETIC: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{5b0}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '\u{657}'), + ('\u{659}', '\u{65f}'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6e1}', '\u{6e8}'), + ('\u{6ed}', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{73f}'), + ('ݍ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', '\u{817}'), + ('ࠚ', '\u{82c}'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('\u{8d4}', '\u{8df}'), + ('\u{8e3}', '\u{8e9}'), + ('\u{8f0}', 'ऻ'), + ('ऽ', 'ौ'), + ('ॎ', 'ॐ'), + ('\u{955}', '\u{963}'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('ৎ', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4c}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('\u{a70}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', 'ૌ'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('ૹ', '\u{afc}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('\u{b56}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4c}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccc}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('ൎ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', 'ๆ'), + ('\u{e4d}', '\u{e4d}'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', '\u{eb9}'), + ('\u{ebb}', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ecd}', '\u{ecd}'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f83}'), + ('ྈ', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('က', '\u{1036}'), + ('း', 'း'), + ('ျ', 'ဿ'), + ('ၐ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '\u{1713}'), + ('ᜟ', '\u{1733}'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', 'ឳ'), + ('ា', 'ៈ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', 'ᤸ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('ᩡ', '\u{1a74}'), + ('ᪧ', 'ᪧ'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1acc}', '\u{1ace}'), + ('\u{1b00}', 'ᬳ'), + ('\u{1b35}', 'ᭃ'), + ('ᭅ', 'ᭌ'), + ('\u{1b80}', '\u{1ba9}'), + ('\u{1bac}', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᯧ', '\u{1bf1}'), + ('ᰀ', '\u{1c36}'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('\u{1de7}', '\u{1df4}'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('\u{a674}', '\u{a67b}'), + ('ꙿ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠅ'), + ('ꠇ', 'ꠧ'), + ('ꡀ', 'ꡳ'), + ('ꢀ', 'ꣃ'), + ('\u{a8c5}', '\u{a8c5}'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a8ff}'), + ('ꤊ', '\u{a92a}'), + ('ꤰ', 'ꥒ'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', 'ꦲ'), + ('ꦴ', 'ꦿ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', '\u{aabe}'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', 'ꫵ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11045}'), + ('𑁱', '𑁵'), + ('\u{11080}', '𑂸'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('\u{11100}', '\u{11132}'), + ('𑅄', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑆿'), + ('𑇁', '𑇄'), + ('𑇎', '\u{111cf}'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11234}'), + ('\u{11237}', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112e8}'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍌'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('𑐀', '𑑁'), + ('\u{11443}', '𑑅'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑓁'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '\u{115b5}'), + ('𑖸', '𑖾'), + ('𑗘', '\u{115dd}'), + ('𑘀', '𑘾'), + ('\u{11640}', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑚀', '\u{116b5}'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172a}'), + ('𑝀', '𑝆'), + ('𑠀', '𑠸'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193c}'), + ('𑤿', '𑥂'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧟'), + ('𑧡', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '𑨲'), + ('\u{11a35}', '\u{11a3e}'), + ('𑩐', '𑪗'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑰾'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d41}'), + ('\u{11d43}', '\u{11d43}'), + ('𑵆', '\u{11d47}'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶖'), + ('𑶘', '𑶘'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f40}'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9e}', '\u{1bc9e}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('\u{1e947}', '\u{1e947}'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const BIDI_CONTROL: &'static [(char, char)] = &[ + ('\u{61c}', '\u{61c}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2066}', '\u{2069}'), +]; + +pub const BIDI_MIRRORED: &'static [(char, char)] = &[ + ('(', ')'), + ('<', '<'), + ('>', '>'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('«', '«'), + ('»', '»'), + ('༺', '༽'), + ('᚛', '᚜'), + ('‹', '›'), + ('⁅', '⁆'), + ('⁽', '⁾'), + ('₍', '₎'), + ('⅀', '⅀'), + ('∁', '∄'), + ('∈', '∍'), + ('∑', '∑'), + ('∕', '∖'), + ('√', '∝'), + ('∟', '∢'), + ('∤', '∤'), + ('∦', '∦'), + ('∫', '∳'), + ('∹', '∹'), + ('∻', '≌'), + ('≒', '≕'), + ('≟', '≠'), + ('≢', '≢'), + ('≤', '≫'), + ('≮', '⊌'), + ('⊏', '⊒'), + ('⊘', '⊘'), + ('⊢', '⊣'), + ('⊦', '⊸'), + ('⊾', '⊿'), + ('⋉', '⋍'), + ('⋐', '⋑'), + ('⋖', '⋭'), + ('⋰', '⋿'), + ('⌈', '⌋'), + ('⌠', '⌡'), + ('〈', '〉'), + ('❨', '❵'), + ('⟀', '⟀'), + ('⟃', '⟆'), + ('⟈', '⟉'), + ('⟋', '⟍'), + ('⟓', '⟖'), + ('⟜', '⟞'), + ('⟢', '⟯'), + ('⦃', '⦘'), + ('⦛', '⦠'), + ('⦢', '⦯'), + ('⦸', '⦸'), + ('⧀', '⧅'), + ('⧉', '⧉'), + ('⧎', '⧒'), + ('⧔', '⧕'), + ('⧘', '⧜'), + ('⧡', '⧡'), + ('⧣', '⧥'), + ('⧨', '⧩'), + ('⧴', '⧹'), + ('⧼', '⧽'), + ('⨊', '⨜'), + ('⨞', '⨡'), + ('⨤', '⨤'), + ('⨦', '⨦'), + ('⨩', '⨩'), + ('⨫', '⨮'), + ('⨴', '⨵'), + ('⨼', '⨾'), + ('⩗', '⩘'), + ('⩤', '⩥'), + ('⩪', '⩭'), + ('⩯', '⩰'), + ('⩳', '⩴'), + ('⩹', '⪣'), + ('⪦', '⪭'), + ('⪯', '⫖'), + ('⫝̸', '⫝̸'), + ('⫞', '⫞'), + ('⫢', '⫦'), + ('⫬', '⫮'), + ('⫳', '⫳'), + ('⫷', '⫻'), + ('⫽', '⫽'), + ('⯾', '⯾'), + ('⸂', '⸅'), + ('⸉', '⸊'), + ('⸌', '⸍'), + ('⸜', '⸝'), + ('⸠', '⸩'), + ('⹕', '⹜'), + ('〈', '】'), + ('〔', '〛'), + ('﹙', '﹞'), + ('﹤', '﹥'), + ('(', ')'), + ('<', '<'), + ('>', '>'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('⦅', '⦆'), + ('「', '」'), + ('𝛛', '𝛛'), + ('𝜕', '𝜕'), + ('𝝏', '𝝏'), + ('𝞉', '𝞉'), + ('𝟃', '𝟃'), +]; + +pub const CASE_IGNORABLE: &'static [(char, char)] = &[ + ('\'', '\''), + ('.', '.'), + (':', ':'), + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('\u{ad}', '\u{ad}'), + ('¯', '¯'), + ('´', '´'), + ('·', '¸'), + ('ʰ', '\u{36f}'), + ('ʹ', '͵'), + ('ͺ', 'ͺ'), + ('΄', '΅'), + ('·', '·'), + ('\u{483}', '\u{489}'), + ('ՙ', 'ՙ'), + ('՟', '՟'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('״', '״'), + ('\u{600}', '\u{605}'), + ('\u{610}', '\u{61a}'), + ('\u{61c}', '\u{61c}'), + ('ـ', 'ـ'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dd}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{70f}', '\u{70f}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('࢈', '࢈'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{89f}'), + ('ࣉ', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('ॱ', 'ॱ'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3f}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b56}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('ๆ', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('ჼ', 'ჼ'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180f}'), + ('ᡃ', 'ᡃ'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b34}'), + ('\u{1b36}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('ᱸ', 'ᱽ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('ᴬ', 'ᵪ'), + ('ᵸ', 'ᵸ'), + ('ᶛ', '\u{1dff}'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('\u{200b}', '\u{200f}'), + ('‘', '’'), + ('․', '․'), + ('‧', '‧'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20f0}'), + ('ⱼ', 'ⱽ'), + ('\u{2cef}', '\u{2cf1}'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('ⸯ', 'ⸯ'), + ('々', '々'), + ('\u{302a}', '\u{302d}'), + ('〱', '〵'), + ('〻', '〻'), + ('\u{3099}', 'ゞ'), + ('ー', 'ヾ'), + ('ꀕ', 'ꀕ'), + ('ꓸ', 'ꓽ'), + ('ꘌ', 'ꘌ'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', 'ꙿ'), + ('ꚜ', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('꜀', '꜡'), + ('ꝰ', 'ꝰ'), + ('ꞈ', '꞊'), + ('ꟲ', 'ꟴ'), + ('ꟸ', 'ꟹ'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('ꧏ', 'ꧏ'), + ('\u{a9e5}', 'ꧦ'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('ꩰ', 'ꩰ'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫝ', 'ꫝ'), + ('\u{aaec}', '\u{aaed}'), + ('ꫳ', 'ꫴ'), + ('\u{aaf6}', '\u{aaf6}'), + ('꭛', 'ꭟ'), + ('ꭩ', '꭫'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('﮲', '﯂'), + ('\u{fe00}', '\u{fe0f}'), + ('︓', '︓'), + ('\u{fe20}', '\u{fe2f}'), + ('﹒', '﹒'), + ('﹕', '﹕'), + ('\u{feff}', '\u{feff}'), + (''', '''), + ('.', '.'), + (':', ':'), + ('^', '^'), + ('`', '`'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + (' ̄', ' ̄'), + ('\u{fff9}', '\u{fffb}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{11340}', '\u{11340}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13430}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('𖭀', '𖭃'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d173}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '𞄽'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('𞓫', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '𞥋'), + ('🏻', '🏿'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const CASED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ƺ'), + ('Ƽ', 'ƿ'), + ('DŽ', 'ʓ'), + ('ʕ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՠ', 'ֈ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ჿ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅿ'), + ('Ↄ', 'ↄ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚝ'), + ('Ꜣ', 'ꞇ'), + ('Ꞌ', 'ꞎ'), + ('Ꞑ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟶ'), + ('ꟸ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('a', 'z'), + ('𐐀', '𐑏'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𑢠', '𑣟'), + ('𖹀', '𖹿'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞤀', '𞥃'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('µ', 'µ'), + ('À', 'Ö'), + ('Ø', 'ß'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('ʼn', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('ſ', 'ſ'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'Dž'), + ('LJ', 'Lj'), + ('NJ', 'Nj'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('ς', 'ς'), + ('Ϗ', 'ϑ'), + ('ϕ', 'ϖ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϰ', 'ϱ'), + ('ϴ', 'ϵ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('և', 'և'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẚ', 'ẛ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('ᾀ', 'ᾯ'), + ('ᾲ', 'ᾴ'), + ('ᾷ', 'ᾼ'), + ('ῂ', 'ῄ'), + ('ῇ', 'ῌ'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῷ', 'ῼ'), + ('Ω', 'Ω'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('µ', 'µ'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ķ'), + ('Ĺ', 'ƌ'), + ('Ǝ', 'ƚ'), + ('Ɯ', 'Ʃ'), + ('Ƭ', 'ƹ'), + ('Ƽ', 'ƽ'), + ('ƿ', 'ƿ'), + ('DŽ', 'Ƞ'), + ('Ȣ', 'ȳ'), + ('Ⱥ', 'ɔ'), + ('ɖ', 'ɗ'), + ('ə', 'ə'), + ('ɛ', 'ɜ'), + ('ɠ', 'ɡ'), + ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), + ('ɯ', 'ɯ'), + ('ɱ', 'ɲ'), + ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), + ('ʂ', 'ʃ'), + ('ʇ', 'ʌ'), + ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ͳ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϑ'), + ('ϕ', 'ϵ'), + ('Ϸ', 'ϻ'), + ('Ͻ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ա', 'և'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), + ('ᶎ', 'ᶎ'), + ('Ḁ', 'ẛ'), + ('ẞ', 'ẞ'), + ('Ạ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('Ω', 'Ω'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ⅿ'), + ('Ↄ', 'ↄ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'Ɒ'), + ('Ⱳ', 'ⱳ'), + ('Ⱶ', 'ⱶ'), + ('Ȿ', 'ⳣ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('Ꙁ', 'ꙭ'), + ('Ꚁ', 'ꚛ'), + ('Ꜣ', 'ꜯ'), + ('Ꜳ', 'ꝯ'), + ('Ꝺ', 'ꞇ'), + ('Ꞌ', 'Ɥ'), + ('Ꞑ', 'ꞔ'), + ('Ꞗ', 'Ɪ'), + ('Ʞ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('Ꟗ', 'ꟙ'), + ('Ꟶ', 'ꟶ'), + ('ꭓ', 'ꭓ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('A', 'Z'), + ('a', 'z'), + ('𐐀', '𐑏'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𑢠', '𑣟'), + ('𖹀', '𖹿'), + ('𞤀', '𞥃'), +]; + +pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'Dž'), + ('LJ', 'Lj'), + ('NJ', 'Nj'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), + ('ᾨ', 'ᾯ'), + ('Ᾰ', 'ᾼ'), + ('Ὲ', 'ῌ'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'ῼ'), + ('Ω', 'Ω'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𞤀', '𞤡'), +]; + +pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[ + ('a', 'z'), + ('µ', 'µ'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ķ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƚ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƹ'), + ('ƽ', 'ƽ'), + ('ƿ', 'ƿ'), + ('DŽ', 'DŽ'), + ('dž', 'LJ'), + ('lj', 'NJ'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'DZ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȳ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ɔ'), + ('ɖ', 'ɗ'), + ('ə', 'ə'), + ('ɛ', 'ɜ'), + ('ɠ', 'ɡ'), + ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), + ('ɯ', 'ɯ'), + ('ɱ', 'ɲ'), + ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), + ('ʂ', 'ʃ'), + ('ʇ', 'ʌ'), + ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), + ('\u{345}', '\u{345}'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϻ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ա', 'և'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), + ('ᶎ', 'ᶎ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẛ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱳ', 'ⱳ'), + ('ⱶ', 'ⱶ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳣ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜯ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝯ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞔ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟶ', 'ꟶ'), + ('ꭓ', 'ꭓ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𞤢', '𞥃'), +]; + +pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[ + ('a', 'z'), + ('µ', 'µ'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ķ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƌ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƚ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƹ'), + ('ƽ', 'ƽ'), + ('ƿ', 'ƿ'), + ('Dž', 'dž'), + ('Lj', 'lj'), + ('Nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('Dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȳ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ɔ'), + ('ɖ', 'ɗ'), + ('ə', 'ə'), + ('ɛ', 'ɜ'), + ('ɠ', 'ɡ'), + ('ɣ', 'ɣ'), + ('ɥ', 'ɦ'), + ('ɨ', 'ɬ'), + ('ɯ', 'ɯ'), + ('ɱ', 'ɲ'), + ('ɵ', 'ɵ'), + ('ɽ', 'ɽ'), + ('ʀ', 'ʀ'), + ('ʂ', 'ʃ'), + ('ʇ', 'ʌ'), + ('ʒ', 'ʒ'), + ('ʝ', 'ʞ'), + ('\u{345}', '\u{345}'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϻ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ա', 'և'), + ('ა', 'ჺ'), + ('ჽ', 'ჿ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᵹ', 'ᵹ'), + ('ᵽ', 'ᵽ'), + ('ᶎ', 'ᶎ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẛ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ᾼ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῌ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ῼ', 'ῼ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱳ', 'ⱳ'), + ('ⱶ', 'ⱶ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳣ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚛ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜯ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝯ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞔ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟶ', 'ꟶ'), + ('ꭓ', 'ꭓ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𞤢', '𞥃'), +]; + +pub const DASH: &'static [(char, char)] = &[ + ('-', '-'), + ('֊', '֊'), + ('־', '־'), + ('᐀', '᐀'), + ('᠆', '᠆'), + ('‐', '―'), + ('⁓', '⁓'), + ('⁻', '⁻'), + ('₋', '₋'), + ('−', '−'), + ('⸗', '⸗'), + ('⸚', '⸚'), + ('⸺', '⸻'), + ('⹀', '⹀'), + ('⹝', '⹝'), + ('〜', '〜'), + ('〰', '〰'), + ('゠', '゠'), + ('︱', '︲'), + ('﹘', '﹘'), + ('﹣', '﹣'), + ('-', '-'), + ('𐺭', '𐺭'), +]; + +pub const DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{34f}', '\u{34f}'), + ('\u{61c}', '\u{61c}'), + ('ᅟ', 'ᅠ'), + ('\u{17b4}', '\u{17b5}'), + ('\u{180b}', '\u{180f}'), + ('\u{200b}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{206f}'), + ('ㅤ', 'ㅤ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{feff}', '\u{feff}'), + ('ᅠ', 'ᅠ'), + ('\u{fff0}', '\u{fff8}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0000}', '\u{e0fff}'), +]; + +pub const DEPRECATED: &'static [(char, char)] = &[ + ('ʼn', 'ʼn'), + ('ٳ', 'ٳ'), + ('\u{f77}', '\u{f77}'), + ('\u{f79}', '\u{f79}'), + ('ឣ', 'ឤ'), + ('\u{206a}', '\u{206f}'), + ('〈', '〉'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const DIACRITIC: &'static [(char, char)] = &[ + ('^', '^'), + ('`', '`'), + ('¨', '¨'), + ('¯', '¯'), + ('´', '´'), + ('·', '¸'), + ('ʰ', '\u{34e}'), + ('\u{350}', '\u{357}'), + ('\u{35d}', '\u{362}'), + ('ʹ', '͵'), + ('ͺ', 'ͺ'), + ('΄', '΅'), + ('\u{483}', '\u{487}'), + ('ՙ', 'ՙ'), + ('\u{591}', '\u{5a1}'), + ('\u{5a3}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c4}'), + ('\u{64b}', '\u{652}'), + ('\u{657}', '\u{658}'), + ('\u{6df}', '\u{6e0}'), + ('ۥ', 'ۦ'), + ('\u{6ea}', '\u{6ec}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', 'ߵ'), + ('\u{818}', '\u{819}'), + ('\u{898}', '\u{89f}'), + ('ࣉ', '\u{8d2}'), + ('\u{8e3}', '\u{8fe}'), + ('\u{93c}', '\u{93c}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{954}'), + ('ॱ', 'ॱ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a4d}', '\u{a4d}'), + ('\u{abc}', '\u{abc}'), + ('\u{acd}', '\u{acd}'), + ('\u{afd}', '\u{aff}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b55}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c4d}', '\u{c4d}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{ccd}', '\u{ccd}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{dca}', '\u{dca}'), + ('\u{e47}', '\u{e4c}'), + ('\u{e4e}', '\u{e4e}'), + ('\u{eba}', '\u{eba}'), + ('\u{ec8}', '\u{ecc}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f82}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{1037}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('ၣ', 'ၤ'), + ('ၩ', 'ၭ'), + ('ႇ', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', 'ႛ'), + ('\u{135d}', '\u{135f}'), + ('\u{1714}', '᜕'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a75}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1abe}'), + ('\u{1ac1}', '\u{1acb}'), + ('\u{1b34}', '\u{1b34}'), + ('᭄', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('᮪', '\u{1bab}'), + ('\u{1c36}', '\u{1c37}'), + ('ᱸ', 'ᱽ'), + ('\u{1cd0}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('ᴬ', 'ᵪ'), + ('\u{1dc4}', '\u{1dcf}'), + ('\u{1df5}', '\u{1dff}'), + ('᾽', '᾽'), + ('᾿', '῁'), + ('῍', '῏'), + ('῝', '῟'), + ('῭', '`'), + ('´', '῾'), + ('\u{2cef}', '\u{2cf1}'), + ('ⸯ', 'ⸯ'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '゜'), + ('ー', 'ー'), + ('\u{a66f}', '\u{a66f}'), + ('\u{a67c}', '\u{a67d}'), + ('ꙿ', 'ꙿ'), + ('ꚜ', 'ꚝ'), + ('\u{a6f0}', '\u{a6f1}'), + ('꜀', '꜡'), + ('ꞈ', '꞊'), + ('ꟸ', 'ꟹ'), + ('\u{a8c4}', '\u{a8c4}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a92b}', '꤮'), + ('꥓', '꥓'), + ('\u{a9b3}', '\u{a9b3}'), + ('꧀', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('ꩻ', 'ꩽ'), + ('\u{aabf}', 'ꫂ'), + ('\u{aaf6}', '\u{aaf6}'), + ('꭛', 'ꭟ'), + ('ꭩ', '꭫'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe20}', '\u{fe2f}'), + ('^', '^'), + ('`', '`'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + (' ̄', ' ̄'), + ('\u{102e0}', '\u{102e0}'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('\u{10ae5}', '\u{10ae6}'), + ('𐴢', '\u{10d27}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11046}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{11133}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('𑇀', '𑇀'), + ('\u{111ca}', '\u{111cc}'), + ('𑈵', '\u{11236}'), + ('\u{112e9}', '\u{112ea}'), + ('\u{1133c}', '\u{1133c}'), + ('𑍍', '𑍍'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11442}', '\u{11442}'), + ('\u{11446}', '\u{11446}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{1163f}', '\u{1163f}'), + ('𑚶', '\u{116b7}'), + ('\u{1172b}', '\u{1172b}'), + ('\u{11839}', '\u{1183a}'), + ('𑤽', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a34}', '\u{11a34}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a99}', '\u{11a99}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11d42}', '\u{11d42}'), + ('\u{11d44}', '\u{11d45}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f8f}', '𖾟'), + ('𖿰', '𖿱'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('𞀰', '𞁭'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e946}'), + ('\u{1e948}', '\u{1e94a}'), +]; + +pub const EMOJI: &'static [(char, char)] = &[ + ('#', '#'), + ('*', '*'), + ('0', '9'), + ('©', '©'), + ('®', '®'), + ('‼', '‼'), + ('⁉', '⁉'), + ('™', '™'), + ('ℹ', 'ℹ'), + ('↔', '↙'), + ('↩', '↪'), + ('⌚', '⌛'), + ('⌨', '⌨'), + ('⏏', '⏏'), + ('⏩', '⏳'), + ('⏸', '⏺'), + ('Ⓜ', 'Ⓜ'), + ('▪', '▫'), + ('▶', '▶'), + ('◀', '◀'), + ('◻', '◾'), + ('☀', '☄'), + ('☎', '☎'), + ('☑', '☑'), + ('☔', '☕'), + ('☘', '☘'), + ('☝', '☝'), + ('☠', '☠'), + ('☢', '☣'), + ('☦', '☦'), + ('☪', '☪'), + ('☮', '☯'), + ('☸', '☺'), + ('♀', '♀'), + ('♂', '♂'), + ('♈', '♓'), + ('♟', '♠'), + ('♣', '♣'), + ('♥', '♦'), + ('♨', '♨'), + ('♻', '♻'), + ('♾', '♿'), + ('⚒', '⚗'), + ('⚙', '⚙'), + ('⚛', '⚜'), + ('⚠', '⚡'), + ('⚧', '⚧'), + ('⚪', '⚫'), + ('⚰', '⚱'), + ('⚽', '⚾'), + ('⛄', '⛅'), + ('⛈', '⛈'), + ('⛎', '⛏'), + ('⛑', '⛑'), + ('⛓', '⛔'), + ('⛩', '⛪'), + ('⛰', '⛵'), + ('⛷', '⛺'), + ('⛽', '⛽'), + ('✂', '✂'), + ('✅', '✅'), + ('✈', '✍'), + ('✏', '✏'), + ('✒', '✒'), + ('✔', '✔'), + ('✖', '✖'), + ('✝', '✝'), + ('✡', '✡'), + ('✨', '✨'), + ('✳', '✴'), + ('❄', '❄'), + ('❇', '❇'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❗', '❗'), + ('❣', '❤'), + ('➕', '➗'), + ('➡', '➡'), + ('➰', '➰'), + ('➿', '➿'), + ('⤴', '⤵'), + ('⬅', '⬇'), + ('⬛', '⬜'), + ('⭐', '⭐'), + ('⭕', '⭕'), + ('〰', '〰'), + ('〽', '〽'), + ('㊗', '㊗'), + ('㊙', '㊙'), + ('🀄', '🀄'), + ('🃏', '🃏'), + ('🅰', '🅱'), + ('🅾', '🅿'), + ('🆎', '🆎'), + ('🆑', '🆚'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈚', '🈚'), + ('🈯', '🈯'), + ('🈲', '🈺'), + ('🉐', '🉑'), + ('🌀', '🌡'), + ('🌤', '🎓'), + ('🎖', '🎗'), + ('🎙', '🎛'), + ('🎞', '🏰'), + ('🏳', '🏵'), + ('🏷', '📽'), + ('📿', '🔽'), + ('🕉', '🕎'), + ('🕐', '🕧'), + ('🕯', '🕰'), + ('🕳', '🕺'), + ('🖇', '🖇'), + ('🖊', '🖍'), + ('🖐', '🖐'), + ('🖕', '🖖'), + ('🖤', '🖥'), + ('🖨', '🖨'), + ('🖱', '🖲'), + ('🖼', '🖼'), + ('🗂', '🗄'), + ('🗑', '🗓'), + ('🗜', '🗞'), + ('🗡', '🗡'), + ('🗣', '🗣'), + ('🗨', '🗨'), + ('🗯', '🗯'), + ('🗳', '🗳'), + ('🗺', '🙏'), + ('🚀', '🛅'), + ('🛋', '🛒'), + ('🛕', '🛗'), + ('🛜', '🛥'), + ('🛩', '🛩'), + ('🛫', '🛬'), + ('🛰', '🛰'), + ('🛳', '🛼'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🤌', '🤺'), + ('🤼', '🥅'), + ('🥇', '🧿'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), +]; + +pub const EMOJI_COMPONENT: &'static [(char, char)] = &[ + ('#', '#'), + ('*', '*'), + ('0', '9'), + ('\u{200d}', '\u{200d}'), + ('\u{20e3}', '\u{20e3}'), + ('\u{fe0f}', '\u{fe0f}'), + ('🇦', '🇿'), + ('🏻', '🏿'), + ('🦰', '🦳'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const EMOJI_MODIFIER: &'static [(char, char)] = &[('🏻', '🏿')]; + +pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[ + ('☝', '☝'), + ('⛹', '⛹'), + ('✊', '✍'), + ('🎅', '🎅'), + ('🏂', '🏄'), + ('🏇', '🏇'), + ('🏊', '🏌'), + ('👂', '👃'), + ('👆', '👐'), + ('👦', '👸'), + ('👼', '👼'), + ('💁', '💃'), + ('💅', '💇'), + ('💏', '💏'), + ('💑', '💑'), + ('💪', '💪'), + ('🕴', '🕵'), + ('🕺', '🕺'), + ('🖐', '🖐'), + ('🖕', '🖖'), + ('🙅', '🙇'), + ('🙋', '🙏'), + ('🚣', '🚣'), + ('🚴', '🚶'), + ('🛀', '🛀'), + ('🛌', '🛌'), + ('🤌', '🤌'), + ('🤏', '🤏'), + ('🤘', '🤟'), + ('🤦', '🤦'), + ('🤰', '🤹'), + ('🤼', '🤾'), + ('🥷', '🥷'), + ('🦵', '🦶'), + ('🦸', '🦹'), + ('🦻', '🦻'), + ('🧍', '🧏'), + ('🧑', '🧝'), + ('🫃', '🫅'), + ('🫰', '🫸'), +]; + +pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[ + ('⌚', '⌛'), + ('⏩', '⏬'), + ('⏰', '⏰'), + ('⏳', '⏳'), + ('◽', '◾'), + ('☔', '☕'), + ('♈', '♓'), + ('♿', '♿'), + ('⚓', '⚓'), + ('⚡', '⚡'), + ('⚪', '⚫'), + ('⚽', '⚾'), + ('⛄', '⛅'), + ('⛎', '⛎'), + ('⛔', '⛔'), + ('⛪', '⛪'), + ('⛲', '⛳'), + ('⛵', '⛵'), + ('⛺', '⛺'), + ('⛽', '⛽'), + ('✅', '✅'), + ('✊', '✋'), + ('✨', '✨'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❗', '❗'), + ('➕', '➗'), + ('➰', '➰'), + ('➿', '➿'), + ('⬛', '⬜'), + ('⭐', '⭐'), + ('⭕', '⭕'), + ('🀄', '🀄'), + ('🃏', '🃏'), + ('🆎', '🆎'), + ('🆑', '🆚'), + ('🇦', '🇿'), + ('🈁', '🈁'), + ('🈚', '🈚'), + ('🈯', '🈯'), + ('🈲', '🈶'), + ('🈸', '🈺'), + ('🉐', '🉑'), + ('🌀', '🌠'), + ('🌭', '🌵'), + ('🌷', '🍼'), + ('🍾', '🎓'), + ('🎠', '🏊'), + ('🏏', '🏓'), + ('🏠', '🏰'), + ('🏴', '🏴'), + ('🏸', '🐾'), + ('👀', '👀'), + ('👂', '📼'), + ('📿', '🔽'), + ('🕋', '🕎'), + ('🕐', '🕧'), + ('🕺', '🕺'), + ('🖕', '🖖'), + ('🖤', '🖤'), + ('🗻', '🙏'), + ('🚀', '🛅'), + ('🛌', '🛌'), + ('🛐', '🛒'), + ('🛕', '🛗'), + ('🛜', '🛟'), + ('🛫', '🛬'), + ('🛴', '🛼'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🤌', '🤺'), + ('🤼', '🥅'), + ('🥇', '🧿'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), +]; + +pub const EXTENDED_PICTOGRAPHIC: &'static [(char, char)] = &[ + ('©', '©'), + ('®', '®'), + ('‼', '‼'), + ('⁉', '⁉'), + ('™', '™'), + ('ℹ', 'ℹ'), + ('↔', '↙'), + ('↩', '↪'), + ('⌚', '⌛'), + ('⌨', '⌨'), + ('⎈', '⎈'), + ('⏏', '⏏'), + ('⏩', '⏳'), + ('⏸', '⏺'), + ('Ⓜ', 'Ⓜ'), + ('▪', '▫'), + ('▶', '▶'), + ('◀', '◀'), + ('◻', '◾'), + ('☀', '★'), + ('☇', '☒'), + ('☔', '⚅'), + ('⚐', '✅'), + ('✈', '✒'), + ('✔', '✔'), + ('✖', '✖'), + ('✝', '✝'), + ('✡', '✡'), + ('✨', '✨'), + ('✳', '✴'), + ('❄', '❄'), + ('❇', '❇'), + ('❌', '❌'), + ('❎', '❎'), + ('❓', '❕'), + ('❗', '❗'), + ('❣', '❧'), + ('➕', '➗'), + ('➡', '➡'), + ('➰', '➰'), + ('➿', '➿'), + ('⤴', '⤵'), + ('⬅', '⬇'), + ('⬛', '⬜'), + ('⭐', '⭐'), + ('⭕', '⭕'), + ('〰', '〰'), + ('〽', '〽'), + ('㊗', '㊗'), + ('㊙', '㊙'), + ('🀀', '\u{1f0ff}'), + ('🄍', '🄏'), + ('🄯', '🄯'), + ('🅬', '🅱'), + ('🅾', '🅿'), + ('🆎', '🆎'), + ('🆑', '🆚'), + ('🆭', '\u{1f1e5}'), + ('🈁', '\u{1f20f}'), + ('🈚', '🈚'), + ('🈯', '🈯'), + ('🈲', '🈺'), + ('\u{1f23c}', '\u{1f23f}'), + ('\u{1f249}', '🏺'), + ('🐀', '🔽'), + ('🕆', '🙏'), + ('🚀', '\u{1f6ff}'), + ('🝴', '🝿'), + ('🟕', '\u{1f7ff}'), + ('\u{1f80c}', '\u{1f80f}'), + ('\u{1f848}', '\u{1f84f}'), + ('\u{1f85a}', '\u{1f85f}'), + ('\u{1f888}', '\u{1f88f}'), + ('\u{1f8ae}', '\u{1f8ff}'), + ('🤌', '🤺'), + ('🤼', '🥅'), + ('🥇', '\u{1faff}'), + ('\u{1fc00}', '\u{1fffd}'), +]; + +pub const EXTENDER: &'static [(char, char)] = &[ + ('·', '·'), + ('ː', 'ˑ'), + ('ـ', 'ـ'), + ('ߺ', 'ߺ'), + ('\u{b55}', '\u{b55}'), + ('ๆ', 'ๆ'), + ('ໆ', 'ໆ'), + ('᠊', '᠊'), + ('ᡃ', 'ᡃ'), + ('ᪧ', 'ᪧ'), + ('\u{1c36}', '\u{1c36}'), + ('ᱻ', 'ᱻ'), + ('々', '々'), + ('〱', '〵'), + ('ゝ', 'ゞ'), + ('ー', 'ヾ'), + ('ꀕ', 'ꀕ'), + ('ꘌ', 'ꘌ'), + ('ꧏ', 'ꧏ'), + ('ꧦ', 'ꧦ'), + ('ꩰ', 'ꩰ'), + ('ꫝ', 'ꫝ'), + ('ꫳ', 'ꫴ'), + ('ー', 'ー'), + ('𐞁', '𐞂'), + ('𑍝', '𑍝'), + ('𑗆', '𑗈'), + ('\u{11a98}', '\u{11a98}'), + ('𖭂', '𖭃'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𞄼', '𞄽'), + ('\u{1e944}', '\u{1e946}'), +]; + +pub const GRAPHEME_BASE: &'static [(char, char)] = &[ + (' ', '~'), + ('\u{a0}', '¬'), + ('®', '˿'), + ('Ͱ', 'ͷ'), + ('ͺ', 'Ϳ'), + ('΄', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', '҂'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', '֊'), + ('֍', '֏'), + ('־', '־'), + ('׀', '׀'), + ('׃', '׃'), + ('׆', '׆'), + ('א', 'ת'), + ('ׯ', '״'), + ('؆', '؏'), + ('؛', '؛'), + ('؝', 'ي'), + ('٠', 'ٯ'), + ('ٱ', 'ە'), + ('۞', '۞'), + ('ۥ', 'ۦ'), + ('۩', '۩'), + ('ۮ', '܍'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('߀', 'ߪ'), + ('ߴ', 'ߺ'), + ('߾', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('࠰', '࠾'), + ('ࡀ', 'ࡘ'), + ('࡞', '࡞'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ः', 'ह'), + ('ऻ', 'ऻ'), + ('ऽ', 'ी'), + ('ॉ', 'ौ'), + ('ॎ', 'ॐ'), + ('क़', 'ॡ'), + ('।', 'ঀ'), + ('ং', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ি', 'ী'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('০', '৽'), + ('ਃ', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਾ', 'ੀ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '੯'), + ('ੲ', 'ੴ'), + ('੶', '੶'), + ('ઃ', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ી'), + ('ૉ', 'ૉ'), + ('ો', 'ૌ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('૦', '૱'), + ('ૹ', 'ૹ'), + ('ଂ', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ୀ', 'ୀ'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('୦', '୷'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ி', 'ி'), + ('ு', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('ௐ', 'ௐ'), + ('௦', '௺'), + ('ఁ', 'ః'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ు', 'ౄ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('౦', '౯'), + ('౷', 'ಀ'), + ('ಂ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಾ'), + ('ೀ', 'ು'), + ('ೃ', 'ೄ'), + ('ೇ', 'ೈ'), + ('ೊ', 'ೋ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('ം', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ി', 'ീ'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('ൎ', '൏'), + ('ൔ', 'ൖ'), + ('൘', 'ൡ'), + ('൦', 'ൿ'), + ('ං', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ැ', 'ෑ'), + ('ෘ', 'ෞ'), + ('෦', '෯'), + ('ෲ', '෴'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('฿', 'ๆ'), + ('๏', '๛'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', '༗'), + ('༚', '༴'), + ('༶', '༶'), + ('༸', '༸'), + ('༺', 'ཇ'), + ('ཉ', 'ཬ'), + ('ཿ', 'ཿ'), + ('྅', '྅'), + ('ྈ', 'ྌ'), + ('྾', '࿅'), + ('࿇', '࿌'), + ('࿎', '࿚'), + ('က', 'ာ'), + ('ေ', 'ေ'), + ('း', 'း'), + ('ျ', 'ြ'), + ('ဿ', 'ၗ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႃ', 'ႄ'), + ('ႇ', 'ႌ'), + ('ႎ', 'ႜ'), + ('႞', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('፠', '፼'), + ('ᎀ', '᎙'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('᐀', '᚜'), + ('ᚠ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('᜕', '᜕'), + ('ᜟ', 'ᜱ'), + ('᜴', '᜶'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ា', 'ា'), + ('ើ', 'ៅ'), + ('ះ', 'ៈ'), + ('។', 'ៜ'), + ('០', '៩'), + ('៰', '៹'), + ('᠀', '᠊'), + ('᠐', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᤣ', 'ᤦ'), + ('ᤩ', 'ᤫ'), + ('ᤰ', 'ᤱ'), + ('ᤳ', 'ᤸ'), + ('᥀', '᥀'), + ('᥄', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧚'), + ('᧞', 'ᨖ'), + ('ᨙ', 'ᨚ'), + ('᨞', 'ᩕ'), + ('ᩗ', 'ᩗ'), + ('ᩡ', 'ᩡ'), + ('ᩣ', 'ᩤ'), + ('ᩭ', 'ᩲ'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), + ('ᬄ', 'ᬳ'), + ('ᬻ', 'ᬻ'), + ('ᬽ', 'ᭁ'), + ('ᭃ', 'ᭌ'), + ('᭐', '᭪'), + ('᭴', '᭾'), + ('ᮂ', 'ᮡ'), + ('ᮦ', 'ᮧ'), + ('᮪', '᮪'), + ('ᮮ', 'ᯥ'), + ('ᯧ', 'ᯧ'), + ('ᯪ', 'ᯬ'), + ('ᯮ', 'ᯮ'), + ('᯲', '᯳'), + ('᯼', 'ᰫ'), + ('ᰴ', 'ᰵ'), + ('᰻', '᱉'), + ('ᱍ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', '᳇'), + ('᳓', '᳓'), + ('᳡', '᳡'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', '᳷'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('\u{2000}', '\u{200a}'), + ('‐', '‧'), + ('\u{202f}', '\u{205f}'), + ('⁰', 'ⁱ'), + ('⁴', '₎'), + ('ₐ', 'ₜ'), + ('₠', '⃀'), + ('℀', '↋'), + ('←', '␦'), + ('⑀', '⑊'), + ('①', '⭳'), + ('⭶', '⮕'), + ('⮗', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('⳹', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', '⵰'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('⸀', '⹝'), + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('⿰', '⿻'), + ('\u{3000}', '〩'), + ('〰', '〿'), + ('ぁ', 'ゖ'), + ('゛', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('㆐', '㇣'), + ('ㇰ', '㈞'), + ('㈠', 'ꒌ'), + ('꒐', '꓆'), + ('ꓐ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('꙳', '꙳'), + ('꙾', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('꛲', '꛷'), + ('꜀', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠤ'), + ('ꠧ', '꠫'), + ('꠰', '꠹'), + ('ꡀ', '꡷'), + ('ꢀ', 'ꣃ'), + ('꣎', '꣙'), + ('ꣲ', 'ꣾ'), + ('꤀', 'ꤥ'), + ('꤮', 'ꥆ'), + ('ꥒ', '꥓'), + ('꥟', 'ꥼ'), + ('ꦃ', 'ꦲ'), + ('ꦴ', 'ꦵ'), + ('ꦺ', 'ꦻ'), + ('ꦾ', '꧍'), + ('ꧏ', '꧙'), + ('꧞', 'ꧤ'), + ('ꧦ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꨯ', 'ꨰ'), + ('ꨳ', 'ꨴ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩍ', 'ꩍ'), + ('꩐', '꩙'), + ('꩜', 'ꩻ'), + ('ꩽ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫫ'), + ('ꫮ', 'ꫵ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', '꭫'), + ('ꭰ', 'ꯤ'), + ('ꯦ', 'ꯧ'), + ('ꯩ', '꯬'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', '﯂'), + ('ﯓ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('﷏', '﷏'), + ('ﷰ', '﷿'), + ('︐', '︙'), + ('︰', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('!', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('¢', '₩'), + ('│', '○'), + ('', '�'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐆎'), + ('𐆐', '𐆜'), + ('𐆠', '𐆠'), + ('𐇐', '𐇼'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐋡', '𐋻'), + ('𐌀', '𐌣'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎟', '𐏃'), + ('𐏈', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕯', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡗', '𐢞'), + ('𐢧', '𐢯'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐣻', '𐤛'), + ('𐤟', '𐤹'), + ('𐤿', '𐤿'), + ('𐦀', '𐦷'), + ('𐦼', '𐧏'), + ('𐧒', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩀', '𐩈'), + ('𐩐', '𐩘'), + ('𐩠', '𐪟'), + ('𐫀', '𐫤'), + ('𐫫', '𐫶'), + ('𐬀', '𐬵'), + ('𐬹', '𐭕'), + ('𐭘', '𐭲'), + ('𐭸', '𐮑'), + ('𐮙', '𐮜'), + ('𐮩', '𐮯'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐳺', '𐴣'), + ('𐴰', '𐴹'), + ('𐹠', '𐹾'), + ('𐺀', '𐺩'), + ('𐺭', '𐺭'), + ('𐺰', '𐺱'), + ('𐼀', '𐼧'), + ('𐼰', '𐽅'), + ('𐽑', '𐽙'), + ('𐽰', '𐾁'), + ('𐾆', '𐾉'), + ('𐾰', '𐿋'), + ('𐿠', '𐿶'), + ('𑀀', '𑀀'), + ('𑀂', '𑀷'), + ('𑁇', '𑁍'), + ('𑁒', '𑁯'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂂', '𑂲'), + ('𑂷', '𑂸'), + ('𑂻', '𑂼'), + ('𑂾', '𑃁'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('𑄃', '𑄦'), + ('𑄬', '𑄬'), + ('𑄶', '𑅇'), + ('𑅐', '𑅲'), + ('𑅴', '𑅶'), + ('𑆂', '𑆵'), + ('𑆿', '𑇈'), + ('𑇍', '𑇎'), + ('𑇐', '𑇟'), + ('𑇡', '𑇴'), + ('𑈀', '𑈑'), + ('𑈓', '𑈮'), + ('𑈲', '𑈳'), + ('𑈵', '𑈵'), + ('𑈸', '𑈽'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊩'), + ('𑊰', '𑋞'), + ('𑋠', '𑋢'), + ('𑋰', '𑋹'), + ('𑌂', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑌿', '𑌿'), + ('𑍁', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('𑍝', '𑍣'), + ('𑐀', '𑐷'), + ('𑑀', '𑑁'), + ('𑑅', '𑑅'), + ('𑑇', '𑑛'), + ('𑑝', '𑑝'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑒱', '𑒲'), + ('𑒹', '𑒹'), + ('𑒻', '𑒼'), + ('𑒾', '𑒾'), + ('𑓁', '𑓁'), + ('𑓄', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '𑖮'), + ('𑖰', '𑖱'), + ('𑖸', '𑖻'), + ('𑖾', '𑖾'), + ('𑗁', '𑗛'), + ('𑘀', '𑘲'), + ('𑘻', '𑘼'), + ('𑘾', '𑘾'), + ('𑙁', '𑙄'), + ('𑙐', '𑙙'), + ('𑙠', '𑙬'), + ('𑚀', '𑚪'), + ('𑚬', '𑚬'), + ('𑚮', '𑚯'), + ('𑚶', '𑚶'), + ('𑚸', '𑚹'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('𑜠', '𑜡'), + ('𑜦', '𑜦'), + ('𑜰', '𑝆'), + ('𑠀', '𑠮'), + ('𑠸', '𑠸'), + ('𑠻', '𑠻'), + ('𑢠', '𑣲'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤱', '𑤵'), + ('𑤷', '𑤸'), + ('𑤽', '𑤽'), + ('𑤿', '𑥂'), + ('𑥄', '𑥆'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '𑧓'), + ('𑧜', '𑧟'), + ('𑧡', '𑧤'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨹', '𑨺'), + ('𑨿', '𑩆'), + ('𑩐', '𑩐'), + ('𑩗', '𑩘'), + ('𑩜', '𑪉'), + ('𑪗', '𑪗'), + ('𑪚', '𑪢'), + ('𑪰', '𑫸'), + ('𑬀', '𑬉'), + ('𑰀', '𑰈'), + ('𑰊', '𑰯'), + ('𑰾', '𑰾'), + ('𑱀', '𑱅'), + ('𑱐', '𑱬'), + ('𑱰', '𑲏'), + ('𑲩', '𑲩'), + ('𑲱', '𑲱'), + ('𑲴', '𑲴'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('𑶓', '𑶔'), + ('𑶖', '𑶖'), + ('𑶘', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻲'), + ('𑻵', '𑻸'), + ('𑼂', '𑼐'), + ('𑼒', '𑼵'), + ('𑼾', '𑼿'), + ('𑽁', '𑽁'), + ('𑽃', '𑽙'), + ('𑾰', '𑾰'), + ('𑿀', '𑿱'), + ('𑿿', '𒎙'), + ('𒐀', '𒑮'), + ('𒑰', '𒑴'), + ('𒒀', '𒕃'), + ('𒾐', '𒿲'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩮', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('𖫵', '𖫵'), + ('𖬀', '𖬯'), + ('𖬷', '𖭅'), + ('𖭐', '𖭙'), + ('𖭛', '𖭡'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖺚'), + ('𖼀', '𖽊'), + ('𖽐', '𖾇'), + ('𖾓', '𖾟'), + ('𖿠', '𖿣'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𛲜', '𛲜'), + ('𛲟', '𛲟'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅘𝅥𝅲'), + ('𝅦', '𝅦'), + ('𝅪', '𝅭'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝈀', '𝉁'), + ('𝉅', '𝉅'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝌀', '𝍖'), + ('𝍠', '𝍸'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝧿'), + ('𝨷', '𝨺'), + ('𝩭', '𝩴'), + ('𝩶', '𝪃'), + ('𝪅', '𝪋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅏'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞋰', '𞋹'), + ('𞋿', '𞋿'), + ('𞓐', '𞓫'), + ('𞓰', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞣇', '𞣏'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞥐', '𞥙'), + ('𞥞', '𞥟'), + ('𞱱', '𞲴'), + ('𞴁', '𞴽'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄀', '🆭'), + ('🇦', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', '\u{902}'), + ('\u{93a}', '\u{93a}'), + ('\u{93c}', '\u{93c}'), + ('\u{941}', '\u{948}'), + ('\u{94d}', '\u{94d}'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', '\u{981}'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9be}'), + ('\u{9c1}', '\u{9c4}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', '\u{a02}'), + ('\u{a3c}', '\u{a3c}'), + ('\u{a41}', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', '\u{a82}'), + ('\u{abc}', '\u{abc}'), + ('\u{ac1}', '\u{ac5}'), + ('\u{ac7}', '\u{ac8}'), + ('\u{acd}', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', '\u{b01}'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b3f}'), + ('\u{b41}', '\u{b44}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bc0}', '\u{bc0}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c00}'), + ('\u{c04}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', '\u{c40}'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', '\u{c81}'), + ('\u{cbc}', '\u{cbc}'), + ('\u{cbf}', '\u{cbf}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cc6}', '\u{cc6}'), + ('\u{ccc}', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('\u{d00}', '\u{d01}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d41}', '\u{d44}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', '\u{d81}'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{dd2}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('\u{f71}', '\u{f7e}'), + ('\u{f80}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('\u{102d}', '\u{1030}'), + ('\u{1032}', '\u{1037}'), + ('\u{1039}', '\u{103a}'), + ('\u{103d}', '\u{103e}'), + ('\u{1058}', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{1082}'), + ('\u{1085}', '\u{1086}'), + ('\u{108d}', '\u{108d}'), + ('\u{109d}', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '\u{1714}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17b5}'), + ('\u{17b7}', '\u{17bd}'), + ('\u{17c6}', '\u{17c6}'), + ('\u{17c9}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', '\u{1922}'), + ('\u{1927}', '\u{1928}'), + ('\u{1932}', '\u{1932}'), + ('\u{1939}', '\u{193b}'), + ('\u{1a17}', '\u{1a18}'), + ('\u{1a1b}', '\u{1a1b}'), + ('\u{1a56}', '\u{1a56}'), + ('\u{1a58}', '\u{1a5e}'), + ('\u{1a60}', '\u{1a60}'), + ('\u{1a62}', '\u{1a62}'), + ('\u{1a65}', '\u{1a6c}'), + ('\u{1a73}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', '\u{1b03}'), + ('\u{1b34}', '\u{1b3a}'), + ('\u{1b3c}', '\u{1b3c}'), + ('\u{1b42}', '\u{1b42}'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '\u{1b81}'), + ('\u{1ba2}', '\u{1ba5}'), + ('\u{1ba8}', '\u{1ba9}'), + ('\u{1bab}', '\u{1bad}'), + ('\u{1be6}', '\u{1be6}'), + ('\u{1be8}', '\u{1be9}'), + ('\u{1bed}', '\u{1bed}'), + ('\u{1bef}', '\u{1bf1}'), + ('\u{1c2c}', '\u{1c33}'), + ('\u{1c36}', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('\u{a825}', '\u{a826}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '\u{a951}'), + ('\u{a980}', '\u{a982}'), + ('\u{a9b3}', '\u{a9b3}'), + ('\u{a9b6}', '\u{a9b9}'), + ('\u{a9bc}', '\u{a9bd}'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa2e}'), + ('\u{aa31}', '\u{aa32}'), + ('\u{aa35}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', '\u{aa4c}'), + ('\u{aa7c}', '\u{aa7c}'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('\u{aaec}', '\u{aaed}'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abe5}', '\u{abe5}'), + ('\u{abe8}', '\u{abe8}'), + ('\u{abed}', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('\u{11001}', '\u{11001}'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '\u{11081}'), + ('\u{110b3}', '\u{110b6}'), + ('\u{110b9}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{1112b}'), + ('\u{1112d}', '\u{11134}'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '\u{11181}'), + ('\u{111b6}', '\u{111be}'), + ('\u{111c9}', '\u{111cc}'), + ('\u{111cf}', '\u{111cf}'), + ('\u{1122f}', '\u{11231}'), + ('\u{11234}', '\u{11234}'), + ('\u{11236}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112df}'), + ('\u{112e3}', '\u{112ea}'), + ('\u{11300}', '\u{11301}'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11340}', '\u{11340}'), + ('\u{11357}', '\u{11357}'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('\u{11438}', '\u{1143f}'), + ('\u{11442}', '\u{11444}'), + ('\u{11446}', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114b3}', '\u{114b8}'), + ('\u{114ba}', '\u{114ba}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{114bf}', '\u{114c0}'), + ('\u{114c2}', '\u{114c3}'), + ('\u{115af}', '\u{115af}'), + ('\u{115b2}', '\u{115b5}'), + ('\u{115bc}', '\u{115bd}'), + ('\u{115bf}', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('\u{11633}', '\u{1163a}'), + ('\u{1163d}', '\u{1163d}'), + ('\u{1163f}', '\u{11640}'), + ('\u{116ab}', '\u{116ab}'), + ('\u{116ad}', '\u{116ad}'), + ('\u{116b0}', '\u{116b5}'), + ('\u{116b7}', '\u{116b7}'), + ('\u{1171d}', '\u{1171f}'), + ('\u{11722}', '\u{11725}'), + ('\u{11727}', '\u{1172b}'), + ('\u{1182f}', '\u{11837}'), + ('\u{11839}', '\u{1183a}'), + ('\u{11930}', '\u{11930}'), + ('\u{1193b}', '\u{1193c}'), + ('\u{1193e}', '\u{1193e}'), + ('\u{11943}', '\u{11943}'), + ('\u{119d4}', '\u{119d7}'), + ('\u{119da}', '\u{119db}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '\u{11a38}'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a56}'), + ('\u{11a59}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a96}'), + ('\u{11a98}', '\u{11a99}'), + ('\u{11c30}', '\u{11c36}'), + ('\u{11c38}', '\u{11c3d}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('\u{11caa}', '\u{11cb0}'), + ('\u{11cb2}', '\u{11cb3}'), + ('\u{11cb5}', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('\u{11d90}', '\u{11d91}'), + ('\u{11d95}', '\u{11d95}'), + ('\u{11d97}', '\u{11d97}'), + ('\u{11ef3}', '\u{11ef4}'), + ('\u{11f00}', '\u{11f01}'), + ('\u{11f36}', '\u{11f3a}'), + ('\u{11f40}', '\u{11f40}'), + ('\u{11f42}', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const GRAPHEME_LINK: &'static [(char, char)] = &[ + ('\u{94d}', '\u{94d}'), + ('\u{9cd}', '\u{9cd}'), + ('\u{a4d}', '\u{a4d}'), + ('\u{acd}', '\u{acd}'), + ('\u{b4d}', '\u{b4d}'), + ('\u{bcd}', '\u{bcd}'), + ('\u{c4d}', '\u{c4d}'), + ('\u{ccd}', '\u{ccd}'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d4d}', '\u{d4d}'), + ('\u{dca}', '\u{dca}'), + ('\u{e3a}', '\u{e3a}'), + ('\u{eba}', '\u{eba}'), + ('\u{f84}', '\u{f84}'), + ('\u{1039}', '\u{103a}'), + ('\u{1714}', '᜕'), + ('᜴', '᜴'), + ('\u{17d2}', '\u{17d2}'), + ('\u{1a60}', '\u{1a60}'), + ('᭄', '᭄'), + ('᮪', '\u{1bab}'), + ('᯲', '᯳'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{a806}', '\u{a806}'), + ('\u{a82c}', '\u{a82c}'), + ('\u{a8c4}', '\u{a8c4}'), + ('꥓', '꥓'), + ('꧀', '꧀'), + ('\u{aaf6}', '\u{aaf6}'), + ('\u{abed}', '\u{abed}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{11046}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{1107f}', '\u{1107f}'), + ('\u{110b9}', '\u{110b9}'), + ('\u{11133}', '\u{11134}'), + ('𑇀', '𑇀'), + ('𑈵', '𑈵'), + ('\u{112ea}', '\u{112ea}'), + ('𑍍', '𑍍'), + ('\u{11442}', '\u{11442}'), + ('\u{114c2}', '\u{114c2}'), + ('\u{115bf}', '\u{115bf}'), + ('\u{1163f}', '\u{1163f}'), + ('𑚶', '𑚶'), + ('\u{1172b}', '\u{1172b}'), + ('\u{11839}', '\u{11839}'), + ('𑤽', '\u{1193e}'), + ('\u{119e0}', '\u{119e0}'), + ('\u{11a34}', '\u{11a34}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a99}', '\u{11a99}'), + ('\u{11c3f}', '\u{11c3f}'), + ('\u{11d44}', '\u{11d45}'), + ('\u{11d97}', '\u{11d97}'), + ('𑽁', '\u{11f42}'), +]; + +pub const HEX_DIGIT: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'F'), + ('a', 'f'), + ('0', '9'), + ('A', 'F'), + ('a', 'f'), +]; + +pub const HYPHEN: &'static [(char, char)] = &[ + ('-', '-'), + ('\u{ad}', '\u{ad}'), + ('֊', '֊'), + ('᠆', '᠆'), + ('‐', '‑'), + ('⸗', '⸗'), + ('・', '・'), + ('﹣', '﹣'), + ('-', '-'), + ('・', '・'), +]; + +pub const IDS_BINARY_OPERATOR: &'static [(char, char)] = + &[('⿰', '⿱'), ('⿴', '⿻')]; + +pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[('⿲', '⿳')]; + +pub const ID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('·', '·'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', '\u{487}'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('፩', '፱'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧚'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const ID_START: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('゛', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const IDEOGRAPHIC: &'static [(char, char)] = &[ + ('〆', '〇'), + ('〡', '〩'), + ('〸', '〺'), + ('㐀', '䶿'), + ('一', '鿿'), + ('豈', '舘'), + ('並', '龎'), + ('\u{16fe4}', '\u{16fe4}'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𛅰', '𛋻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const JOIN_CONTROL: &'static [(char, char)] = &[('\u{200c}', '\u{200d}')]; + +pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[ + ('เ', 'ไ'), + ('ເ', 'ໄ'), + ('ᦵ', 'ᦷ'), + ('ᦺ', 'ᦺ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪹ'), + ('ꪻ', 'ꪼ'), +]; + +pub const LOWERCASE: &'static [(char, char)] = &[ + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ĸ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), + ('ƽ', 'ƿ'), + ('dž', 'dž'), + ('lj', 'lj'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȹ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ʓ'), + ('ʕ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϼ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ՠ', 'ֈ'), + ('ა', 'ჺ'), + ('ჼ', 'ჿ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᴀ', 'ᶿ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℊ', 'ℊ'), + ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), + ('ℴ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), + ('ⱳ', 'ⱴ'), + ('ⱶ', 'ⱽ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚝ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝸ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞯ', 'ꞯ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟕ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟲ', 'ꟴ'), + ('ꟶ', 'ꟶ'), + ('ꟸ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𝐚', '𝐳'), + ('𝑎', '𝑔'), + ('𝑖', '𝑧'), + ('𝒂', '𝒛'), + ('𝒶', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝓏'), + ('𝓪', '𝔃'), + ('𝔞', '𝔷'), + ('𝕒', '𝕫'), + ('𝖆', '𝖟'), + ('𝖺', '𝗓'), + ('𝗮', '𝘇'), + ('𝘢', '𝘻'), + ('𝙖', '𝙯'), + ('𝚊', '𝚥'), + ('𝛂', '𝛚'), + ('𝛜', '𝛡'), + ('𝛼', '𝜔'), + ('𝜖', '𝜛'), + ('𝜶', '𝝎'), + ('𝝐', '𝝕'), + ('𝝰', '𝞈'), + ('𝞊', '𝞏'), + ('𝞪', '𝟂'), + ('𝟄', '𝟉'), + ('𝟋', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞤢', '𞥃'), +]; + +pub const MATH: &'static [(char, char)] = &[ + ('+', '+'), + ('<', '>'), + ('^', '^'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('±', '±'), + ('×', '×'), + ('÷', '÷'), + ('ϐ', 'ϒ'), + ('ϕ', 'ϕ'), + ('ϰ', 'ϱ'), + ('ϴ', '϶'), + ('؆', '؈'), + ('‖', '‖'), + ('′', '‴'), + ('⁀', '⁀'), + ('⁄', '⁄'), + ('⁒', '⁒'), + ('\u{2061}', '\u{2064}'), + ('⁺', '⁾'), + ('₊', '₎'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20e6}'), + ('\u{20eb}', '\u{20ef}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('ℨ', '℩'), + ('ℬ', 'ℭ'), + ('ℯ', 'ℱ'), + ('ℳ', 'ℸ'), + ('ℼ', 'ⅉ'), + ('⅋', '⅋'), + ('←', '↧'), + ('↩', '↮'), + ('↰', '↱'), + ('↶', '↷'), + ('↼', '⇛'), + ('⇝', '⇝'), + ('⇤', '⇥'), + ('⇴', '⋿'), + ('⌈', '⌋'), + ('⌠', '⌡'), + ('⍼', '⍼'), + ('⎛', '⎵'), + ('⎷', '⎷'), + ('⏐', '⏐'), + ('⏜', '⏢'), + ('■', '□'), + ('▮', '▷'), + ('▼', '◁'), + ('◆', '◇'), + ('◊', '○'), + ('●', '◓'), + ('◢', '◢'), + ('◤', '◤'), + ('◧', '◬'), + ('◸', '◿'), + ('★', '☆'), + ('♀', '♀'), + ('♂', '♂'), + ('♠', '♣'), + ('♭', '♯'), + ('⟀', '⟿'), + ('⤀', '⫿'), + ('⬰', '⭄'), + ('⭇', '⭌'), + ('﬩', '﬩'), + ('﹡', '﹦'), + ('﹨', '﹨'), + ('+', '+'), + ('<', '>'), + ('\', '\'), + ('^', '^'), + ('|', '|'), + ('~', '~'), + ('¬', '¬'), + ('←', '↓'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝟿'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), +]; + +pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[ + ('\u{fdd0}', '\u{fdef}'), + ('\u{fffe}', '\u{ffff}'), + ('\u{1fffe}', '\u{1ffff}'), + ('\u{2fffe}', '\u{2ffff}'), + ('\u{3fffe}', '\u{3ffff}'), + ('\u{4fffe}', '\u{4ffff}'), + ('\u{5fffe}', '\u{5ffff}'), + ('\u{6fffe}', '\u{6ffff}'), + ('\u{7fffe}', '\u{7ffff}'), + ('\u{8fffe}', '\u{8ffff}'), + ('\u{9fffe}', '\u{9ffff}'), + ('\u{afffe}', '\u{affff}'), + ('\u{bfffe}', '\u{bffff}'), + ('\u{cfffe}', '\u{cffff}'), + ('\u{dfffe}', '\u{dffff}'), + ('\u{efffe}', '\u{effff}'), + ('\u{ffffe}', '\u{fffff}'), + ('\u{10fffe}', '\u{10ffff}'), +]; + +pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[ + ('\u{345}', '\u{345}'), + ('\u{5b0}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{657}'), + ('\u{659}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6e1}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ed}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{73f}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{816}', '\u{817}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82c}'), + ('\u{8d4}', '\u{8df}'), + ('\u{8e3}', '\u{8e9}'), + ('\u{8f0}', 'ः'), + ('\u{93a}', 'ऻ'), + ('ा', 'ौ'), + ('ॎ', 'ॏ'), + ('\u{955}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৌ'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{a01}', 'ਃ'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4c}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', 'ૌ'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{afc}'), + ('\u{b01}', 'ଃ'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', 'ୌ'), + ('\u{b56}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', 'ௌ'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4c}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccc}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൌ'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e4d}', '\u{e4d}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{eb9}'), + ('\u{ebb}', '\u{ebc}'), + ('\u{ecd}', '\u{ecd}'), + ('\u{f71}', '\u{f83}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('ါ', '\u{1036}'), + ('း', 'း'), + ('ျ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{1712}', '\u{1713}'), + ('\u{1732}', '\u{1733}'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('ា', 'ៈ'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', 'ᤸ'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('ᩡ', '\u{1a74}'), + ('\u{1abf}', '\u{1ac0}'), + ('\u{1acc}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b35}', 'ᭃ'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1ba9}'), + ('\u{1bac}', '\u{1bad}'), + ('ᯧ', '\u{1bf1}'), + ('ᰤ', '\u{1c36}'), + ('\u{1de7}', '\u{1df4}'), + ('Ⓐ', 'ⓩ'), + ('\u{2de0}', '\u{2dff}'), + ('\u{a674}', '\u{a67b}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a802}', '\u{a802}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('ꢀ', 'ꢁ'), + ('ꢴ', 'ꣃ'), + ('\u{a8c5}', '\u{a8c5}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92a}'), + ('\u{a947}', 'ꥒ'), + ('\u{a980}', 'ꦃ'), + ('ꦴ', 'ꦿ'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabe}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', 'ꫵ'), + ('ꯣ', 'ꯪ'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11045}'), + ('\u{11073}', '\u{11074}'), + ('\u{11080}', '𑂂'), + ('𑂰', '𑂸'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11132}'), + ('𑅅', '𑅆'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑆿'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11234}'), + ('\u{11237}', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112e8}'), + ('\u{11300}', '𑌃'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍌'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('𑐵', '𑑁'), + ('\u{11443}', '𑑅'), + ('\u{114b0}', '𑓁'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '𑖾'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '𑘾'), + ('\u{11640}', '\u{11640}'), + ('\u{116ab}', '\u{116b5}'), + ('\u{1171d}', '\u{1172a}'), + ('𑠬', '𑠸'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193c}'), + ('𑥀', '𑥀'), + ('𑥂', '𑥂'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '𑧟'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a35}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '𑪗'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '𑰾'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d41}'), + ('\u{11d43}', '\u{11d43}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶖'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f40}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('𖿰', '𖿱'), + ('\u{1bc9e}', '\u{1bc9e}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e947}', '\u{1e947}'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ + ('\u{34f}', '\u{34f}'), + ('ᅟ', 'ᅠ'), + ('\u{17b4}', '\u{17b5}'), + ('\u{2065}', '\u{2065}'), + ('ㅤ', 'ㅤ'), + ('ᅠ', 'ᅠ'), + ('\u{fff0}', '\u{fff8}'), + ('\u{e0000}', '\u{e0000}'), + ('\u{e0002}', '\u{e001f}'), + ('\u{e0080}', '\u{e00ff}'), + ('\u{e01f0}', '\u{e0fff}'), +]; + +pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ + ('\u{9be}', '\u{9be}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{b3e}', '\u{b3e}'), + ('\u{b57}', '\u{b57}'), + ('\u{bbe}', '\u{bbe}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{cc2}', '\u{cc2}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{d3e}', '\u{d3e}'), + ('\u{d57}', '\u{d57}'), + ('\u{dcf}', '\u{dcf}'), + ('\u{ddf}', '\u{ddf}'), + ('\u{1b35}', '\u{1b35}'), + ('\u{200c}', '\u{200c}'), + ('\u{302e}', '\u{302f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{1133e}', '\u{1133e}'), + ('\u{11357}', '\u{11357}'), + ('\u{114b0}', '\u{114b0}'), + ('\u{114bd}', '\u{114bd}'), + ('\u{115af}', '\u{115af}'), + ('\u{11930}', '\u{11930}'), + ('\u{1d165}', '\u{1d165}'), + ('\u{1d16e}', '\u{1d172}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const OTHER_ID_CONTINUE: &'static [(char, char)] = + &[('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚')]; + +pub const OTHER_ID_START: &'static [(char, char)] = + &[('\u{1885}', '\u{1886}'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜')]; + +pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ + ('ª', 'ª'), + ('º', 'º'), + ('ʰ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('\u{345}', '\u{345}'), + ('ͺ', 'ͺ'), + ('ჼ', 'ჼ'), + ('ᴬ', 'ᵪ'), + ('ᵸ', 'ᵸ'), + ('ᶛ', 'ᶿ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ⅰ', 'ⅿ'), + ('ⓐ', 'ⓩ'), + ('ⱼ', 'ⱽ'), + ('ꚜ', 'ꚝ'), + ('ꝰ', 'ꝰ'), + ('ꟲ', 'ꟴ'), + ('ꟸ', 'ꟹ'), + ('ꭜ', 'ꭟ'), + ('ꭩ', 'ꭩ'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𞀰', '𞁭'), +]; + +pub const OTHER_MATH: &'static [(char, char)] = &[ + ('^', '^'), + ('ϐ', 'ϒ'), + ('ϕ', 'ϕ'), + ('ϰ', 'ϱ'), + ('ϴ', 'ϵ'), + ('‖', '‖'), + ('′', '‴'), + ('⁀', '⁀'), + ('\u{2061}', '\u{2064}'), + ('⁽', '⁾'), + ('₍', '₎'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20e6}'), + ('\u{20eb}', '\u{20ef}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('ℨ', '℩'), + ('ℬ', 'ℭ'), + ('ℯ', 'ℱ'), + ('ℳ', 'ℸ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('↕', '↙'), + ('↜', '↟'), + ('↡', '↢'), + ('↤', '↥'), + ('↧', '↧'), + ('↩', '↭'), + ('↰', '↱'), + ('↶', '↷'), + ('↼', '⇍'), + ('⇐', '⇑'), + ('⇓', '⇓'), + ('⇕', '⇛'), + ('⇝', '⇝'), + ('⇤', '⇥'), + ('⌈', '⌋'), + ('⎴', '⎵'), + ('⎷', '⎷'), + ('⏐', '⏐'), + ('⏢', '⏢'), + ('■', '□'), + ('▮', '▶'), + ('▼', '◀'), + ('◆', '◇'), + ('◊', '○'), + ('●', '◓'), + ('◢', '◢'), + ('◤', '◤'), + ('◧', '◬'), + ('★', '☆'), + ('♀', '♀'), + ('♂', '♂'), + ('♠', '♣'), + ('♭', '♮'), + ('⟅', '⟆'), + ('⟦', '⟯'), + ('⦃', '⦘'), + ('⧘', '⧛'), + ('⧼', '⧽'), + ('﹡', '﹡'), + ('﹣', '﹣'), + ('﹨', '﹨'), + ('\', '\'), + ('^', '^'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), +]; + +pub const OTHER_UPPERCASE: &'static [(char, char)] = + &[('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉')]; + +pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ + ('!', '/'), + (':', '@'), + ('[', '^'), + ('`', '`'), + ('{', '~'), + ('¡', '§'), + ('©', '©'), + ('«', '¬'), + ('®', '®'), + ('°', '±'), + ('¶', '¶'), + ('»', '»'), + ('¿', '¿'), + ('×', '×'), + ('÷', '÷'), + ('‐', '‧'), + ('‰', '‾'), + ('⁁', '⁓'), + ('⁕', '⁞'), + ('←', '\u{245f}'), + ('─', '❵'), + ('➔', '⯿'), + ('⸀', '\u{2e7f}'), + ('、', '〃'), + ('〈', '〠'), + ('〰', '〰'), + ('﴾', '﴿'), + ('﹅', '﹆'), +]; + +pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{200e}', '\u{200f}'), + ('\u{2028}', '\u{2029}'), +]; + +pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[ + ('\u{600}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), +]; + +pub const QUOTATION_MARK: &'static [(char, char)] = &[ + ('"', '"'), + ('\'', '\''), + ('«', '«'), + ('»', '»'), + ('‘', '‟'), + ('‹', '›'), + ('⹂', '⹂'), + ('「', '』'), + ('〝', '〟'), + ('﹁', '﹄'), + ('"', '"'), + (''', '''), + ('「', '」'), +]; + +pub const RADICAL: &'static [(char, char)] = + &[('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕')]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; + +pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[ + ('!', '!'), + ('.', '.'), + ('?', '?'), + ('։', '։'), + ('؝', '؟'), + ('۔', '۔'), + ('܀', '܂'), + ('߹', '߹'), + ('࠷', '࠷'), + ('࠹', '࠹'), + ('࠽', '࠾'), + ('।', '॥'), + ('၊', '။'), + ('።', '።'), + ('፧', '፨'), + ('᙮', '᙮'), + ('᜵', '᜶'), + ('᠃', '᠃'), + ('᠉', '᠉'), + ('᥄', '᥅'), + ('᪨', '᪫'), + ('᭚', '᭛'), + ('᭞', '᭟'), + ('᭽', '᭾'), + ('᰻', '᰼'), + ('᱾', '᱿'), + ('‼', '‽'), + ('⁇', '⁉'), + ('⸮', '⸮'), + ('⸼', '⸼'), + ('⹓', '⹔'), + ('。', '。'), + ('꓿', '꓿'), + ('꘎', '꘏'), + ('꛳', '꛳'), + ('꛷', '꛷'), + ('꡶', '꡷'), + ('꣎', '꣏'), + ('꤯', '꤯'), + ('꧈', '꧉'), + ('꩝', '꩟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﹒', '﹒'), + ('﹖', '﹗'), + ('!', '!'), + ('.', '.'), + ('?', '?'), + ('。', '。'), + ('𐩖', '𐩗'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁈'), + ('𑂾', '𑃁'), + ('𑅁', '𑅃'), + ('𑇅', '𑇆'), + ('𑇍', '𑇍'), + ('𑇞', '𑇟'), + ('𑈸', '𑈹'), + ('𑈻', '𑈼'), + ('𑊩', '𑊩'), + ('𑑋', '𑑌'), + ('𑗂', '𑗃'), + ('𑗉', '𑗗'), + ('𑙁', '𑙂'), + ('𑜼', '𑜾'), + ('𑥄', '𑥄'), + ('𑥆', '𑥆'), + ('𑩂', '𑩃'), + ('𑪛', '𑪜'), + ('𑱁', '𑱂'), + ('𑻷', '𑻸'), + ('𑽃', '𑽄'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬸'), + ('𖭄', '𖭄'), + ('𖺘', '𖺘'), + ('𛲟', '𛲟'), + ('𝪈', '𝪈'), +]; + +pub const SOFT_DOTTED: &'static [(char, char)] = &[ + ('i', 'j'), + ('į', 'į'), + ('ɉ', 'ɉ'), + ('ɨ', 'ɨ'), + ('ʝ', 'ʝ'), + ('ʲ', 'ʲ'), + ('ϳ', 'ϳ'), + ('і', 'і'), + ('ј', 'ј'), + ('ᵢ', 'ᵢ'), + ('ᶖ', 'ᶖ'), + ('ᶤ', 'ᶤ'), + ('ᶨ', 'ᶨ'), + ('ḭ', 'ḭ'), + ('ị', 'ị'), + ('ⁱ', 'ⁱ'), + ('ⅈ', 'ⅉ'), + ('ⱼ', 'ⱼ'), + ('𝐢', '𝐣'), + ('𝑖', '𝑗'), + ('𝒊', '𝒋'), + ('𝒾', '𝒿'), + ('𝓲', '𝓳'), + ('𝔦', '𝔧'), + ('𝕚', '𝕛'), + ('𝖎', '𝖏'), + ('𝗂', '𝗃'), + ('𝗶', '𝗷'), + ('𝘪', '𝘫'), + ('𝙞', '𝙟'), + ('𝚒', '𝚓'), + ('𝼚', '𝼚'), + ('𞁌', '𞁍'), + ('𞁨', '𞁨'), +]; + +pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[ + ('!', '!'), + (',', ','), + ('.', '.'), + (':', ';'), + ('?', '?'), + (';', ';'), + ('·', '·'), + ('։', '։'), + ('׃', '׃'), + ('،', '،'), + ('؛', '؛'), + ('؝', '؟'), + ('۔', '۔'), + ('܀', '܊'), + ('܌', '܌'), + ('߸', '߹'), + ('࠰', '࠾'), + ('࡞', '࡞'), + ('।', '॥'), + ('๚', '๛'), + ('༈', '༈'), + ('།', '༒'), + ('၊', '။'), + ('፡', '፨'), + ('᙮', '᙮'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('។', '៖'), + ('៚', '៚'), + ('᠂', '᠅'), + ('᠈', '᠉'), + ('᥄', '᥅'), + ('᪨', '᪫'), + ('᭚', '᭛'), + ('᭝', '᭟'), + ('᭽', '᭾'), + ('᰻', '᰿'), + ('᱾', '᱿'), + ('‼', '‽'), + ('⁇', '⁉'), + ('⸮', '⸮'), + ('⸼', '⸼'), + ('⹁', '⹁'), + ('⹌', '⹌'), + ('⹎', '⹏'), + ('⹓', '⹔'), + ('、', '。'), + ('꓾', '꓿'), + ('꘍', '꘏'), + ('꛳', '꛷'), + ('꡶', '꡷'), + ('꣎', '꣏'), + ('꤯', '꤯'), + ('꧇', '꧉'), + ('꩝', '꩟'), + ('꫟', '꫟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﹐', '﹒'), + ('﹔', '﹗'), + ('!', '!'), + (',', ','), + ('.', '.'), + (':', ';'), + ('?', '?'), + ('。', '。'), + ('、', '、'), + ('𐎟', '𐎟'), + ('𐏐', '𐏐'), + ('𐡗', '𐡗'), + ('𐤟', '𐤟'), + ('𐩖', '𐩗'), + ('𐫰', '𐫵'), + ('𐬺', '𐬿'), + ('𐮙', '𐮜'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁍'), + ('𑂾', '𑃁'), + ('𑅁', '𑅃'), + ('𑇅', '𑇆'), + ('𑇍', '𑇍'), + ('𑇞', '𑇟'), + ('𑈸', '𑈼'), + ('𑊩', '𑊩'), + ('𑑋', '𑑍'), + ('𑑚', '𑑛'), + ('𑗂', '𑗅'), + ('𑗉', '𑗗'), + ('𑙁', '𑙂'), + ('𑜼', '𑜾'), + ('𑥄', '𑥄'), + ('𑥆', '𑥆'), + ('𑩂', '𑩃'), + ('𑪛', '𑪜'), + ('𑪡', '𑪢'), + ('𑱁', '𑱃'), + ('𑱱', '𑱱'), + ('𑻷', '𑻸'), + ('𑽃', '𑽄'), + ('𒑰', '𒑴'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬹'), + ('𖭄', '𖭄'), + ('𖺗', '𖺘'), + ('𛲟', '𛲟'), + ('𝪇', '𝪊'), +]; + +pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[ + ('㐀', '䶿'), + ('一', '鿿'), + ('﨎', '﨏'), + ('﨑', '﨑'), + ('﨓', '﨔'), + ('﨟', '﨟'), + ('﨡', '﨡'), + ('﨣', '﨤'), + ('﨧', '﨩'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const UPPERCASE: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'DŽ'), + ('LJ', 'LJ'), + ('NJ', 'NJ'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'DZ'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('Ᾰ', 'Ά'), + ('Ὲ', 'Ή'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'Ώ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℋ', 'ℍ'), + ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℰ', 'ℳ'), + ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𝐀', '𝐙'), + ('𝐴', '𝑍'), + ('𝑨', '𝒁'), + ('𝒜', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒵'), + ('𝓐', '𝓩'), + ('𝔄', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔸', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕬', '𝖅'), + ('𝖠', '𝖹'), + ('𝗔', '𝗭'), + ('𝘈', '𝘡'), + ('𝘼', '𝙕'), + ('𝙰', '𝚉'), + ('𝚨', '𝛀'), + ('𝛢', '𝛺'), + ('𝜜', '𝜴'), + ('𝝖', '𝝮'), + ('𝞐', '𝞨'), + ('𝟊', '𝟊'), + ('𞤀', '𞤡'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const VARIATION_SELECTOR: &'static [(char, char)] = &[ + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const WHITE_SPACE: &'static [(char, char)] = &[ + ('\t', '\r'), + (' ', ' '), + ('\u{85}', '\u{85}'), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{2028}', '\u{2029}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const XID_CONTINUE: &'static [(char, char)] = &[ + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('·', '·'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('\u{300}', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('\u{483}', '\u{487}'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('\u{610}', '\u{61a}'), + ('ؠ', '٩'), + ('ٮ', 'ۓ'), + ('ە', '\u{6dc}'), + ('\u{6df}', '\u{6e8}'), + ('\u{6ea}', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', '\u{74a}'), + ('ݍ', 'ޱ'), + ('߀', 'ߵ'), + ('ߺ', 'ߺ'), + ('\u{7fd}', '\u{7fd}'), + ('ࠀ', '\u{82d}'), + ('ࡀ', '\u{85b}'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{963}'), + ('०', '९'), + ('ॱ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', 'ৱ'), + ('ৼ', 'ৼ'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૯'), + ('ૹ', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୯'), + ('ୱ', 'ୱ'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௯'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('ಀ', 'ಃ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', 'ൎ'), + ('ൔ', '\u{d57}'), + ('ൟ', '\u{d63}'), + ('൦', '൯'), + ('ൺ', 'ൿ'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', 'ෳ'), + ('ก', '\u{e3a}'), + ('เ', '\u{e4e}'), + ('๐', '๙'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('\u{f18}', '\u{f19}'), + ('༠', '༩'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('က', '၉'), + ('ၐ', '\u{109d}'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '\u{135f}'), + ('፩', '፱'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', '᜕'), + ('ᜟ', '᜴'), + ('ᝀ', '\u{1753}'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('\u{1772}', '\u{1773}'), + ('ក', '\u{17d3}'), + ('ៗ', 'ៗ'), + ('ៜ', '\u{17dd}'), + ('០', '៩'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥆', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('᧐', '᧚'), + ('ᨀ', '\u{1a1b}'), + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('ᪧ', 'ᪧ'), + ('\u{1ab0}', '\u{1abd}'), + ('\u{1abf}', '\u{1ace}'), + ('\u{1b00}', 'ᭌ'), + ('᭐', '᭙'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', '᯳'), + ('ᰀ', '\u{1c37}'), + ('᱀', '᱉'), + ('ᱍ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', 'ᳺ'), + ('ᴀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20d0}', '\u{20dc}'), + ('\u{20e1}', '\u{20e1}'), + ('\u{20e5}', '\u{20f0}'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('\u{2d7f}', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('\u{2de0}', '\u{2dff}'), + ('々', '〇'), + ('〡', '\u{302f}'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('\u{3099}', '\u{309a}'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘫ'), + ('Ꙁ', '\u{a66f}'), + ('\u{a674}', '\u{a67d}'), + ('ꙿ', '\u{a6f1}'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꡀ', 'ꡳ'), + ('ꢀ', '\u{a8c5}'), + ('꣐', '꣙'), + ('\u{a8e0}', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', '\u{a92d}'), + ('ꤰ', '꥓'), + ('ꥠ', 'ꥼ'), + ('\u{a980}', '꧀'), + ('ꧏ', '꧙'), + ('ꧠ', 'ꧾ'), + ('ꨀ', '\u{aa36}'), + ('ꩀ', 'ꩍ'), + ('꩐', '꩙'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫯ'), + ('ꫲ', '\u{aaf6}'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('꯰', '꯹'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﱝ'), + ('ﱤ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷹ'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('︳', '︴'), + ('﹍', '﹏'), + ('ﹱ', 'ﹱ'), + ('ﹳ', 'ﹳ'), + ('ﹷ', 'ﹷ'), + ('ﹹ', 'ﹹ'), + ('ﹻ', 'ﹻ'), + ('ﹽ', 'ﹽ'), + ('ﹿ', 'ﻼ'), + ('0', '9'), + ('A', 'Z'), + ('_', '_'), + ('a', 'z'), + ('ヲ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('\u{101fd}', '\u{101fd}'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('\u{102e0}', '\u{102e0}'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '\u{1037a}'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒠', '𐒩'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '\u{10ae6}'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), + ('𐺀', '𐺩'), + ('\u{10eab}', '\u{10eac}'), + ('𐺰', '𐺱'), + ('\u{10efd}', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '\u{10f50}'), + ('𐽰', '\u{10f85}'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀀', '\u{11046}'), + ('𑁦', '𑁵'), + ('\u{1107f}', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('𑃐', '𑃨'), + ('𑃰', '𑃹'), + ('\u{11100}', '\u{11134}'), + ('𑄶', '𑄿'), + ('𑅄', '𑅇'), + ('𑅐', '\u{11173}'), + ('𑅶', '𑅶'), + ('\u{11180}', '𑇄'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '\u{11237}'), + ('\u{1123e}', '\u{11241}'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '\u{112ea}'), + ('𑋰', '𑋹'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐀', '𑑊'), + ('𑑐', '𑑙'), + ('\u{1145e}', '𑑡'), + ('𑒀', '𑓅'), + ('𑓇', '𑓇'), + ('𑓐', '𑓙'), + ('𑖀', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('𑗘', '\u{115dd}'), + ('𑘀', '\u{11640}'), + ('𑙄', '𑙄'), + ('𑙐', '𑙙'), + ('𑚀', '𑚸'), + ('𑛀', '𑛉'), + ('𑜀', '𑜚'), + ('\u{1171d}', '\u{1172b}'), + ('𑜰', '𑜹'), + ('𑝀', '𑝆'), + ('𑠀', '\u{1183a}'), + ('𑢠', '𑣩'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{11943}'), + ('𑥐', '𑥙'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧡'), + ('𑧣', '𑧤'), + ('𑨀', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('𑩐', '\u{11a99}'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '\u{11c36}'), + ('\u{11c38}', '𑱀'), + ('𑱐', '𑱙'), + ('𑱲', '𑲏'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), + ('𑻠', '𑻶'), + ('\u{11f00}', '𑼐'), + ('𑼒', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('𑽐', '𑽙'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('\u{13440}', '\u{13455}'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩠', '𖩩'), + ('𖩰', '𖪾'), + ('𖫀', '𖫉'), + ('𖫐', '𖫭'), + ('\u{16af0}', '\u{16af4}'), + ('𖬀', '\u{16b36}'), + ('𖭀', '𖭃'), + ('𖭐', '𖭙'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('\u{16f4f}', '𖾇'), + ('\u{16f8f}', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝟎', '𝟿'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), + ('𞄀', '𞄬'), + ('\u{1e130}', '𞄽'), + ('𞅀', '𞅉'), + ('𞅎', '𞅎'), + ('𞊐', '\u{1e2ae}'), + ('𞋀', '𞋹'), + ('𞓐', '𞓹'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('𞤀', '𞥋'), + ('𞥐', '𞥙'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🯰', '🯹'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const XID_START: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ˁ'), + ('ˆ', 'ˑ'), + ('ˠ', 'ˤ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͻ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', 'ՙ'), + ('ՠ', 'ֈ'), + ('א', 'ת'), + ('ׯ', 'ײ'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'า'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'າ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('℘', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('Ꜣ', 'ꞈ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﱝ'), + ('ﱤ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷹ'), + ('ﹱ', 'ﹱ'), + ('ﹳ', 'ﹳ'), + ('ﹷ', 'ﹷ'), + ('ﹹ', 'ﹹ'), + ('ﹻ', 'ﹻ'), + ('ﹽ', 'ﹽ'), + ('ﹿ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ヲ', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_names.rs b/vendor/regex-syntax/src/unicode_tables/property_names.rs new file mode 100644 index 0000000000..599a123ae5 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_names.rs @@ -0,0 +1,264 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-names ucd-15.0.0 +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[ + ("age", "Age"), + ("ahex", "ASCII_Hex_Digit"), + ("alpha", "Alphabetic"), + ("alphabetic", "Alphabetic"), + ("asciihexdigit", "ASCII_Hex_Digit"), + ("bc", "Bidi_Class"), + ("bidic", "Bidi_Control"), + ("bidiclass", "Bidi_Class"), + ("bidicontrol", "Bidi_Control"), + ("bidim", "Bidi_Mirrored"), + ("bidimirrored", "Bidi_Mirrored"), + ("bidimirroringglyph", "Bidi_Mirroring_Glyph"), + ("bidipairedbracket", "Bidi_Paired_Bracket"), + ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), + ("blk", "Block"), + ("block", "Block"), + ("bmg", "Bidi_Mirroring_Glyph"), + ("bpb", "Bidi_Paired_Bracket"), + ("bpt", "Bidi_Paired_Bracket_Type"), + ("canonicalcombiningclass", "Canonical_Combining_Class"), + ("cased", "Cased"), + ("casefolding", "Case_Folding"), + ("caseignorable", "Case_Ignorable"), + ("ccc", "Canonical_Combining_Class"), + ("ce", "Composition_Exclusion"), + ("cf", "Case_Folding"), + ("changeswhencasefolded", "Changes_When_Casefolded"), + ("changeswhencasemapped", "Changes_When_Casemapped"), + ("changeswhenlowercased", "Changes_When_Lowercased"), + ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"), + ("changeswhentitlecased", "Changes_When_Titlecased"), + ("changeswhenuppercased", "Changes_When_Uppercased"), + ("ci", "Case_Ignorable"), + ("cjkaccountingnumeric", "kAccountingNumeric"), + ("cjkcompatibilityvariant", "kCompatibilityVariant"), + ("cjkiicore", "kIICore"), + ("cjkirggsource", "kIRG_GSource"), + ("cjkirghsource", "kIRG_HSource"), + ("cjkirgjsource", "kIRG_JSource"), + ("cjkirgkpsource", "kIRG_KPSource"), + ("cjkirgksource", "kIRG_KSource"), + ("cjkirgmsource", "kIRG_MSource"), + ("cjkirgssource", "kIRG_SSource"), + ("cjkirgtsource", "kIRG_TSource"), + ("cjkirguksource", "kIRG_UKSource"), + ("cjkirgusource", "kIRG_USource"), + ("cjkirgvsource", "kIRG_VSource"), + ("cjkothernumeric", "kOtherNumeric"), + ("cjkprimarynumeric", "kPrimaryNumeric"), + ("cjkrsunicode", "kRSUnicode"), + ("compex", "Full_Composition_Exclusion"), + ("compositionexclusion", "Composition_Exclusion"), + ("cwcf", "Changes_When_Casefolded"), + ("cwcm", "Changes_When_Casemapped"), + ("cwkcf", "Changes_When_NFKC_Casefolded"), + ("cwl", "Changes_When_Lowercased"), + ("cwt", "Changes_When_Titlecased"), + ("cwu", "Changes_When_Uppercased"), + ("dash", "Dash"), + ("decompositionmapping", "Decomposition_Mapping"), + ("decompositiontype", "Decomposition_Type"), + ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"), + ("dep", "Deprecated"), + ("deprecated", "Deprecated"), + ("di", "Default_Ignorable_Code_Point"), + ("dia", "Diacritic"), + ("diacritic", "Diacritic"), + ("dm", "Decomposition_Mapping"), + ("dt", "Decomposition_Type"), + ("ea", "East_Asian_Width"), + ("eastasianwidth", "East_Asian_Width"), + ("ebase", "Emoji_Modifier_Base"), + ("ecomp", "Emoji_Component"), + ("emod", "Emoji_Modifier"), + ("emoji", "Emoji"), + ("emojicomponent", "Emoji_Component"), + ("emojimodifier", "Emoji_Modifier"), + ("emojimodifierbase", "Emoji_Modifier_Base"), + ("emojipresentation", "Emoji_Presentation"), + ("epres", "Emoji_Presentation"), + ("equideo", "Equivalent_Unified_Ideograph"), + ("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"), + ("expandsonnfc", "Expands_On_NFC"), + ("expandsonnfd", "Expands_On_NFD"), + ("expandsonnfkc", "Expands_On_NFKC"), + ("expandsonnfkd", "Expands_On_NFKD"), + ("ext", "Extender"), + ("extendedpictographic", "Extended_Pictographic"), + ("extender", "Extender"), + ("extpict", "Extended_Pictographic"), + ("fcnfkc", "FC_NFKC_Closure"), + ("fcnfkcclosure", "FC_NFKC_Closure"), + ("fullcompositionexclusion", "Full_Composition_Exclusion"), + ("gc", "General_Category"), + ("gcb", "Grapheme_Cluster_Break"), + ("generalcategory", "General_Category"), + ("graphemebase", "Grapheme_Base"), + ("graphemeclusterbreak", "Grapheme_Cluster_Break"), + ("graphemeextend", "Grapheme_Extend"), + ("graphemelink", "Grapheme_Link"), + ("grbase", "Grapheme_Base"), + ("grext", "Grapheme_Extend"), + ("grlink", "Grapheme_Link"), + ("hangulsyllabletype", "Hangul_Syllable_Type"), + ("hex", "Hex_Digit"), + ("hexdigit", "Hex_Digit"), + ("hst", "Hangul_Syllable_Type"), + ("hyphen", "Hyphen"), + ("idc", "ID_Continue"), + ("idcontinue", "ID_Continue"), + ("ideo", "Ideographic"), + ("ideographic", "Ideographic"), + ("ids", "ID_Start"), + ("idsb", "IDS_Binary_Operator"), + ("idsbinaryoperator", "IDS_Binary_Operator"), + ("idst", "IDS_Trinary_Operator"), + ("idstart", "ID_Start"), + ("idstrinaryoperator", "IDS_Trinary_Operator"), + ("indicpositionalcategory", "Indic_Positional_Category"), + ("indicsyllabiccategory", "Indic_Syllabic_Category"), + ("inpc", "Indic_Positional_Category"), + ("insc", "Indic_Syllabic_Category"), + ("isc", "ISO_Comment"), + ("jamoshortname", "Jamo_Short_Name"), + ("jg", "Joining_Group"), + ("joinc", "Join_Control"), + ("joincontrol", "Join_Control"), + ("joininggroup", "Joining_Group"), + ("joiningtype", "Joining_Type"), + ("jsn", "Jamo_Short_Name"), + ("jt", "Joining_Type"), + ("kaccountingnumeric", "kAccountingNumeric"), + ("kcompatibilityvariant", "kCompatibilityVariant"), + ("kiicore", "kIICore"), + ("kirggsource", "kIRG_GSource"), + ("kirghsource", "kIRG_HSource"), + ("kirgjsource", "kIRG_JSource"), + ("kirgkpsource", "kIRG_KPSource"), + ("kirgksource", "kIRG_KSource"), + ("kirgmsource", "kIRG_MSource"), + ("kirgssource", "kIRG_SSource"), + ("kirgtsource", "kIRG_TSource"), + ("kirguksource", "kIRG_UKSource"), + ("kirgusource", "kIRG_USource"), + ("kirgvsource", "kIRG_VSource"), + ("kothernumeric", "kOtherNumeric"), + ("kprimarynumeric", "kPrimaryNumeric"), + ("krsunicode", "kRSUnicode"), + ("lb", "Line_Break"), + ("lc", "Lowercase_Mapping"), + ("linebreak", "Line_Break"), + ("loe", "Logical_Order_Exception"), + ("logicalorderexception", "Logical_Order_Exception"), + ("lower", "Lowercase"), + ("lowercase", "Lowercase"), + ("lowercasemapping", "Lowercase_Mapping"), + ("math", "Math"), + ("na", "Name"), + ("na1", "Unicode_1_Name"), + ("name", "Name"), + ("namealias", "Name_Alias"), + ("nchar", "Noncharacter_Code_Point"), + ("nfcqc", "NFC_Quick_Check"), + ("nfcquickcheck", "NFC_Quick_Check"), + ("nfdqc", "NFD_Quick_Check"), + ("nfdquickcheck", "NFD_Quick_Check"), + ("nfkccasefold", "NFKC_Casefold"), + ("nfkccf", "NFKC_Casefold"), + ("nfkcqc", "NFKC_Quick_Check"), + ("nfkcquickcheck", "NFKC_Quick_Check"), + ("nfkdqc", "NFKD_Quick_Check"), + ("nfkdquickcheck", "NFKD_Quick_Check"), + ("noncharactercodepoint", "Noncharacter_Code_Point"), + ("nt", "Numeric_Type"), + ("numerictype", "Numeric_Type"), + ("numericvalue", "Numeric_Value"), + ("nv", "Numeric_Value"), + ("oalpha", "Other_Alphabetic"), + ("ocomment", "ISO_Comment"), + ("odi", "Other_Default_Ignorable_Code_Point"), + ("ogrext", "Other_Grapheme_Extend"), + ("oidc", "Other_ID_Continue"), + ("oids", "Other_ID_Start"), + ("olower", "Other_Lowercase"), + ("omath", "Other_Math"), + ("otheralphabetic", "Other_Alphabetic"), + ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"), + ("othergraphemeextend", "Other_Grapheme_Extend"), + ("otheridcontinue", "Other_ID_Continue"), + ("otheridstart", "Other_ID_Start"), + ("otherlowercase", "Other_Lowercase"), + ("othermath", "Other_Math"), + ("otheruppercase", "Other_Uppercase"), + ("oupper", "Other_Uppercase"), + ("patsyn", "Pattern_Syntax"), + ("patternsyntax", "Pattern_Syntax"), + ("patternwhitespace", "Pattern_White_Space"), + ("patws", "Pattern_White_Space"), + ("pcm", "Prepended_Concatenation_Mark"), + ("prependedconcatenationmark", "Prepended_Concatenation_Mark"), + ("qmark", "Quotation_Mark"), + ("quotationmark", "Quotation_Mark"), + ("radical", "Radical"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("sb", "Sentence_Break"), + ("sc", "Script"), + ("scf", "Simple_Case_Folding"), + ("script", "Script"), + ("scriptextensions", "Script_Extensions"), + ("scx", "Script_Extensions"), + ("sd", "Soft_Dotted"), + ("sentencebreak", "Sentence_Break"), + ("sentenceterminal", "Sentence_Terminal"), + ("sfc", "Simple_Case_Folding"), + ("simplecasefolding", "Simple_Case_Folding"), + ("simplelowercasemapping", "Simple_Lowercase_Mapping"), + ("simpletitlecasemapping", "Simple_Titlecase_Mapping"), + ("simpleuppercasemapping", "Simple_Uppercase_Mapping"), + ("slc", "Simple_Lowercase_Mapping"), + ("softdotted", "Soft_Dotted"), + ("space", "White_Space"), + ("stc", "Simple_Titlecase_Mapping"), + ("sterm", "Sentence_Terminal"), + ("suc", "Simple_Uppercase_Mapping"), + ("tc", "Titlecase_Mapping"), + ("term", "Terminal_Punctuation"), + ("terminalpunctuation", "Terminal_Punctuation"), + ("titlecasemapping", "Titlecase_Mapping"), + ("uc", "Uppercase_Mapping"), + ("uideo", "Unified_Ideograph"), + ("unicode1name", "Unicode_1_Name"), + ("unicoderadicalstroke", "kRSUnicode"), + ("unifiedideograph", "Unified_Ideograph"), + ("upper", "Uppercase"), + ("uppercase", "Uppercase"), + ("uppercasemapping", "Uppercase_Mapping"), + ("urs", "kRSUnicode"), + ("variationselector", "Variation_Selector"), + ("verticalorientation", "Vertical_Orientation"), + ("vo", "Vertical_Orientation"), + ("vs", "Variation_Selector"), + ("wb", "Word_Break"), + ("whitespace", "White_Space"), + ("wordbreak", "Word_Break"), + ("wspace", "White_Space"), + ("xidc", "XID_Continue"), + ("xidcontinue", "XID_Continue"), + ("xids", "XID_Start"), + ("xidstart", "XID_Start"), + ("xonfc", "Expands_On_NFC"), + ("xonfd", "Expands_On_NFD"), + ("xonfkc", "Expands_On_NFKC"), + ("xonfkd", "Expands_On_NFKD"), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/property_values.rs b/vendor/regex-syntax/src/unicode_tables/property_values.rs new file mode 100644 index 0000000000..cb2d32fb70 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/property_values.rs @@ -0,0 +1,924 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate property-values ucd-15.0.0 --include gc,script,scx,age,gcb,wb,sb +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const PROPERTY_VALUES: &'static [( + &'static str, + &'static [(&'static str, &'static str)], +)] = &[ + ( + "Age", + &[ + ("1.1", "V1_1"), + ("10.0", "V10_0"), + ("11.0", "V11_0"), + ("12.0", "V12_0"), + ("12.1", "V12_1"), + ("13.0", "V13_0"), + ("14.0", "V14_0"), + ("15.0", "V15_0"), + ("2.0", "V2_0"), + ("2.1", "V2_1"), + ("3.0", "V3_0"), + ("3.1", "V3_1"), + ("3.2", "V3_2"), + ("4.0", "V4_0"), + ("4.1", "V4_1"), + ("5.0", "V5_0"), + ("5.1", "V5_1"), + ("5.2", "V5_2"), + ("6.0", "V6_0"), + ("6.1", "V6_1"), + ("6.2", "V6_2"), + ("6.3", "V6_3"), + ("7.0", "V7_0"), + ("8.0", "V8_0"), + ("9.0", "V9_0"), + ("na", "Unassigned"), + ("unassigned", "Unassigned"), + ("v100", "V10_0"), + ("v11", "V1_1"), + ("v110", "V11_0"), + ("v120", "V12_0"), + ("v121", "V12_1"), + ("v130", "V13_0"), + ("v140", "V14_0"), + ("v150", "V15_0"), + ("v20", "V2_0"), + ("v21", "V2_1"), + ("v30", "V3_0"), + ("v31", "V3_1"), + ("v32", "V3_2"), + ("v40", "V4_0"), + ("v41", "V4_1"), + ("v50", "V5_0"), + ("v51", "V5_1"), + ("v52", "V5_2"), + ("v60", "V6_0"), + ("v61", "V6_1"), + ("v62", "V6_2"), + ("v63", "V6_3"), + ("v70", "V7_0"), + ("v80", "V8_0"), + ("v90", "V9_0"), + ], + ), + ( + "General_Category", + &[ + ("c", "Other"), + ("casedletter", "Cased_Letter"), + ("cc", "Control"), + ("cf", "Format"), + ("closepunctuation", "Close_Punctuation"), + ("cn", "Unassigned"), + ("cntrl", "Control"), + ("co", "Private_Use"), + ("combiningmark", "Mark"), + ("connectorpunctuation", "Connector_Punctuation"), + ("control", "Control"), + ("cs", "Surrogate"), + ("currencysymbol", "Currency_Symbol"), + ("dashpunctuation", "Dash_Punctuation"), + ("decimalnumber", "Decimal_Number"), + ("digit", "Decimal_Number"), + ("enclosingmark", "Enclosing_Mark"), + ("finalpunctuation", "Final_Punctuation"), + ("format", "Format"), + ("initialpunctuation", "Initial_Punctuation"), + ("l", "Letter"), + ("lc", "Cased_Letter"), + ("letter", "Letter"), + ("letternumber", "Letter_Number"), + ("lineseparator", "Line_Separator"), + ("ll", "Lowercase_Letter"), + ("lm", "Modifier_Letter"), + ("lo", "Other_Letter"), + ("lowercaseletter", "Lowercase_Letter"), + ("lt", "Titlecase_Letter"), + ("lu", "Uppercase_Letter"), + ("m", "Mark"), + ("mark", "Mark"), + ("mathsymbol", "Math_Symbol"), + ("mc", "Spacing_Mark"), + ("me", "Enclosing_Mark"), + ("mn", "Nonspacing_Mark"), + ("modifierletter", "Modifier_Letter"), + ("modifiersymbol", "Modifier_Symbol"), + ("n", "Number"), + ("nd", "Decimal_Number"), + ("nl", "Letter_Number"), + ("no", "Other_Number"), + ("nonspacingmark", "Nonspacing_Mark"), + ("number", "Number"), + ("openpunctuation", "Open_Punctuation"), + ("other", "Other"), + ("otherletter", "Other_Letter"), + ("othernumber", "Other_Number"), + ("otherpunctuation", "Other_Punctuation"), + ("othersymbol", "Other_Symbol"), + ("p", "Punctuation"), + ("paragraphseparator", "Paragraph_Separator"), + ("pc", "Connector_Punctuation"), + ("pd", "Dash_Punctuation"), + ("pe", "Close_Punctuation"), + ("pf", "Final_Punctuation"), + ("pi", "Initial_Punctuation"), + ("po", "Other_Punctuation"), + ("privateuse", "Private_Use"), + ("ps", "Open_Punctuation"), + ("punct", "Punctuation"), + ("punctuation", "Punctuation"), + ("s", "Symbol"), + ("sc", "Currency_Symbol"), + ("separator", "Separator"), + ("sk", "Modifier_Symbol"), + ("sm", "Math_Symbol"), + ("so", "Other_Symbol"), + ("spaceseparator", "Space_Separator"), + ("spacingmark", "Spacing_Mark"), + ("surrogate", "Surrogate"), + ("symbol", "Symbol"), + ("titlecaseletter", "Titlecase_Letter"), + ("unassigned", "Unassigned"), + ("uppercaseletter", "Uppercase_Letter"), + ("z", "Separator"), + ("zl", "Line_Separator"), + ("zp", "Paragraph_Separator"), + ("zs", "Space_Separator"), + ], + ), + ( + "Grapheme_Cluster_Break", + &[ + ("cn", "Control"), + ("control", "Control"), + ("cr", "CR"), + ("eb", "E_Base"), + ("ebase", "E_Base"), + ("ebasegaz", "E_Base_GAZ"), + ("ebg", "E_Base_GAZ"), + ("em", "E_Modifier"), + ("emodifier", "E_Modifier"), + ("ex", "Extend"), + ("extend", "Extend"), + ("gaz", "Glue_After_Zwj"), + ("glueafterzwj", "Glue_After_Zwj"), + ("l", "L"), + ("lf", "LF"), + ("lv", "LV"), + ("lvt", "LVT"), + ("other", "Other"), + ("pp", "Prepend"), + ("prepend", "Prepend"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("sm", "SpacingMark"), + ("spacingmark", "SpacingMark"), + ("t", "T"), + ("v", "V"), + ("xx", "Other"), + ("zwj", "ZWJ"), + ], + ), + ( + "Script", + &[ + ("adlam", "Adlam"), + ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), + ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), + ("arab", "Arabic"), + ("arabic", "Arabic"), + ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), + ("armn", "Armenian"), + ("avestan", "Avestan"), + ("avst", "Avestan"), + ("bali", "Balinese"), + ("balinese", "Balinese"), + ("bamu", "Bamum"), + ("bamum", "Bamum"), + ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), + ("batak", "Batak"), + ("batk", "Batak"), + ("beng", "Bengali"), + ("bengali", "Bengali"), + ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), + ("bopo", "Bopomofo"), + ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), + ("brahmi", "Brahmi"), + ("brai", "Braille"), + ("braille", "Braille"), + ("bugi", "Buginese"), + ("buginese", "Buginese"), + ("buhd", "Buhid"), + ("buhid", "Buhid"), + ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), + ("cari", "Carian"), + ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), + ("chakma", "Chakma"), + ("cham", "Cham"), + ("cher", "Cherokee"), + ("cherokee", "Cherokee"), + ("chorasmian", "Chorasmian"), + ("chrs", "Chorasmian"), + ("common", "Common"), + ("copt", "Coptic"), + ("coptic", "Coptic"), + ("cpmn", "Cypro_Minoan"), + ("cprt", "Cypriot"), + ("cuneiform", "Cuneiform"), + ("cypriot", "Cypriot"), + ("cyprominoan", "Cypro_Minoan"), + ("cyrillic", "Cyrillic"), + ("cyrl", "Cyrillic"), + ("deseret", "Deseret"), + ("deva", "Devanagari"), + ("devanagari", "Devanagari"), + ("diak", "Dives_Akuru"), + ("divesakuru", "Dives_Akuru"), + ("dogr", "Dogra"), + ("dogra", "Dogra"), + ("dsrt", "Deseret"), + ("dupl", "Duployan"), + ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), + ("elba", "Elbasan"), + ("elbasan", "Elbasan"), + ("elym", "Elymaic"), + ("elymaic", "Elymaic"), + ("ethi", "Ethiopic"), + ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), + ("georgian", "Georgian"), + ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), + ("gong", "Gunjala_Gondi"), + ("gonm", "Masaram_Gondi"), + ("goth", "Gothic"), + ("gothic", "Gothic"), + ("gran", "Grantha"), + ("grantha", "Grantha"), + ("greek", "Greek"), + ("grek", "Greek"), + ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), + ("gunjalagondi", "Gunjala_Gondi"), + ("gurmukhi", "Gurmukhi"), + ("guru", "Gurmukhi"), + ("han", "Han"), + ("hang", "Hangul"), + ("hangul", "Hangul"), + ("hani", "Han"), + ("hanifirohingya", "Hanifi_Rohingya"), + ("hano", "Hanunoo"), + ("hanunoo", "Hanunoo"), + ("hatr", "Hatran"), + ("hatran", "Hatran"), + ("hebr", "Hebrew"), + ("hebrew", "Hebrew"), + ("hira", "Hiragana"), + ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), + ("hmng", "Pahawh_Hmong"), + ("hmnp", "Nyiakeng_Puachue_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), + ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), + ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), + ("ital", "Old_Italic"), + ("java", "Javanese"), + ("javanese", "Javanese"), + ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), + ("kana", "Katakana"), + ("kannada", "Kannada"), + ("katakana", "Katakana"), + ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kawi", "Kawi"), + ("kayahli", "Kayah_Li"), + ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), + ("khitansmallscript", "Khitan_Small_Script"), + ("khmer", "Khmer"), + ("khmr", "Khmer"), + ("khoj", "Khojki"), + ("khojki", "Khojki"), + ("khudawadi", "Khudawadi"), + ("kits", "Khitan_Small_Script"), + ("knda", "Kannada"), + ("kthi", "Kaithi"), + ("lana", "Tai_Tham"), + ("lao", "Lao"), + ("laoo", "Lao"), + ("latin", "Latin"), + ("latn", "Latin"), + ("lepc", "Lepcha"), + ("lepcha", "Lepcha"), + ("limb", "Limbu"), + ("limbu", "Limbu"), + ("lina", "Linear_A"), + ("linb", "Linear_B"), + ("lineara", "Linear_A"), + ("linearb", "Linear_B"), + ("lisu", "Lisu"), + ("lyci", "Lycian"), + ("lycian", "Lycian"), + ("lydi", "Lydian"), + ("lydian", "Lydian"), + ("mahajani", "Mahajani"), + ("mahj", "Mahajani"), + ("maka", "Makasar"), + ("makasar", "Makasar"), + ("malayalam", "Malayalam"), + ("mand", "Mandaic"), + ("mandaic", "Mandaic"), + ("mani", "Manichaean"), + ("manichaean", "Manichaean"), + ("marc", "Marchen"), + ("marchen", "Marchen"), + ("masaramgondi", "Masaram_Gondi"), + ("medefaidrin", "Medefaidrin"), + ("medf", "Medefaidrin"), + ("meeteimayek", "Meetei_Mayek"), + ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), + ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), + ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), + ("miao", "Miao"), + ("mlym", "Malayalam"), + ("modi", "Modi"), + ("mong", "Mongolian"), + ("mongolian", "Mongolian"), + ("mro", "Mro"), + ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), + ("mult", "Multani"), + ("multani", "Multani"), + ("myanmar", "Myanmar"), + ("mymr", "Myanmar"), + ("nabataean", "Nabataean"), + ("nagm", "Nag_Mundari"), + ("nagmundari", "Nag_Mundari"), + ("nand", "Nandinagari"), + ("nandinagari", "Nandinagari"), + ("narb", "Old_North_Arabian"), + ("nbat", "Nabataean"), + ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), + ("nko", "Nko"), + ("nkoo", "Nko"), + ("nshu", "Nushu"), + ("nushu", "Nushu"), + ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), + ("ogam", "Ogham"), + ("ogham", "Ogham"), + ("olchiki", "Ol_Chiki"), + ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), + ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), + ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), + ("oldsogdian", "Old_Sogdian"), + ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), + ("olduyghur", "Old_Uyghur"), + ("oriya", "Oriya"), + ("orkh", "Old_Turkic"), + ("orya", "Oriya"), + ("osage", "Osage"), + ("osge", "Osage"), + ("osma", "Osmanya"), + ("osmanya", "Osmanya"), + ("ougr", "Old_Uyghur"), + ("pahawhhmong", "Pahawh_Hmong"), + ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), + ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), + ("perm", "Old_Permic"), + ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), + ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), + ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), + ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), + ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), + ("qaai", "Inherited"), + ("rejang", "Rejang"), + ("rjng", "Rejang"), + ("rohg", "Hanifi_Rohingya"), + ("runic", "Runic"), + ("runr", "Runic"), + ("samaritan", "Samaritan"), + ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), + ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), + ("sgnw", "SignWriting"), + ("sharada", "Sharada"), + ("shavian", "Shavian"), + ("shaw", "Shavian"), + ("shrd", "Sharada"), + ("sidd", "Siddham"), + ("siddham", "Siddham"), + ("signwriting", "SignWriting"), + ("sind", "Khudawadi"), + ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), + ("sogd", "Sogdian"), + ("sogdian", "Sogdian"), + ("sogo", "Old_Sogdian"), + ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), + ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), + ("sund", "Sundanese"), + ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), + ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), + ("syriac", "Syriac"), + ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), + ("tagbanwa", "Tagbanwa"), + ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), + ("taiviet", "Tai_Viet"), + ("takr", "Takri"), + ("takri", "Takri"), + ("tale", "Tai_Le"), + ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), + ("taml", "Tamil"), + ("tang", "Tangut"), + ("tangsa", "Tangsa"), + ("tangut", "Tangut"), + ("tavt", "Tai_Viet"), + ("telu", "Telugu"), + ("telugu", "Telugu"), + ("tfng", "Tifinagh"), + ("tglg", "Tagalog"), + ("thaa", "Thaana"), + ("thaana", "Thaana"), + ("thai", "Thai"), + ("tibetan", "Tibetan"), + ("tibt", "Tibetan"), + ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), + ("tirhuta", "Tirhuta"), + ("tnsa", "Tangsa"), + ("toto", "Toto"), + ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), + ("unknown", "Unknown"), + ("vai", "Vai"), + ("vaii", "Vai"), + ("vith", "Vithkuqi"), + ("vithkuqi", "Vithkuqi"), + ("wancho", "Wancho"), + ("wara", "Warang_Citi"), + ("warangciti", "Warang_Citi"), + ("wcho", "Wancho"), + ("xpeo", "Old_Persian"), + ("xsux", "Cuneiform"), + ("yezi", "Yezidi"), + ("yezidi", "Yezidi"), + ("yi", "Yi"), + ("yiii", "Yi"), + ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), + ("zinh", "Inherited"), + ("zyyy", "Common"), + ("zzzz", "Unknown"), + ], + ), + ( + "Script_Extensions", + &[ + ("adlam", "Adlam"), + ("adlm", "Adlam"), + ("aghb", "Caucasian_Albanian"), + ("ahom", "Ahom"), + ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), + ("arab", "Arabic"), + ("arabic", "Arabic"), + ("armenian", "Armenian"), + ("armi", "Imperial_Aramaic"), + ("armn", "Armenian"), + ("avestan", "Avestan"), + ("avst", "Avestan"), + ("bali", "Balinese"), + ("balinese", "Balinese"), + ("bamu", "Bamum"), + ("bamum", "Bamum"), + ("bass", "Bassa_Vah"), + ("bassavah", "Bassa_Vah"), + ("batak", "Batak"), + ("batk", "Batak"), + ("beng", "Bengali"), + ("bengali", "Bengali"), + ("bhaiksuki", "Bhaiksuki"), + ("bhks", "Bhaiksuki"), + ("bopo", "Bopomofo"), + ("bopomofo", "Bopomofo"), + ("brah", "Brahmi"), + ("brahmi", "Brahmi"), + ("brai", "Braille"), + ("braille", "Braille"), + ("bugi", "Buginese"), + ("buginese", "Buginese"), + ("buhd", "Buhid"), + ("buhid", "Buhid"), + ("cakm", "Chakma"), + ("canadianaboriginal", "Canadian_Aboriginal"), + ("cans", "Canadian_Aboriginal"), + ("cari", "Carian"), + ("carian", "Carian"), + ("caucasianalbanian", "Caucasian_Albanian"), + ("chakma", "Chakma"), + ("cham", "Cham"), + ("cher", "Cherokee"), + ("cherokee", "Cherokee"), + ("chorasmian", "Chorasmian"), + ("chrs", "Chorasmian"), + ("common", "Common"), + ("copt", "Coptic"), + ("coptic", "Coptic"), + ("cpmn", "Cypro_Minoan"), + ("cprt", "Cypriot"), + ("cuneiform", "Cuneiform"), + ("cypriot", "Cypriot"), + ("cyprominoan", "Cypro_Minoan"), + ("cyrillic", "Cyrillic"), + ("cyrl", "Cyrillic"), + ("deseret", "Deseret"), + ("deva", "Devanagari"), + ("devanagari", "Devanagari"), + ("diak", "Dives_Akuru"), + ("divesakuru", "Dives_Akuru"), + ("dogr", "Dogra"), + ("dogra", "Dogra"), + ("dsrt", "Deseret"), + ("dupl", "Duployan"), + ("duployan", "Duployan"), + ("egyp", "Egyptian_Hieroglyphs"), + ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), + ("elba", "Elbasan"), + ("elbasan", "Elbasan"), + ("elym", "Elymaic"), + ("elymaic", "Elymaic"), + ("ethi", "Ethiopic"), + ("ethiopic", "Ethiopic"), + ("geor", "Georgian"), + ("georgian", "Georgian"), + ("glag", "Glagolitic"), + ("glagolitic", "Glagolitic"), + ("gong", "Gunjala_Gondi"), + ("gonm", "Masaram_Gondi"), + ("goth", "Gothic"), + ("gothic", "Gothic"), + ("gran", "Grantha"), + ("grantha", "Grantha"), + ("greek", "Greek"), + ("grek", "Greek"), + ("gujarati", "Gujarati"), + ("gujr", "Gujarati"), + ("gunjalagondi", "Gunjala_Gondi"), + ("gurmukhi", "Gurmukhi"), + ("guru", "Gurmukhi"), + ("han", "Han"), + ("hang", "Hangul"), + ("hangul", "Hangul"), + ("hani", "Han"), + ("hanifirohingya", "Hanifi_Rohingya"), + ("hano", "Hanunoo"), + ("hanunoo", "Hanunoo"), + ("hatr", "Hatran"), + ("hatran", "Hatran"), + ("hebr", "Hebrew"), + ("hebrew", "Hebrew"), + ("hira", "Hiragana"), + ("hiragana", "Hiragana"), + ("hluw", "Anatolian_Hieroglyphs"), + ("hmng", "Pahawh_Hmong"), + ("hmnp", "Nyiakeng_Puachue_Hmong"), + ("hrkt", "Katakana_Or_Hiragana"), + ("hung", "Old_Hungarian"), + ("imperialaramaic", "Imperial_Aramaic"), + ("inherited", "Inherited"), + ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), + ("inscriptionalparthian", "Inscriptional_Parthian"), + ("ital", "Old_Italic"), + ("java", "Javanese"), + ("javanese", "Javanese"), + ("kaithi", "Kaithi"), + ("kali", "Kayah_Li"), + ("kana", "Katakana"), + ("kannada", "Kannada"), + ("katakana", "Katakana"), + ("katakanaorhiragana", "Katakana_Or_Hiragana"), + ("kawi", "Kawi"), + ("kayahli", "Kayah_Li"), + ("khar", "Kharoshthi"), + ("kharoshthi", "Kharoshthi"), + ("khitansmallscript", "Khitan_Small_Script"), + ("khmer", "Khmer"), + ("khmr", "Khmer"), + ("khoj", "Khojki"), + ("khojki", "Khojki"), + ("khudawadi", "Khudawadi"), + ("kits", "Khitan_Small_Script"), + ("knda", "Kannada"), + ("kthi", "Kaithi"), + ("lana", "Tai_Tham"), + ("lao", "Lao"), + ("laoo", "Lao"), + ("latin", "Latin"), + ("latn", "Latin"), + ("lepc", "Lepcha"), + ("lepcha", "Lepcha"), + ("limb", "Limbu"), + ("limbu", "Limbu"), + ("lina", "Linear_A"), + ("linb", "Linear_B"), + ("lineara", "Linear_A"), + ("linearb", "Linear_B"), + ("lisu", "Lisu"), + ("lyci", "Lycian"), + ("lycian", "Lycian"), + ("lydi", "Lydian"), + ("lydian", "Lydian"), + ("mahajani", "Mahajani"), + ("mahj", "Mahajani"), + ("maka", "Makasar"), + ("makasar", "Makasar"), + ("malayalam", "Malayalam"), + ("mand", "Mandaic"), + ("mandaic", "Mandaic"), + ("mani", "Manichaean"), + ("manichaean", "Manichaean"), + ("marc", "Marchen"), + ("marchen", "Marchen"), + ("masaramgondi", "Masaram_Gondi"), + ("medefaidrin", "Medefaidrin"), + ("medf", "Medefaidrin"), + ("meeteimayek", "Meetei_Mayek"), + ("mend", "Mende_Kikakui"), + ("mendekikakui", "Mende_Kikakui"), + ("merc", "Meroitic_Cursive"), + ("mero", "Meroitic_Hieroglyphs"), + ("meroiticcursive", "Meroitic_Cursive"), + ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), + ("miao", "Miao"), + ("mlym", "Malayalam"), + ("modi", "Modi"), + ("mong", "Mongolian"), + ("mongolian", "Mongolian"), + ("mro", "Mro"), + ("mroo", "Mro"), + ("mtei", "Meetei_Mayek"), + ("mult", "Multani"), + ("multani", "Multani"), + ("myanmar", "Myanmar"), + ("mymr", "Myanmar"), + ("nabataean", "Nabataean"), + ("nagm", "Nag_Mundari"), + ("nagmundari", "Nag_Mundari"), + ("nand", "Nandinagari"), + ("nandinagari", "Nandinagari"), + ("narb", "Old_North_Arabian"), + ("nbat", "Nabataean"), + ("newa", "Newa"), + ("newtailue", "New_Tai_Lue"), + ("nko", "Nko"), + ("nkoo", "Nko"), + ("nshu", "Nushu"), + ("nushu", "Nushu"), + ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), + ("ogam", "Ogham"), + ("ogham", "Ogham"), + ("olchiki", "Ol_Chiki"), + ("olck", "Ol_Chiki"), + ("oldhungarian", "Old_Hungarian"), + ("olditalic", "Old_Italic"), + ("oldnortharabian", "Old_North_Arabian"), + ("oldpermic", "Old_Permic"), + ("oldpersian", "Old_Persian"), + ("oldsogdian", "Old_Sogdian"), + ("oldsoutharabian", "Old_South_Arabian"), + ("oldturkic", "Old_Turkic"), + ("olduyghur", "Old_Uyghur"), + ("oriya", "Oriya"), + ("orkh", "Old_Turkic"), + ("orya", "Oriya"), + ("osage", "Osage"), + ("osge", "Osage"), + ("osma", "Osmanya"), + ("osmanya", "Osmanya"), + ("ougr", "Old_Uyghur"), + ("pahawhhmong", "Pahawh_Hmong"), + ("palm", "Palmyrene"), + ("palmyrene", "Palmyrene"), + ("pauc", "Pau_Cin_Hau"), + ("paucinhau", "Pau_Cin_Hau"), + ("perm", "Old_Permic"), + ("phag", "Phags_Pa"), + ("phagspa", "Phags_Pa"), + ("phli", "Inscriptional_Pahlavi"), + ("phlp", "Psalter_Pahlavi"), + ("phnx", "Phoenician"), + ("phoenician", "Phoenician"), + ("plrd", "Miao"), + ("prti", "Inscriptional_Parthian"), + ("psalterpahlavi", "Psalter_Pahlavi"), + ("qaac", "Coptic"), + ("qaai", "Inherited"), + ("rejang", "Rejang"), + ("rjng", "Rejang"), + ("rohg", "Hanifi_Rohingya"), + ("runic", "Runic"), + ("runr", "Runic"), + ("samaritan", "Samaritan"), + ("samr", "Samaritan"), + ("sarb", "Old_South_Arabian"), + ("saur", "Saurashtra"), + ("saurashtra", "Saurashtra"), + ("sgnw", "SignWriting"), + ("sharada", "Sharada"), + ("shavian", "Shavian"), + ("shaw", "Shavian"), + ("shrd", "Sharada"), + ("sidd", "Siddham"), + ("siddham", "Siddham"), + ("signwriting", "SignWriting"), + ("sind", "Khudawadi"), + ("sinh", "Sinhala"), + ("sinhala", "Sinhala"), + ("sogd", "Sogdian"), + ("sogdian", "Sogdian"), + ("sogo", "Old_Sogdian"), + ("sora", "Sora_Sompeng"), + ("sorasompeng", "Sora_Sompeng"), + ("soyo", "Soyombo"), + ("soyombo", "Soyombo"), + ("sund", "Sundanese"), + ("sundanese", "Sundanese"), + ("sylo", "Syloti_Nagri"), + ("sylotinagri", "Syloti_Nagri"), + ("syrc", "Syriac"), + ("syriac", "Syriac"), + ("tagalog", "Tagalog"), + ("tagb", "Tagbanwa"), + ("tagbanwa", "Tagbanwa"), + ("taile", "Tai_Le"), + ("taitham", "Tai_Tham"), + ("taiviet", "Tai_Viet"), + ("takr", "Takri"), + ("takri", "Takri"), + ("tale", "Tai_Le"), + ("talu", "New_Tai_Lue"), + ("tamil", "Tamil"), + ("taml", "Tamil"), + ("tang", "Tangut"), + ("tangsa", "Tangsa"), + ("tangut", "Tangut"), + ("tavt", "Tai_Viet"), + ("telu", "Telugu"), + ("telugu", "Telugu"), + ("tfng", "Tifinagh"), + ("tglg", "Tagalog"), + ("thaa", "Thaana"), + ("thaana", "Thaana"), + ("thai", "Thai"), + ("tibetan", "Tibetan"), + ("tibt", "Tibetan"), + ("tifinagh", "Tifinagh"), + ("tirh", "Tirhuta"), + ("tirhuta", "Tirhuta"), + ("tnsa", "Tangsa"), + ("toto", "Toto"), + ("ugar", "Ugaritic"), + ("ugaritic", "Ugaritic"), + ("unknown", "Unknown"), + ("vai", "Vai"), + ("vaii", "Vai"), + ("vith", "Vithkuqi"), + ("vithkuqi", "Vithkuqi"), + ("wancho", "Wancho"), + ("wara", "Warang_Citi"), + ("warangciti", "Warang_Citi"), + ("wcho", "Wancho"), + ("xpeo", "Old_Persian"), + ("xsux", "Cuneiform"), + ("yezi", "Yezidi"), + ("yezidi", "Yezidi"), + ("yi", "Yi"), + ("yiii", "Yi"), + ("zanabazarsquare", "Zanabazar_Square"), + ("zanb", "Zanabazar_Square"), + ("zinh", "Inherited"), + ("zyyy", "Common"), + ("zzzz", "Unknown"), + ], + ), + ( + "Sentence_Break", + &[ + ("at", "ATerm"), + ("aterm", "ATerm"), + ("cl", "Close"), + ("close", "Close"), + ("cr", "CR"), + ("ex", "Extend"), + ("extend", "Extend"), + ("fo", "Format"), + ("format", "Format"), + ("le", "OLetter"), + ("lf", "LF"), + ("lo", "Lower"), + ("lower", "Lower"), + ("nu", "Numeric"), + ("numeric", "Numeric"), + ("oletter", "OLetter"), + ("other", "Other"), + ("sc", "SContinue"), + ("scontinue", "SContinue"), + ("se", "Sep"), + ("sep", "Sep"), + ("sp", "Sp"), + ("st", "STerm"), + ("sterm", "STerm"), + ("up", "Upper"), + ("upper", "Upper"), + ("xx", "Other"), + ], + ), + ( + "Word_Break", + &[ + ("aletter", "ALetter"), + ("cr", "CR"), + ("doublequote", "Double_Quote"), + ("dq", "Double_Quote"), + ("eb", "E_Base"), + ("ebase", "E_Base"), + ("ebasegaz", "E_Base_GAZ"), + ("ebg", "E_Base_GAZ"), + ("em", "E_Modifier"), + ("emodifier", "E_Modifier"), + ("ex", "ExtendNumLet"), + ("extend", "Extend"), + ("extendnumlet", "ExtendNumLet"), + ("fo", "Format"), + ("format", "Format"), + ("gaz", "Glue_After_Zwj"), + ("glueafterzwj", "Glue_After_Zwj"), + ("hebrewletter", "Hebrew_Letter"), + ("hl", "Hebrew_Letter"), + ("ka", "Katakana"), + ("katakana", "Katakana"), + ("le", "ALetter"), + ("lf", "LF"), + ("mb", "MidNumLet"), + ("midletter", "MidLetter"), + ("midnum", "MidNum"), + ("midnumlet", "MidNumLet"), + ("ml", "MidLetter"), + ("mn", "MidNum"), + ("newline", "Newline"), + ("nl", "Newline"), + ("nu", "Numeric"), + ("numeric", "Numeric"), + ("other", "Other"), + ("regionalindicator", "Regional_Indicator"), + ("ri", "Regional_Indicator"), + ("singlequote", "Single_Quote"), + ("sq", "Single_Quote"), + ("wsegspace", "WSegSpace"), + ("xx", "Other"), + ("zwj", "ZWJ"), + ], + ), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/script.rs b/vendor/regex-syntax/src/unicode_tables/script.rs new file mode 100644 index 0000000000..cc5c400ddb --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/script.rs @@ -0,0 +1,1263 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cypro_Minoan", CYPRO_MINOAN), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kawi", KAWI), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nag_Mundari", NAG_MUNDARI), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Old_Uyghur", OLD_UYGHUR), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangsa", TANGSA), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Toto", TOTO), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Vithkuqi", VITHKUQI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = + &[('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')]; + +pub const AHOM: &'static [(char, char)] = + &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), + ('؆', '؋'), + ('؍', '\u{61a}'), + ('\u{61c}', '؞'), + ('ؠ', 'ؿ'), + ('ف', 'ي'), + ('\u{656}', 'ٯ'), + ('ٱ', '\u{6dc}'), + ('۞', 'ۿ'), + ('ݐ', 'ݿ'), + ('ࡰ', 'ࢎ'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{8ff}'), + ('ﭐ', '﯂'), + ('ﯓ', 'ﴽ'), + ('﵀', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('﷏', '﷏'), + ('ﷰ', '﷿'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('𐹠', '𐹾'), + ('\u{10efd}', '\u{10eff}'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = + &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; + +pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; + +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')]; + +pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; + +pub const BASSA_VAH: &'static [(char, char)] = + &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')]; + +pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('ঀ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', '\u{9fe}'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = + &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; + +pub const BOPOMOFO: &'static [(char, char)] = + &[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆿ')]; + +pub const BRAHMI: &'static [(char, char)] = + &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')]; + +pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; + +pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')]; + +pub const BUHID: &'static [(char, char)] = &[('ᝀ', '\u{1753}')]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = + &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')]; + +pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = + &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; + +pub const CHAKMA: &'static [(char, char)] = + &[('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')]; + +pub const CHAM: &'static [(char, char)] = + &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; + +pub const CHEROKEE: &'static [(char, char)] = + &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')]; + +pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\0', '@'), + ('[', '`'), + ('{', '©'), + ('«', '¹'), + ('»', '¿'), + ('×', '×'), + ('÷', '÷'), + ('ʹ', '˟'), + ('˥', '˩'), + ('ˬ', '˿'), + ('ʹ', 'ʹ'), + (';', ';'), + ('΅', '΅'), + ('·', '·'), + ('\u{605}', '\u{605}'), + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('ـ', 'ـ'), + ('\u{6dd}', '\u{6dd}'), + ('\u{8e2}', '\u{8e2}'), + ('।', '॥'), + ('฿', '฿'), + ('࿕', '࿘'), + ('჻', '჻'), + ('᛫', '᛭'), + ('᜵', '᜶'), + ('᠂', '᠃'), + ('᠅', '᠅'), + ('᳓', '᳓'), + ('᳡', '᳡'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', '᳷'), + ('ᳺ', 'ᳺ'), + ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{2064}'), + ('\u{2066}', '⁰'), + ('⁴', '⁾'), + ('₀', '₎'), + ('₠', '⃀'), + ('℀', '℥'), + ('℧', '℩'), + ('ℬ', 'ℱ'), + ('ℳ', '⅍'), + ('⅏', '⅟'), + ('↉', '↋'), + ('←', '␦'), + ('⑀', '⑊'), + ('①', '⟿'), + ('⤀', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⸀', '⹝'), + ('⿰', '⿻'), + ('\u{3000}', '〄'), + ('〆', '〆'), + ('〈', '〠'), + ('〰', '〷'), + ('〼', '〿'), + ('゛', '゜'), + ('゠', '゠'), + ('・', 'ー'), + ('㆐', '㆟'), + ('㇀', '㇣'), + ('㈠', '㉟'), + ('㉿', '㋏'), + ('㋿', '㋿'), + ('㍘', '㏿'), + ('䷀', '䷿'), + ('꜀', '꜡'), + ('ꞈ', '꞊'), + ('꠰', '꠹'), + ('꤮', '꤮'), + ('ꧏ', 'ꧏ'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('﴾', '﴿'), + ('︐', '︙'), + ('︰', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('\u{feff}', '\u{feff}'), + ('!', '@'), + ('[', '`'), + ('{', '・'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + ('¢', '₩'), + ('│', '○'), + ('\u{fff9}', '�'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), + ('𐆐', '𐆜'), + ('𐇐', '𐇼'), + ('𐋡', '𐋻'), + ('\u{1bca0}', '\u{1bca3}'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅦'), + ('𝅪', '\u{1d17a}'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝌀', '𝍖'), + ('𝍠', '𝍸'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝟿'), + ('𞱱', '𞲴'), + ('𞴁', '𞴽'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄀', '🆭'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉐', '🉑'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = + &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿')]; + +pub const CUNEIFORM: &'static [(char, char)] = + &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; + +pub const CYPRIOT: &'static [(char, char)] = + &[('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿')]; + +pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𒾐', '𒿲')]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', '\u{484}'), + ('\u{487}', 'ԯ'), + ('ᲀ', 'ᲈ'), + ('ᴫ', 'ᴫ'), + ('ᵸ', 'ᵸ'), + ('\u{2de0}', '\u{2dff}'), + ('Ꙁ', '\u{a69f}'), + ('\u{fe2e}', '\u{fe2f}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), +]; + +pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('\u{900}', 'ॐ'), + ('\u{955}', '\u{963}'), + ('०', 'ॿ'), + ('\u{a8e0}', '\u{a8ff}'), + ('𑬀', '𑬉'), +]; + +pub const DIVES_AKURU: &'static [(char, char)] = &[ + ('𑤀', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '𑥆'), + ('𑥐', '𑥙'), +]; + +pub const DOGRA: &'static [(char, char)] = &[('𑠀', '𑠻')]; + +pub const DUPLOYAN: &'static [(char, char)] = + &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲟')]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = + &[('𓀀', '\u{13455}')]; + +pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')]; + +pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '፼'), + ('ᎀ', '᎙'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ჿ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('Ⰰ', 'ⱟ'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133c}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('Ͱ', 'ͳ'), + ('͵', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('΄', '΄'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϡ'), + ('ϰ', 'Ͽ'), + ('ᴦ', 'ᴪ'), + ('ᵝ', 'ᵡ'), + ('ᵦ', 'ᵪ'), + ('ᶿ', 'ᶿ'), + ('ἀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('Ω', 'Ω'), + ('ꭥ', 'ꭥ'), + ('𐅀', '𐆎'), + ('𐆠', '𐆠'), + ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૱'), + ('ૹ', '\u{aff}'), +]; + +pub const GUNJALA_GONDI: &'static [(char, char)] = &[ + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '੶'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('々', '々'), + ('〇', '〇'), + ('〡', '〩'), + ('〸', '〻'), + ('㐀', '䶿'), + ('一', '鿿'), + ('豈', '舘'), + ('並', '龎'), + ('𖿢', '𖿣'), + ('𖿰', '𖿱'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), + ('\u{302e}', '\u{302f}'), + ('ㄱ', 'ㆎ'), + ('㈀', '㈞'), + ('㉠', '㉾'), + ('ꥠ', 'ꥼ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), +]; + +pub const HANIFI_ROHINGYA: &'static [(char, char)] = + &[('𐴀', '\u{10d27}'), ('𐴰', '𐴹')]; + +pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜴')]; + +pub const HATRAN: &'static [(char, char)] = + &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('\u{591}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', '״'), + ('יִ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('𛀁', '𛄟'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = + &[('𐡀', '𐡕'), ('𐡗', '𐡟')]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{485}', '\u{486}'), + ('\u{64b}', '\u{655}'), + ('\u{670}', '\u{670}'), + ('\u{951}', '\u{954}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce0}'), + ('\u{1ce2}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{302a}', '\u{302d}'), + ('\u{3099}', '\u{309a}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2d}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{1133b}', '\u{1133b}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = + &[('𐭠', '𐭲'), ('𐭸', '𐭿')]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = + &[('𐭀', '𐭕'), ('𐭘', '𐭟')]; + +pub const JAVANESE: &'static [(char, char)] = + &[('\u{a980}', '꧍'), ('꧐', '꧙'), ('꧞', '꧟')]; + +pub const KAITHI: &'static [(char, char)] = + &[('\u{11080}', '\u{110c2}'), ('\u{110cd}', '\u{110cd}')]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('ಀ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('ァ', 'ヺ'), + ('ヽ', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㋐', '㋾'), + ('㌀', '㍗'), + ('ヲ', 'ッ'), + ('ア', 'ン'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛀀'), + ('𛄠', '𛄢'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), +]; + +pub const KAWI: &'static [(char, char)] = + &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')]; + +pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '\u{a92d}'), ('꤯', '꤯')]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '𐩈'), + ('𐩐', '𐩘'), +]; + +pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = + &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')]; + +pub const KHMER: &'static [(char, char)] = + &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')]; + +pub const KHOJKI: &'static [(char, char)] = &[('𑈀', '𑈑'), ('𑈓', '\u{11241}')]; + +pub const KHUDAWADI: &'static [(char, char)] = + &[('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ʸ'), + ('ˠ', 'ˤ'), + ('ᴀ', 'ᴥ'), + ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶾ'), + ('Ḁ', 'ỿ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⱡ', 'Ɀ'), + ('Ꜣ', 'ꞇ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟿ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭤ'), + ('ꭦ', 'ꭩ'), + ('ff', 'st'), + ('A', 'Z'), + ('a', 'z'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), +]; + +pub const LEPCHA: &'static [(char, char)] = + &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥀', '᥀'), + ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = + &[('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), +]; + +pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')]; + +pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; + +pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; + +pub const MAHAJANI: &'static [(char, char)] = &[('𑅐', '𑅶')]; + +pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '൏'), + ('ൔ', '\u{d63}'), + ('൦', 'ൿ'), +]; + +pub const MANDAIC: &'static [(char, char)] = &[('ࡀ', '\u{85b}'), ('࡞', '࡞')]; + +pub const MANICHAEAN: &'static [(char, char)] = + &[('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; + +pub const MARCHEN: &'static [(char, char)] = + &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), +]; + +pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = + &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = + &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = + &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; + +pub const MIAO: &'static [(char, char)] = + &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')]; + +pub const MODI: &'static [(char, char)] = &[('𑘀', '𑙄'), ('𑙐', '𑙙')]; + +pub const MONGOLIAN: &'static [(char, char)] = + &[('᠀', '᠁'), ('᠄', '᠄'), ('᠆', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('𑙠', '𑙬')]; + +pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; + +pub const MULTANI: &'static [(char, char)] = + &[('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; + +pub const MYANMAR: &'static [(char, char)] = + &[('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; + +pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; + +pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')]; + +pub const NANDINAGARI: &'static [(char, char)] = + &[('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤')]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = + &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')]; + +pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')]; + +pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')]; + +pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; + +pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = + &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')]; + +pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')]; + +pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = + &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; + +pub const OLD_PERMIC: &'static [(char, char)] = &[('𐍐', '\u{1037a}')]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; + +pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')]; + +pub const OLD_UYGHUR: &'static [(char, char)] = &[('𐽰', '𐾉')]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୷'), +]; + +pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; + +pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; + +pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; + +pub const PHAGS_PA: &'static [(char, char)] = &[('ꡀ', '꡷')]; + +pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = + &[('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; + +pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; + +pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; + +pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; + +pub const SAURASHTRA: &'static [(char, char)] = + &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; + +pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', '𑇟')]; + +pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; + +pub const SIDDHAM: &'static [(char, char)] = + &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; + +pub const SIGNWRITING: &'static [(char, char)] = + &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', '෴'), + ('𑇡', '𑇴'), +]; + +pub const SOGDIAN: &'static [(char, char)] = &[('𐼰', '𐽙')]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; + +pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; + +pub const SUNDANESE: &'static [(char, char)] = + &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '\u{a82c}')]; + +pub const SYRIAC: &'static [(char, char)] = + &[('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ')]; + +pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ')]; + +pub const TAGBANWA: &'static [(char, char)] = + &[('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; + +pub const TAI_LE: &'static [(char, char)] = &[('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; + +pub const TAKRI: &'static [(char, char)] = &[('𑚀', '𑚹'), ('𑛀', '𑛉')]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௺'), + ('𑿀', '𑿱'), + ('𑿿', '𑿿'), +]; + +pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')]; + +pub const TANGUT: &'static [(char, char)] = + &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('౷', '౿'), +]; + +pub const THAANA: &'static [(char, char)] = &[('ހ', 'ޱ')]; + +pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('྾', '࿌'), + ('࿎', '࿔'), + ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = + &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; + +pub const TIRHUTA: &'static [(char, char)] = &[('𑒀', '𑓇'), ('𑓐', '𑓙')]; + +pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')]; + +pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; + +pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; + +pub const VITHKUQI: &'static [(char, char)] = &[ + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), +]; + +pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')]; + +pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; + +pub const YEZIDI: &'static [(char, char)] = + &[('𐺀', '𐺩'), ('\u{10eab}', '𐺭'), ('𐺰', '𐺱')]; + +pub const YI: &'static [(char, char)] = &[('ꀀ', 'ꒌ'), ('꒐', '꓆')]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/script_extension.rs b/vendor/regex-syntax/src/unicode_tables/script_extension.rs new file mode 100644 index 0000000000..42625e21b9 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/script_extension.rs @@ -0,0 +1,1457 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate script-extension ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cypro_Minoan", CYPRO_MINOAN), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kawi", KAWI), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nag_Mundari", NAG_MUNDARI), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Old_Uyghur", OLD_UYGHUR), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangsa", TANGSA), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Toto", TOTO), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Vithkuqi", VITHKUQI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), +]; + +pub const ADLAM: &'static [(char, char)] = + &[('؟', '؟'), ('ـ', 'ـ'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')]; + +pub const AHOM: &'static [(char, char)] = + &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')]; + +pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')]; + +pub const ARABIC: &'static [(char, char)] = &[ + ('\u{600}', '\u{604}'), + ('؆', '\u{6dc}'), + ('۞', 'ۿ'), + ('ݐ', 'ݿ'), + ('ࡰ', 'ࢎ'), + ('\u{890}', '\u{891}'), + ('\u{898}', '\u{8e1}'), + ('\u{8e3}', '\u{8ff}'), + ('ﭐ', '﯂'), + ('ﯓ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('﷏', '﷏'), + ('ﷰ', '﷿'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('\u{102e0}', '𐋻'), + ('𐹠', '𐹾'), + ('\u{10efd}', '\u{10eff}'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𞻰', '𞻱'), +]; + +pub const ARMENIAN: &'static [(char, char)] = + &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; + +pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; + +pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')]; + +pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; + +pub const BASSA_VAH: &'static [(char, char)] = + &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')]; + +pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')]; + +pub const BENGALI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('ঀ', 'ঃ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('\u{9bc}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', 'ৎ'), + ('\u{9d7}', '\u{9d7}'), + ('ড়', 'ঢ়'), + ('য়', '\u{9e3}'), + ('০', '\u{9fe}'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '\u{1cd2}'), + ('\u{1cd5}', '\u{1cd6}'), + ('\u{1cd8}', '\u{1cd8}'), + ('᳡', '᳡'), + ('ᳪ', 'ᳪ'), + ('\u{1ced}', '\u{1ced}'), + ('ᳲ', 'ᳲ'), + ('ᳵ', '᳷'), + ('\u{a8f1}', '\u{a8f1}'), +]; + +pub const BHAIKSUKI: &'static [(char, char)] = + &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; + +pub const BOPOMOFO: &'static [(char, char)] = &[ + ('˪', '˫'), + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('\u{302a}', '\u{302d}'), + ('〰', '〰'), + ('〷', '〷'), + ('・', '・'), + ('ㄅ', 'ㄯ'), + ('ㆠ', 'ㆿ'), + ('﹅', '﹆'), + ('。', '・'), +]; + +pub const BRAHMI: &'static [(char, char)] = + &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')]; + +pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; + +pub const BUGINESE: &'static [(char, char)] = + &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ꧏ', 'ꧏ')]; + +pub const BUHID: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝀ', '\u{1753}')]; + +pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = + &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')]; + +pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; + +pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = + &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; + +pub const CHAKMA: &'static [(char, char)] = + &[('০', '৯'), ('၀', '၉'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')]; + +pub const CHAM: &'static [(char, char)] = + &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; + +pub const CHEROKEE: &'static [(char, char)] = + &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')]; + +pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')]; + +pub const COMMON: &'static [(char, char)] = &[ + ('\0', '@'), + ('[', '`'), + ('{', '©'), + ('«', '¹'), + ('»', '¿'), + ('×', '×'), + ('÷', '÷'), + ('ʹ', '˟'), + ('˥', '˩'), + ('ˬ', '˿'), + ('ʹ', 'ʹ'), + (';', ';'), + ('΅', '΅'), + ('·', '·'), + ('\u{605}', '\u{605}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{8e2}', '\u{8e2}'), + ('฿', '฿'), + ('࿕', '࿘'), + ('᛫', '᛭'), + ('\u{2000}', '\u{200b}'), + ('\u{200e}', '\u{202e}'), + ('‰', '\u{2064}'), + ('\u{2066}', '⁰'), + ('⁴', '⁾'), + ('₀', '₎'), + ('₠', '⃀'), + ('℀', '℥'), + ('℧', '℩'), + ('ℬ', 'ℱ'), + ('ℳ', '⅍'), + ('⅏', '⅟'), + ('↉', '↋'), + ('←', '␦'), + ('⑀', '⑊'), + ('①', '⟿'), + ('⤀', '⭳'), + ('⭶', '⮕'), + ('⮗', '⯿'), + ('⸀', '⹂'), + ('⹄', '⹝'), + ('⿰', '⿻'), + ('\u{3000}', '\u{3000}'), + ('〄', '〄'), + ('〒', '〒'), + ('〠', '〠'), + ('〶', '〶'), + ('㉈', '㉟'), + ('㉿', '㉿'), + ('㊱', '㊿'), + ('㋌', '㋏'), + ('㍱', '㍺'), + ('㎀', '㏟'), + ('㏿', '㏿'), + ('䷀', '䷿'), + ('꜈', '꜡'), + ('ꞈ', '꞊'), + ('꭛', '꭛'), + ('꭪', '꭫'), + ('︐', '︙'), + ('︰', '﹄'), + ('﹇', '﹒'), + ('﹔', '﹦'), + ('﹨', '﹫'), + ('\u{feff}', '\u{feff}'), + ('!', '@'), + ('[', '`'), + ('{', '⦆'), + ('¢', '₩'), + ('│', '○'), + ('\u{fff9}', '�'), + ('𐆐', '𐆜'), + ('𐇐', '𐇼'), + ('𜽐', '𜿃'), + ('𝀀', '𝃵'), + ('𝄀', '𝄦'), + ('𝄩', '𝅦'), + ('𝅪', '\u{1d17a}'), + ('𝆃', '𝆄'), + ('𝆌', '𝆩'), + ('𝆮', '𝇪'), + ('𝋀', '𝋓'), + ('𝋠', '𝋳'), + ('𝌀', '𝍖'), + ('𝍲', '𝍸'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝟋'), + ('𝟎', '𝟿'), + ('𞱱', '𞲴'), + ('𞴁', '𞴽'), + ('🀀', '🀫'), + ('🀰', '🂓'), + ('🂠', '🂮'), + ('🂱', '🂿'), + ('🃁', '🃏'), + ('🃑', '🃵'), + ('🄀', '🆭'), + ('🇦', '🇿'), + ('🈁', '🈂'), + ('🈐', '🈻'), + ('🉀', '🉈'), + ('🉠', '🉥'), + ('🌀', '🛗'), + ('🛜', '🛬'), + ('🛰', '🛼'), + ('🜀', '🝶'), + ('🝻', '🟙'), + ('🟠', '🟫'), + ('🟰', '🟰'), + ('🠀', '🠋'), + ('🠐', '🡇'), + ('🡐', '🡙'), + ('🡠', '🢇'), + ('🢐', '🢭'), + ('🢰', '🢱'), + ('🤀', '🩓'), + ('🩠', '🩭'), + ('🩰', '🩼'), + ('🪀', '🪈'), + ('🪐', '🪽'), + ('🪿', '🫅'), + ('🫎', '🫛'), + ('🫠', '🫨'), + ('🫰', '🫸'), + ('🬀', '🮒'), + ('🮔', '🯊'), + ('🯰', '🯹'), + ('\u{e0001}', '\u{e0001}'), + ('\u{e0020}', '\u{e007f}'), +]; + +pub const COPTIC: &'static [(char, char)] = + &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), ('\u{102e0}', '𐋻')]; + +pub const CUNEIFORM: &'static [(char, char)] = + &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; + +pub const CYPRIOT: &'static [(char, char)] = &[ + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐠿'), +]; + +pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𐄀', '𐄁'), ('𒾐', '𒿲')]; + +pub const CYRILLIC: &'static [(char, char)] = &[ + ('Ѐ', 'ԯ'), + ('ᲀ', 'ᲈ'), + ('ᴫ', 'ᴫ'), + ('ᵸ', 'ᵸ'), + ('\u{1df8}', '\u{1df8}'), + ('\u{2de0}', '\u{2dff}'), + ('⹃', '⹃'), + ('Ꙁ', '\u{a69f}'), + ('\u{fe2e}', '\u{fe2f}'), + ('𞀰', '𞁭'), + ('\u{1e08f}', '\u{1e08f}'), +]; + +pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')]; + +pub const DEVANAGARI: &'static [(char, char)] = &[ + ('\u{900}', '\u{952}'), + ('\u{955}', 'ॿ'), + ('\u{1cd0}', 'ᳶ'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{20f0}', '\u{20f0}'), + ('꠰', '꠹'), + ('\u{a8e0}', '\u{a8ff}'), + ('𑬀', '𑬉'), +]; + +pub const DIVES_AKURU: &'static [(char, char)] = &[ + ('𑤀', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '𑥆'), + ('𑥐', '𑥙'), +]; + +pub const DOGRA: &'static [(char, char)] = + &[('।', '९'), ('꠰', '꠹'), ('𑠀', '𑠻')]; + +pub const DUPLOYAN: &'static [(char, char)] = + &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}')]; + +pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = + &[('𓀀', '\u{13455}')]; + +pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')]; + +pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')]; + +pub const ETHIOPIC: &'static [(char, char)] = &[ + ('ሀ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('\u{135d}', '፼'), + ('ᎀ', '᎙'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), +]; + +pub const GEORGIAN: &'static [(char, char)] = &[ + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჿ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), +]; + +pub const GLAGOLITIC: &'static [(char, char)] = &[ + ('\u{484}', '\u{484}'), + ('\u{487}', '\u{487}'), + ('Ⰰ', 'ⱟ'), + ('⹃', '⹃'), + ('\u{a66f}', '\u{a66f}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), +]; + +pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')]; + +pub const GRANTHA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('௦', '௳'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '᳓'), + ('ᳲ', '\u{1cf4}'), + ('\u{1cf8}', '\u{1cf9}'), + ('\u{20f0}', '\u{20f0}'), + ('\u{11300}', '𑌃'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('\u{1133b}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('𑍐', '𑍐'), + ('\u{11357}', '\u{11357}'), + ('𑍝', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑿐', '𑿑'), + ('𑿓', '𑿓'), +]; + +pub const GREEK: &'static [(char, char)] = &[ + ('\u{342}', '\u{342}'), + ('\u{345}', '\u{345}'), + ('Ͱ', 'ͳ'), + ('͵', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('΄', '΄'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϡ'), + ('ϰ', 'Ͽ'), + ('ᴦ', 'ᴪ'), + ('ᵝ', 'ᵡ'), + ('ᵦ', 'ᵪ'), + ('ᶿ', '\u{1dc1}'), + ('ἀ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ῄ'), + ('ῆ', 'ΐ'), + ('ῖ', 'Ί'), + ('῝', '`'), + ('ῲ', 'ῴ'), + ('ῶ', '῾'), + ('Ω', 'Ω'), + ('ꭥ', 'ꭥ'), + ('𐅀', '𐆎'), + ('𐆠', '𐆠'), + ('𝈀', '𝉅'), +]; + +pub const GUJARATI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{a81}', 'ઃ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('\u{abc}', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('ૐ', 'ૐ'), + ('ૠ', '\u{ae3}'), + ('૦', '૱'), + ('ૹ', '\u{aff}'), + ('꠰', '꠹'), +]; + +pub const GUNJALA_GONDI: &'static [(char, char)] = &[ + ('।', '॥'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '𑶘'), + ('𑶠', '𑶩'), +]; + +pub const GURMUKHI: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{a01}', 'ਃ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('੦', '੶'), + ('꠰', '꠹'), +]; + +pub const HAN: &'static [(char, char)] = &[ + ('⺀', '⺙'), + ('⺛', '⻳'), + ('⼀', '⿕'), + ('、', '〃'), + ('々', '】'), + ('〓', '〟'), + ('〡', '\u{302d}'), + ('〰', '〰'), + ('〷', '〿'), + ('・', '・'), + ('㆐', '㆟'), + ('㇀', '㇣'), + ('㈠', '㉇'), + ('㊀', '㊰'), + ('㋀', '㋋'), + ('㋿', '㋿'), + ('㍘', '㍰'), + ('㍻', '㍿'), + ('㏠', '㏾'), + ('㐀', '䶿'), + ('一', '鿿'), + ('꜀', '꜇'), + ('豈', '舘'), + ('並', '龎'), + ('﹅', '﹆'), + ('。', '・'), + ('𖿢', '𖿣'), + ('𖿰', '𖿱'), + ('𝍠', '𝍱'), + ('🉐', '🉑'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const HANGUL: &'static [(char, char)] = &[ + ('ᄀ', 'ᇿ'), + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('\u{302e}', '〰'), + ('〷', '〷'), + ('・', '・'), + ('ㄱ', 'ㆎ'), + ('㈀', '㈞'), + ('㉠', '㉾'), + ('ꥠ', 'ꥼ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('﹅', '﹆'), + ('。', '・'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), +]; + +pub const HANIFI_ROHINGYA: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('ـ', 'ـ'), + ('۔', '۔'), + ('𐴀', '\u{10d27}'), + ('𐴰', '𐴹'), +]; + +pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜶')]; + +pub const HATRAN: &'static [(char, char)] = + &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')]; + +pub const HEBREW: &'static [(char, char)] = &[ + ('\u{591}', '\u{5c7}'), + ('א', 'ת'), + ('ׯ', '״'), + ('יִ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﭏ'), +]; + +pub const HIRAGANA: &'static [(char, char)] = &[ + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('〰', '〵'), + ('〷', '〷'), + ('〼', '〽'), + ('ぁ', 'ゖ'), + ('\u{3099}', '゠'), + ('・', 'ー'), + ('﹅', '﹆'), + ('。', '・'), + ('ー', 'ー'), + ('\u{ff9e}', '\u{ff9f}'), + ('𛀁', '𛄟'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('🈀', '🈀'), +]; + +pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = + &[('𐡀', '𐡕'), ('𐡗', '𐡟')]; + +pub const INHERITED: &'static [(char, char)] = &[ + ('\u{300}', '\u{341}'), + ('\u{343}', '\u{344}'), + ('\u{346}', '\u{362}'), + ('\u{953}', '\u{954}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1dc2}', '\u{1df7}'), + ('\u{1df9}', '\u{1df9}'), + ('\u{1dfb}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20ef}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2d}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d167}', '\u{1d169}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = + &[('𐭠', '𐭲'), ('𐭸', '𐭿')]; + +pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = + &[('𐭀', '𐭕'), ('𐭘', '𐭟')]; + +pub const JAVANESE: &'static [(char, char)] = + &[('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟')]; + +pub const KAITHI: &'static [(char, char)] = &[ + ('०', '९'), + ('꠰', '꠹'), + ('\u{11080}', '\u{110c2}'), + ('\u{110cd}', '\u{110cd}'), +]; + +pub const KANNADA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('ಀ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('\u{cbc}', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('ೝ', 'ೞ'), + ('ೠ', '\u{ce3}'), + ('೦', '೯'), + ('ೱ', 'ೳ'), + ('\u{1cd0}', '\u{1cd0}'), + ('\u{1cd2}', '\u{1cd2}'), + ('\u{1cda}', '\u{1cda}'), + ('ᳲ', 'ᳲ'), + ('\u{1cf4}', '\u{1cf4}'), + ('꠰', '꠵'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('、', '〃'), + ('〈', '】'), + ('〓', '〟'), + ('〰', '〵'), + ('〷', '〷'), + ('〼', '〽'), + ('\u{3099}', '゜'), + ('゠', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㋐', '㋾'), + ('㌀', '㍗'), + ('﹅', '﹆'), + ('。', '\u{ff9f}'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛀀'), + ('𛄠', '𛄢'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), +]; + +pub const KAWI: &'static [(char, char)] = + &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')]; + +pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '꤯')]; + +pub const KHAROSHTHI: &'static [(char, char)] = &[ + ('𐨀', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '𐩈'), + ('𐩐', '𐩘'), +]; + +pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = + &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')]; + +pub const KHMER: &'static [(char, char)] = + &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')]; + +pub const KHOJKI: &'static [(char, char)] = + &[('૦', '૯'), ('꠰', '꠹'), ('𑈀', '𑈑'), ('𑈓', '\u{11241}')]; + +pub const KHUDAWADI: &'static [(char, char)] = + &[('।', '॥'), ('꠰', '꠹'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; + +pub const LAO: &'static [(char, char)] = &[ + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('\u{ec8}', '\u{ece}'), + ('໐', '໙'), + ('ໜ', 'ໟ'), +]; + +pub const LATIN: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', 'ʸ'), + ('ˠ', 'ˤ'), + ('\u{363}', '\u{36f}'), + ('\u{485}', '\u{486}'), + ('\u{951}', '\u{952}'), + ('჻', '჻'), + ('ᴀ', 'ᴥ'), + ('ᴬ', 'ᵜ'), + ('ᵢ', 'ᵥ'), + ('ᵫ', 'ᵷ'), + ('ᵹ', 'ᶾ'), + ('Ḁ', 'ỿ'), + ('\u{202f}', '\u{202f}'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('\u{20f0}', '\u{20f0}'), + ('K', 'Å'), + ('Ⅎ', 'Ⅎ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⱡ', 'Ɀ'), + ('꜀', '꜇'), + ('Ꜣ', 'ꞇ'), + ('Ꞌ', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꟿ'), + ('꤮', '꤮'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭤ'), + ('ꭦ', 'ꭩ'), + ('ff', 'st'), + ('A', 'Z'), + ('a', 'z'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), +]; + +pub const LEPCHA: &'static [(char, char)] = + &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')]; + +pub const LIMBU: &'static [(char, char)] = &[ + ('॥', '॥'), + ('ᤀ', 'ᤞ'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('᥀', '᥀'), + ('᥄', '᥏'), +]; + +pub const LINEAR_A: &'static [(char, char)] = + &[('𐄇', '𐄳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')]; + +pub const LINEAR_B: &'static [(char, char)] = &[ + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐄀', '𐄂'), + ('𐄇', '𐄳'), + ('𐄷', '𐄿'), +]; + +pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')]; + +pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; + +pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; + +pub const MAHAJANI: &'static [(char, char)] = + &[('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶')]; + +pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')]; + +pub const MALAYALAM: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{d00}', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '൏'), + ('ൔ', '\u{d63}'), + ('൦', 'ൿ'), + ('\u{1cda}', '\u{1cda}'), + ('꠰', '꠲'), +]; + +pub const MANDAIC: &'static [(char, char)] = + &[('ـ', 'ـ'), ('ࡀ', '\u{85b}'), ('࡞', '࡞')]; + +pub const MANICHAEAN: &'static [(char, char)] = + &[('ـ', 'ـ'), ('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; + +pub const MARCHEN: &'static [(char, char)] = + &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')]; + +pub const MASARAM_GONDI: &'static [(char, char)] = &[ + ('।', '॥'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d47}'), + ('𑵐', '𑵙'), +]; + +pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')]; + +pub const MEETEI_MAYEK: &'static [(char, char)] = + &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')]; + +pub const MENDE_KIKAKUI: &'static [(char, char)] = + &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')]; + +pub const MEROITIC_CURSIVE: &'static [(char, char)] = + &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')]; + +pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; + +pub const MIAO: &'static [(char, char)] = + &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')]; + +pub const MODI: &'static [(char, char)] = + &[('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙')]; + +pub const MONGOLIAN: &'static [(char, char)] = &[ + ('᠀', '᠙'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢪ'), + ('\u{202f}', '\u{202f}'), + ('𑙠', '𑙬'), +]; + +pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; + +pub const MULTANI: &'static [(char, char)] = + &[('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; + +pub const MYANMAR: &'static [(char, char)] = + &[('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; + +pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; + +pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')]; + +pub const NANDINAGARI: &'static [(char, char)] = &[ + ('।', '॥'), + ('೦', '೯'), + ('ᳩ', 'ᳩ'), + ('ᳲ', 'ᳲ'), + ('ᳺ', 'ᳺ'), + ('꠰', '꠵'), + ('𑦠', '𑦧'), + ('𑦪', '\u{119d7}'), + ('\u{119da}', '𑧤'), +]; + +pub const NEW_TAI_LUE: &'static [(char, char)] = + &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')]; + +pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')]; + +pub const NKO: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('߀', 'ߺ'), + ('\u{7fd}', '߿'), + ('﴾', '﴿'), +]; + +pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; + +pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = + &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')]; + +pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')]; + +pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; + +pub const OLD_HUNGARIAN: &'static [(char, char)] = + &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; + +pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; + +pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; + +pub const OLD_PERMIC: &'static [(char, char)] = + &[('\u{483}', '\u{483}'), ('𐍐', '\u{1037a}')]; + +pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; + +pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; + +pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')]; + +pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')]; + +pub const OLD_UYGHUR: &'static [(char, char)] = + &[('ـ', 'ـ'), ('𐫲', '𐫲'), ('𐽰', '𐾉')]; + +pub const ORIYA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{b01}', 'ଃ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('\u{b3c}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', '\u{b63}'), + ('୦', '୷'), + ('\u{1cda}', '\u{1cda}'), + ('ᳲ', 'ᳲ'), +]; + +pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; + +pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; + +pub const PAHAWH_HMONG: &'static [(char, char)] = + &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; + +pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; + +pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; + +pub const PHAGS_PA: &'static [(char, char)] = + &[('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷')]; + +pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; + +pub const PSALTER_PAHLAVI: &'static [(char, char)] = + &[('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; + +pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; + +pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; + +pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; + +pub const SAURASHTRA: &'static [(char, char)] = + &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; + +pub const SHARADA: &'static [(char, char)] = &[ + ('\u{951}', '\u{951}'), + ('\u{1cd7}', '\u{1cd7}'), + ('\u{1cd9}', '\u{1cd9}'), + ('\u{1cdc}', '\u{1cdd}'), + ('\u{1ce0}', '\u{1ce0}'), + ('\u{11180}', '𑇟'), +]; + +pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; + +pub const SIDDHAM: &'static [(char, char)] = + &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; + +pub const SIGNWRITING: &'static [(char, char)] = + &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; + +pub const SINHALA: &'static [(char, char)] = &[ + ('।', '॥'), + ('\u{d81}', 'ඃ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('෦', '෯'), + ('ෲ', '෴'), + ('𑇡', '𑇴'), +]; + +pub const SOGDIAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐼰', '𐽙')]; + +pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; + +pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; + +pub const SUNDANESE: &'static [(char, char)] = + &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')]; + +pub const SYLOTI_NAGRI: &'static [(char, char)] = + &[('।', '॥'), ('০', '৯'), ('ꠀ', '\u{a82c}')]; + +pub const SYRIAC: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '\u{61c}'), + ('؟', '؟'), + ('ـ', 'ـ'), + ('\u{64b}', '\u{655}'), + ('\u{670}', '\u{670}'), + ('܀', '܍'), + ('\u{70f}', '\u{74a}'), + ('ݍ', 'ݏ'), + ('ࡠ', 'ࡪ'), + ('\u{1df8}', '\u{1df8}'), + ('\u{1dfa}', '\u{1dfa}'), +]; + +pub const TAGALOG: &'static [(char, char)] = + &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ'), ('᜵', '᜶')]; + +pub const TAGBANWA: &'static [(char, char)] = + &[('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; + +pub const TAI_LE: &'static [(char, char)] = + &[('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; + +pub const TAI_THAM: &'static [(char, char)] = &[ + ('ᨠ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '᪉'), + ('᪐', '᪙'), + ('᪠', '᪭'), +]; + +pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; + +pub const TAKRI: &'static [(char, char)] = + &[('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚹'), ('𑛀', '𑛉')]; + +pub const TAMIL: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{b82}', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('ௐ', 'ௐ'), + ('\u{bd7}', '\u{bd7}'), + ('௦', '௺'), + ('\u{1cda}', '\u{1cda}'), + ('ꣳ', 'ꣳ'), + ('\u{11301}', '\u{11301}'), + ('𑌃', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('𑿀', '𑿱'), + ('𑿿', '𑿿'), +]; + +pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')]; + +pub const TANGUT: &'static [(char, char)] = + &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')]; + +pub const TELUGU: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('\u{c00}', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('\u{c3c}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', '\u{c63}'), + ('౦', '౯'), + ('౷', '౿'), + ('\u{1cda}', '\u{1cda}'), + ('ᳲ', 'ᳲ'), +]; + +pub const THAANA: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '\u{61c}'), + ('؟', '؟'), + ('٠', '٩'), + ('ހ', 'ޱ'), + ('ﷲ', 'ﷲ'), + ('﷽', '﷽'), +]; + +pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; + +pub const TIBETAN: &'static [(char, char)] = &[ + ('ༀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('\u{f71}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('྾', '࿌'), + ('࿎', '࿔'), + ('࿙', '࿚'), +]; + +pub const TIFINAGH: &'static [(char, char)] = + &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; + +pub const TIRHUTA: &'static [(char, char)] = &[ + ('\u{951}', '\u{952}'), + ('।', '॥'), + ('ᳲ', 'ᳲ'), + ('꠰', '꠹'), + ('𑒀', '𑓇'), + ('𑓐', '𑓙'), +]; + +pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')]; + +pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; + +pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; + +pub const VITHKUQI: &'static [(char, char)] = &[ + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), +]; + +pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')]; + +pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; + +pub const YEZIDI: &'static [(char, char)] = &[ + ('،', '،'), + ('؛', '؛'), + ('؟', '؟'), + ('٠', '٩'), + ('𐺀', '𐺩'), + ('\u{10eab}', '𐺭'), + ('𐺰', '𐺱'), +]; + +pub const YI: &'static [(char, char)] = &[ + ('、', '。'), + ('〈', '】'), + ('〔', '〛'), + ('・', '・'), + ('ꀀ', 'ꒌ'), + ('꒐', '꓆'), + ('。', '・'), +]; + +pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')]; diff --git a/vendor/regex-syntax/src/unicode_tables/sentence_break.rs b/vendor/regex-syntax/src/unicode_tables/sentence_break.rs new file mode 100644 index 0000000000..24348736f2 --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/sentence_break.rs @@ -0,0 +1,2477 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate sentence-break ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ATerm", ATERM), + ("CR", CR), + ("Close", CLOSE), + ("Extend", EXTEND), + ("Format", FORMAT), + ("LF", LF), + ("Lower", LOWER), + ("Numeric", NUMERIC), + ("OLetter", OLETTER), + ("SContinue", SCONTINUE), + ("STerm", STERM), + ("Sep", SEP), + ("Sp", SP), + ("Upper", UPPER), +]; + +pub const ATERM: &'static [(char, char)] = + &[('.', '.'), ('․', '․'), ('﹒', '﹒'), ('.', '.')]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const CLOSE: &'static [(char, char)] = &[ + ('"', '"'), + ('\'', ')'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('«', '«'), + ('»', '»'), + ('༺', '༽'), + ('᚛', '᚜'), + ('‘', '‟'), + ('‹', '›'), + ('⁅', '⁆'), + ('⁽', '⁾'), + ('₍', '₎'), + ('⌈', '⌋'), + ('〈', '〉'), + ('❛', '❠'), + ('❨', '❵'), + ('⟅', '⟆'), + ('⟦', '⟯'), + ('⦃', '⦘'), + ('⧘', '⧛'), + ('⧼', '⧽'), + ('⸀', '⸍'), + ('⸜', '⸝'), + ('⸠', '⸩'), + ('⹂', '⹂'), + ('⹕', '⹜'), + ('〈', '】'), + ('〔', '〛'), + ('〝', '〟'), + ('﴾', '﴿'), + ('︗', '︘'), + ('︵', '﹄'), + ('﹇', '﹈'), + ('﹙', '﹞'), + ('(', ')'), + ('[', '['), + (']', ']'), + ('{', '{'), + ('}', '}'), + ('⦅', '⦆'), + ('「', '」'), + ('🙶', '🙸'), +]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', 'ः'), + ('\u{93a}', '\u{93c}'), + ('ा', 'ॏ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('\u{abc}', '\u{abc}'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('\u{cbc}', '\u{cbc}'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('ါ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '᜕'), + ('\u{1732}', '᜴'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b34}', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1bad}'), + ('\u{1be6}', '᯳'), + ('ᰤ', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200d}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꢀ', 'ꢁ'), + ('ꢴ', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '꥓'), + ('\u{a980}', 'ꦃ'), + ('\u{a9b3}', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', '\u{aaf6}'), + ('ꯣ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '𑂂'), + ('𑂰', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('𑅅', '𑅆'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑇀'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐵', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('𑠬', '\u{1183a}'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193e}'), + ('𑥀', '𑥀'), + ('𑥂', '\u{11943}'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '\u{11d97}'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200b}', '\u{200b}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const LOWER: &'static [(char, char)] = &[ + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('ß', 'ö'), + ('ø', 'ÿ'), + ('ā', 'ā'), + ('ă', 'ă'), + ('ą', 'ą'), + ('ć', 'ć'), + ('ĉ', 'ĉ'), + ('ċ', 'ċ'), + ('č', 'č'), + ('ď', 'ď'), + ('đ', 'đ'), + ('ē', 'ē'), + ('ĕ', 'ĕ'), + ('ė', 'ė'), + ('ę', 'ę'), + ('ě', 'ě'), + ('ĝ', 'ĝ'), + ('ğ', 'ğ'), + ('ġ', 'ġ'), + ('ģ', 'ģ'), + ('ĥ', 'ĥ'), + ('ħ', 'ħ'), + ('ĩ', 'ĩ'), + ('ī', 'ī'), + ('ĭ', 'ĭ'), + ('į', 'į'), + ('ı', 'ı'), + ('ij', 'ij'), + ('ĵ', 'ĵ'), + ('ķ', 'ĸ'), + ('ĺ', 'ĺ'), + ('ļ', 'ļ'), + ('ľ', 'ľ'), + ('ŀ', 'ŀ'), + ('ł', 'ł'), + ('ń', 'ń'), + ('ņ', 'ņ'), + ('ň', 'ʼn'), + ('ŋ', 'ŋ'), + ('ō', 'ō'), + ('ŏ', 'ŏ'), + ('ő', 'ő'), + ('œ', 'œ'), + ('ŕ', 'ŕ'), + ('ŗ', 'ŗ'), + ('ř', 'ř'), + ('ś', 'ś'), + ('ŝ', 'ŝ'), + ('ş', 'ş'), + ('š', 'š'), + ('ţ', 'ţ'), + ('ť', 'ť'), + ('ŧ', 'ŧ'), + ('ũ', 'ũ'), + ('ū', 'ū'), + ('ŭ', 'ŭ'), + ('ů', 'ů'), + ('ű', 'ű'), + ('ų', 'ų'), + ('ŵ', 'ŵ'), + ('ŷ', 'ŷ'), + ('ź', 'ź'), + ('ż', 'ż'), + ('ž', 'ƀ'), + ('ƃ', 'ƃ'), + ('ƅ', 'ƅ'), + ('ƈ', 'ƈ'), + ('ƌ', 'ƍ'), + ('ƒ', 'ƒ'), + ('ƕ', 'ƕ'), + ('ƙ', 'ƛ'), + ('ƞ', 'ƞ'), + ('ơ', 'ơ'), + ('ƣ', 'ƣ'), + ('ƥ', 'ƥ'), + ('ƨ', 'ƨ'), + ('ƪ', 'ƫ'), + ('ƭ', 'ƭ'), + ('ư', 'ư'), + ('ƴ', 'ƴ'), + ('ƶ', 'ƶ'), + ('ƹ', 'ƺ'), + ('ƽ', 'ƿ'), + ('dž', 'dž'), + ('lj', 'lj'), + ('nj', 'nj'), + ('ǎ', 'ǎ'), + ('ǐ', 'ǐ'), + ('ǒ', 'ǒ'), + ('ǔ', 'ǔ'), + ('ǖ', 'ǖ'), + ('ǘ', 'ǘ'), + ('ǚ', 'ǚ'), + ('ǜ', 'ǝ'), + ('ǟ', 'ǟ'), + ('ǡ', 'ǡ'), + ('ǣ', 'ǣ'), + ('ǥ', 'ǥ'), + ('ǧ', 'ǧ'), + ('ǩ', 'ǩ'), + ('ǫ', 'ǫ'), + ('ǭ', 'ǭ'), + ('ǯ', 'ǰ'), + ('dz', 'dz'), + ('ǵ', 'ǵ'), + ('ǹ', 'ǹ'), + ('ǻ', 'ǻ'), + ('ǽ', 'ǽ'), + ('ǿ', 'ǿ'), + ('ȁ', 'ȁ'), + ('ȃ', 'ȃ'), + ('ȅ', 'ȅ'), + ('ȇ', 'ȇ'), + ('ȉ', 'ȉ'), + ('ȋ', 'ȋ'), + ('ȍ', 'ȍ'), + ('ȏ', 'ȏ'), + ('ȑ', 'ȑ'), + ('ȓ', 'ȓ'), + ('ȕ', 'ȕ'), + ('ȗ', 'ȗ'), + ('ș', 'ș'), + ('ț', 'ț'), + ('ȝ', 'ȝ'), + ('ȟ', 'ȟ'), + ('ȡ', 'ȡ'), + ('ȣ', 'ȣ'), + ('ȥ', 'ȥ'), + ('ȧ', 'ȧ'), + ('ȩ', 'ȩ'), + ('ȫ', 'ȫ'), + ('ȭ', 'ȭ'), + ('ȯ', 'ȯ'), + ('ȱ', 'ȱ'), + ('ȳ', 'ȹ'), + ('ȼ', 'ȼ'), + ('ȿ', 'ɀ'), + ('ɂ', 'ɂ'), + ('ɇ', 'ɇ'), + ('ɉ', 'ɉ'), + ('ɋ', 'ɋ'), + ('ɍ', 'ɍ'), + ('ɏ', 'ʓ'), + ('ʕ', 'ʸ'), + ('ˀ', 'ˁ'), + ('ˠ', 'ˤ'), + ('ͱ', 'ͱ'), + ('ͳ', 'ͳ'), + ('ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('ΐ', 'ΐ'), + ('ά', 'ώ'), + ('ϐ', 'ϑ'), + ('ϕ', 'ϗ'), + ('ϙ', 'ϙ'), + ('ϛ', 'ϛ'), + ('ϝ', 'ϝ'), + ('ϟ', 'ϟ'), + ('ϡ', 'ϡ'), + ('ϣ', 'ϣ'), + ('ϥ', 'ϥ'), + ('ϧ', 'ϧ'), + ('ϩ', 'ϩ'), + ('ϫ', 'ϫ'), + ('ϭ', 'ϭ'), + ('ϯ', 'ϳ'), + ('ϵ', 'ϵ'), + ('ϸ', 'ϸ'), + ('ϻ', 'ϼ'), + ('а', 'џ'), + ('ѡ', 'ѡ'), + ('ѣ', 'ѣ'), + ('ѥ', 'ѥ'), + ('ѧ', 'ѧ'), + ('ѩ', 'ѩ'), + ('ѫ', 'ѫ'), + ('ѭ', 'ѭ'), + ('ѯ', 'ѯ'), + ('ѱ', 'ѱ'), + ('ѳ', 'ѳ'), + ('ѵ', 'ѵ'), + ('ѷ', 'ѷ'), + ('ѹ', 'ѹ'), + ('ѻ', 'ѻ'), + ('ѽ', 'ѽ'), + ('ѿ', 'ѿ'), + ('ҁ', 'ҁ'), + ('ҋ', 'ҋ'), + ('ҍ', 'ҍ'), + ('ҏ', 'ҏ'), + ('ґ', 'ґ'), + ('ғ', 'ғ'), + ('ҕ', 'ҕ'), + ('җ', 'җ'), + ('ҙ', 'ҙ'), + ('қ', 'қ'), + ('ҝ', 'ҝ'), + ('ҟ', 'ҟ'), + ('ҡ', 'ҡ'), + ('ң', 'ң'), + ('ҥ', 'ҥ'), + ('ҧ', 'ҧ'), + ('ҩ', 'ҩ'), + ('ҫ', 'ҫ'), + ('ҭ', 'ҭ'), + ('ү', 'ү'), + ('ұ', 'ұ'), + ('ҳ', 'ҳ'), + ('ҵ', 'ҵ'), + ('ҷ', 'ҷ'), + ('ҹ', 'ҹ'), + ('һ', 'һ'), + ('ҽ', 'ҽ'), + ('ҿ', 'ҿ'), + ('ӂ', 'ӂ'), + ('ӄ', 'ӄ'), + ('ӆ', 'ӆ'), + ('ӈ', 'ӈ'), + ('ӊ', 'ӊ'), + ('ӌ', 'ӌ'), + ('ӎ', 'ӏ'), + ('ӑ', 'ӑ'), + ('ӓ', 'ӓ'), + ('ӕ', 'ӕ'), + ('ӗ', 'ӗ'), + ('ә', 'ә'), + ('ӛ', 'ӛ'), + ('ӝ', 'ӝ'), + ('ӟ', 'ӟ'), + ('ӡ', 'ӡ'), + ('ӣ', 'ӣ'), + ('ӥ', 'ӥ'), + ('ӧ', 'ӧ'), + ('ө', 'ө'), + ('ӫ', 'ӫ'), + ('ӭ', 'ӭ'), + ('ӯ', 'ӯ'), + ('ӱ', 'ӱ'), + ('ӳ', 'ӳ'), + ('ӵ', 'ӵ'), + ('ӷ', 'ӷ'), + ('ӹ', 'ӹ'), + ('ӻ', 'ӻ'), + ('ӽ', 'ӽ'), + ('ӿ', 'ӿ'), + ('ԁ', 'ԁ'), + ('ԃ', 'ԃ'), + ('ԅ', 'ԅ'), + ('ԇ', 'ԇ'), + ('ԉ', 'ԉ'), + ('ԋ', 'ԋ'), + ('ԍ', 'ԍ'), + ('ԏ', 'ԏ'), + ('ԑ', 'ԑ'), + ('ԓ', 'ԓ'), + ('ԕ', 'ԕ'), + ('ԗ', 'ԗ'), + ('ԙ', 'ԙ'), + ('ԛ', 'ԛ'), + ('ԝ', 'ԝ'), + ('ԟ', 'ԟ'), + ('ԡ', 'ԡ'), + ('ԣ', 'ԣ'), + ('ԥ', 'ԥ'), + ('ԧ', 'ԧ'), + ('ԩ', 'ԩ'), + ('ԫ', 'ԫ'), + ('ԭ', 'ԭ'), + ('ԯ', 'ԯ'), + ('ՠ', 'ֈ'), + ('ჼ', 'ჼ'), + ('ᏸ', 'ᏽ'), + ('ᲀ', 'ᲈ'), + ('ᴀ', 'ᶿ'), + ('ḁ', 'ḁ'), + ('ḃ', 'ḃ'), + ('ḅ', 'ḅ'), + ('ḇ', 'ḇ'), + ('ḉ', 'ḉ'), + ('ḋ', 'ḋ'), + ('ḍ', 'ḍ'), + ('ḏ', 'ḏ'), + ('ḑ', 'ḑ'), + ('ḓ', 'ḓ'), + ('ḕ', 'ḕ'), + ('ḗ', 'ḗ'), + ('ḙ', 'ḙ'), + ('ḛ', 'ḛ'), + ('ḝ', 'ḝ'), + ('ḟ', 'ḟ'), + ('ḡ', 'ḡ'), + ('ḣ', 'ḣ'), + ('ḥ', 'ḥ'), + ('ḧ', 'ḧ'), + ('ḩ', 'ḩ'), + ('ḫ', 'ḫ'), + ('ḭ', 'ḭ'), + ('ḯ', 'ḯ'), + ('ḱ', 'ḱ'), + ('ḳ', 'ḳ'), + ('ḵ', 'ḵ'), + ('ḷ', 'ḷ'), + ('ḹ', 'ḹ'), + ('ḻ', 'ḻ'), + ('ḽ', 'ḽ'), + ('ḿ', 'ḿ'), + ('ṁ', 'ṁ'), + ('ṃ', 'ṃ'), + ('ṅ', 'ṅ'), + ('ṇ', 'ṇ'), + ('ṉ', 'ṉ'), + ('ṋ', 'ṋ'), + ('ṍ', 'ṍ'), + ('ṏ', 'ṏ'), + ('ṑ', 'ṑ'), + ('ṓ', 'ṓ'), + ('ṕ', 'ṕ'), + ('ṗ', 'ṗ'), + ('ṙ', 'ṙ'), + ('ṛ', 'ṛ'), + ('ṝ', 'ṝ'), + ('ṟ', 'ṟ'), + ('ṡ', 'ṡ'), + ('ṣ', 'ṣ'), + ('ṥ', 'ṥ'), + ('ṧ', 'ṧ'), + ('ṩ', 'ṩ'), + ('ṫ', 'ṫ'), + ('ṭ', 'ṭ'), + ('ṯ', 'ṯ'), + ('ṱ', 'ṱ'), + ('ṳ', 'ṳ'), + ('ṵ', 'ṵ'), + ('ṷ', 'ṷ'), + ('ṹ', 'ṹ'), + ('ṻ', 'ṻ'), + ('ṽ', 'ṽ'), + ('ṿ', 'ṿ'), + ('ẁ', 'ẁ'), + ('ẃ', 'ẃ'), + ('ẅ', 'ẅ'), + ('ẇ', 'ẇ'), + ('ẉ', 'ẉ'), + ('ẋ', 'ẋ'), + ('ẍ', 'ẍ'), + ('ẏ', 'ẏ'), + ('ẑ', 'ẑ'), + ('ẓ', 'ẓ'), + ('ẕ', 'ẝ'), + ('ẟ', 'ẟ'), + ('ạ', 'ạ'), + ('ả', 'ả'), + ('ấ', 'ấ'), + ('ầ', 'ầ'), + ('ẩ', 'ẩ'), + ('ẫ', 'ẫ'), + ('ậ', 'ậ'), + ('ắ', 'ắ'), + ('ằ', 'ằ'), + ('ẳ', 'ẳ'), + ('ẵ', 'ẵ'), + ('ặ', 'ặ'), + ('ẹ', 'ẹ'), + ('ẻ', 'ẻ'), + ('ẽ', 'ẽ'), + ('ế', 'ế'), + ('ề', 'ề'), + ('ể', 'ể'), + ('ễ', 'ễ'), + ('ệ', 'ệ'), + ('ỉ', 'ỉ'), + ('ị', 'ị'), + ('ọ', 'ọ'), + ('ỏ', 'ỏ'), + ('ố', 'ố'), + ('ồ', 'ồ'), + ('ổ', 'ổ'), + ('ỗ', 'ỗ'), + ('ộ', 'ộ'), + ('ớ', 'ớ'), + ('ờ', 'ờ'), + ('ở', 'ở'), + ('ỡ', 'ỡ'), + ('ợ', 'ợ'), + ('ụ', 'ụ'), + ('ủ', 'ủ'), + ('ứ', 'ứ'), + ('ừ', 'ừ'), + ('ử', 'ử'), + ('ữ', 'ữ'), + ('ự', 'ự'), + ('ỳ', 'ỳ'), + ('ỵ', 'ỵ'), + ('ỷ', 'ỷ'), + ('ỹ', 'ỹ'), + ('ỻ', 'ỻ'), + ('ỽ', 'ỽ'), + ('ỿ', 'ἇ'), + ('ἐ', 'ἕ'), + ('ἠ', 'ἧ'), + ('ἰ', 'ἷ'), + ('ὀ', 'ὅ'), + ('ὐ', 'ὗ'), + ('ὠ', 'ὧ'), + ('ὰ', 'ώ'), + ('ᾀ', 'ᾇ'), + ('ᾐ', 'ᾗ'), + ('ᾠ', 'ᾧ'), + ('ᾰ', 'ᾴ'), + ('ᾶ', 'ᾷ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῇ'), + ('ῐ', 'ΐ'), + ('ῖ', 'ῗ'), + ('ῠ', 'ῧ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῷ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℊ', 'ℊ'), + ('ℎ', 'ℏ'), + ('ℓ', 'ℓ'), + ('ℯ', 'ℯ'), + ('ℴ', 'ℴ'), + ('ℹ', 'ℹ'), + ('ℼ', 'ℽ'), + ('ⅆ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('ⅰ', 'ⅿ'), + ('ↄ', 'ↄ'), + ('ⓐ', 'ⓩ'), + ('ⰰ', 'ⱟ'), + ('ⱡ', 'ⱡ'), + ('ⱥ', 'ⱦ'), + ('ⱨ', 'ⱨ'), + ('ⱪ', 'ⱪ'), + ('ⱬ', 'ⱬ'), + ('ⱱ', 'ⱱ'), + ('ⱳ', 'ⱴ'), + ('ⱶ', 'ⱽ'), + ('ⲁ', 'ⲁ'), + ('ⲃ', 'ⲃ'), + ('ⲅ', 'ⲅ'), + ('ⲇ', 'ⲇ'), + ('ⲉ', 'ⲉ'), + ('ⲋ', 'ⲋ'), + ('ⲍ', 'ⲍ'), + ('ⲏ', 'ⲏ'), + ('ⲑ', 'ⲑ'), + ('ⲓ', 'ⲓ'), + ('ⲕ', 'ⲕ'), + ('ⲗ', 'ⲗ'), + ('ⲙ', 'ⲙ'), + ('ⲛ', 'ⲛ'), + ('ⲝ', 'ⲝ'), + ('ⲟ', 'ⲟ'), + ('ⲡ', 'ⲡ'), + ('ⲣ', 'ⲣ'), + ('ⲥ', 'ⲥ'), + ('ⲧ', 'ⲧ'), + ('ⲩ', 'ⲩ'), + ('ⲫ', 'ⲫ'), + ('ⲭ', 'ⲭ'), + ('ⲯ', 'ⲯ'), + ('ⲱ', 'ⲱ'), + ('ⲳ', 'ⲳ'), + ('ⲵ', 'ⲵ'), + ('ⲷ', 'ⲷ'), + ('ⲹ', 'ⲹ'), + ('ⲻ', 'ⲻ'), + ('ⲽ', 'ⲽ'), + ('ⲿ', 'ⲿ'), + ('ⳁ', 'ⳁ'), + ('ⳃ', 'ⳃ'), + ('ⳅ', 'ⳅ'), + ('ⳇ', 'ⳇ'), + ('ⳉ', 'ⳉ'), + ('ⳋ', 'ⳋ'), + ('ⳍ', 'ⳍ'), + ('ⳏ', 'ⳏ'), + ('ⳑ', 'ⳑ'), + ('ⳓ', 'ⳓ'), + ('ⳕ', 'ⳕ'), + ('ⳗ', 'ⳗ'), + ('ⳙ', 'ⳙ'), + ('ⳛ', 'ⳛ'), + ('ⳝ', 'ⳝ'), + ('ⳟ', 'ⳟ'), + ('ⳡ', 'ⳡ'), + ('ⳣ', 'ⳤ'), + ('ⳬ', 'ⳬ'), + ('ⳮ', 'ⳮ'), + ('ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ꙁ', 'ꙁ'), + ('ꙃ', 'ꙃ'), + ('ꙅ', 'ꙅ'), + ('ꙇ', 'ꙇ'), + ('ꙉ', 'ꙉ'), + ('ꙋ', 'ꙋ'), + ('ꙍ', 'ꙍ'), + ('ꙏ', 'ꙏ'), + ('ꙑ', 'ꙑ'), + ('ꙓ', 'ꙓ'), + ('ꙕ', 'ꙕ'), + ('ꙗ', 'ꙗ'), + ('ꙙ', 'ꙙ'), + ('ꙛ', 'ꙛ'), + ('ꙝ', 'ꙝ'), + ('ꙟ', 'ꙟ'), + ('ꙡ', 'ꙡ'), + ('ꙣ', 'ꙣ'), + ('ꙥ', 'ꙥ'), + ('ꙧ', 'ꙧ'), + ('ꙩ', 'ꙩ'), + ('ꙫ', 'ꙫ'), + ('ꙭ', 'ꙭ'), + ('ꚁ', 'ꚁ'), + ('ꚃ', 'ꚃ'), + ('ꚅ', 'ꚅ'), + ('ꚇ', 'ꚇ'), + ('ꚉ', 'ꚉ'), + ('ꚋ', 'ꚋ'), + ('ꚍ', 'ꚍ'), + ('ꚏ', 'ꚏ'), + ('ꚑ', 'ꚑ'), + ('ꚓ', 'ꚓ'), + ('ꚕ', 'ꚕ'), + ('ꚗ', 'ꚗ'), + ('ꚙ', 'ꚙ'), + ('ꚛ', 'ꚝ'), + ('ꜣ', 'ꜣ'), + ('ꜥ', 'ꜥ'), + ('ꜧ', 'ꜧ'), + ('ꜩ', 'ꜩ'), + ('ꜫ', 'ꜫ'), + ('ꜭ', 'ꜭ'), + ('ꜯ', 'ꜱ'), + ('ꜳ', 'ꜳ'), + ('ꜵ', 'ꜵ'), + ('ꜷ', 'ꜷ'), + ('ꜹ', 'ꜹ'), + ('ꜻ', 'ꜻ'), + ('ꜽ', 'ꜽ'), + ('ꜿ', 'ꜿ'), + ('ꝁ', 'ꝁ'), + ('ꝃ', 'ꝃ'), + ('ꝅ', 'ꝅ'), + ('ꝇ', 'ꝇ'), + ('ꝉ', 'ꝉ'), + ('ꝋ', 'ꝋ'), + ('ꝍ', 'ꝍ'), + ('ꝏ', 'ꝏ'), + ('ꝑ', 'ꝑ'), + ('ꝓ', 'ꝓ'), + ('ꝕ', 'ꝕ'), + ('ꝗ', 'ꝗ'), + ('ꝙ', 'ꝙ'), + ('ꝛ', 'ꝛ'), + ('ꝝ', 'ꝝ'), + ('ꝟ', 'ꝟ'), + ('ꝡ', 'ꝡ'), + ('ꝣ', 'ꝣ'), + ('ꝥ', 'ꝥ'), + ('ꝧ', 'ꝧ'), + ('ꝩ', 'ꝩ'), + ('ꝫ', 'ꝫ'), + ('ꝭ', 'ꝭ'), + ('ꝯ', 'ꝸ'), + ('ꝺ', 'ꝺ'), + ('ꝼ', 'ꝼ'), + ('ꝿ', 'ꝿ'), + ('ꞁ', 'ꞁ'), + ('ꞃ', 'ꞃ'), + ('ꞅ', 'ꞅ'), + ('ꞇ', 'ꞇ'), + ('ꞌ', 'ꞌ'), + ('ꞎ', 'ꞎ'), + ('ꞑ', 'ꞑ'), + ('ꞓ', 'ꞕ'), + ('ꞗ', 'ꞗ'), + ('ꞙ', 'ꞙ'), + ('ꞛ', 'ꞛ'), + ('ꞝ', 'ꞝ'), + ('ꞟ', 'ꞟ'), + ('ꞡ', 'ꞡ'), + ('ꞣ', 'ꞣ'), + ('ꞥ', 'ꞥ'), + ('ꞧ', 'ꞧ'), + ('ꞩ', 'ꞩ'), + ('ꞯ', 'ꞯ'), + ('ꞵ', 'ꞵ'), + ('ꞷ', 'ꞷ'), + ('ꞹ', 'ꞹ'), + ('ꞻ', 'ꞻ'), + ('ꞽ', 'ꞽ'), + ('ꞿ', 'ꞿ'), + ('ꟁ', 'ꟁ'), + ('ꟃ', 'ꟃ'), + ('ꟈ', 'ꟈ'), + ('ꟊ', 'ꟊ'), + ('ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟕ'), + ('ꟗ', 'ꟗ'), + ('ꟙ', 'ꟙ'), + ('ꟲ', 'ꟴ'), + ('ꟶ', 'ꟶ'), + ('ꟸ', 'ꟺ'), + ('ꬰ', 'ꭚ'), + ('ꭜ', 'ꭩ'), + ('ꭰ', 'ꮿ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('a', 'z'), + ('𐐨', '𐑏'), + ('𐓘', '𐓻'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐞀', '𐞀'), + ('𐞃', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐳀', '𐳲'), + ('𑣀', '𑣟'), + ('𖹠', '𖹿'), + ('𝐚', '𝐳'), + ('𝑎', '𝑔'), + ('𝑖', '𝑧'), + ('𝒂', '𝒛'), + ('𝒶', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝓏'), + ('𝓪', '𝔃'), + ('𝔞', '𝔷'), + ('𝕒', '𝕫'), + ('𝖆', '𝖟'), + ('𝖺', '𝗓'), + ('𝗮', '𝘇'), + ('𝘢', '𝘻'), + ('𝙖', '𝙯'), + ('𝚊', '𝚥'), + ('𝛂', '𝛚'), + ('𝛜', '𝛡'), + ('𝛼', '𝜔'), + ('𝜖', '𝜛'), + ('𝜶', '𝝎'), + ('𝝐', '𝝕'), + ('𝝰', '𝞈'), + ('𝞊', '𝞏'), + ('𝞪', '𝟂'), + ('𝟄', '𝟉'), + ('𝟋', '𝟋'), + ('𝼀', '𝼉'), + ('𝼋', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞤢', '𞥃'), +]; + +pub const NUMERIC: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('٫', '٬'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; + +pub const OLETTER: &'static [(char, char)] = &[ + ('ƻ', 'ƻ'), + ('ǀ', 'ǃ'), + ('ʔ', 'ʔ'), + ('ʹ', 'ʿ'), + ('ˆ', 'ˑ'), + ('ˬ', 'ˬ'), + ('ˮ', 'ˮ'), + ('ʹ', 'ʹ'), + ('ՙ', 'ՙ'), + ('א', 'ת'), + ('ׯ', '׳'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ก', 'ะ'), + ('า', 'ำ'), + ('เ', 'ๆ'), + ('ກ', 'ຂ'), + ('ຄ', 'ຄ'), + ('ຆ', 'ຊ'), + ('ຌ', 'ຣ'), + ('ລ', 'ລ'), + ('ວ', 'ະ'), + ('າ', 'ຳ'), + ('ຽ', 'ຽ'), + ('ເ', 'ໄ'), + ('ໆ', 'ໆ'), + ('ໜ', 'ໟ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('က', 'ဪ'), + ('ဿ', 'ဿ'), + ('ၐ', 'ၕ'), + ('ၚ', 'ၝ'), + ('ၡ', 'ၡ'), + ('ၥ', 'ၦ'), + ('ၮ', 'ၰ'), + ('ၵ', 'ႁ'), + ('ႎ', 'ႎ'), + ('ა', 'ჺ'), + ('ჽ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ក', 'ឳ'), + ('ៗ', 'ៗ'), + ('ៜ', 'ៜ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᥐ', 'ᥭ'), + ('ᥰ', 'ᥴ'), + ('ᦀ', 'ᦫ'), + ('ᦰ', 'ᧉ'), + ('ᨀ', 'ᨖ'), + ('ᨠ', 'ᩔ'), + ('ᪧ', 'ᪧ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ℵ', 'ℸ'), + ('ↀ', 'ↂ'), + ('ↅ', 'ↈ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ⸯ', 'ⸯ'), + ('々', '〇'), + ('〡', '〩'), + ('〱', '〵'), + ('〸', '〼'), + ('ぁ', 'ゖ'), + ('ゝ', 'ゟ'), + ('ァ', 'ヺ'), + ('ー', 'ヿ'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ㇰ', 'ㇿ'), + ('㐀', '䶿'), + ('一', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('ꙮ', 'ꙮ'), + ('ꙿ', 'ꙿ'), + ('ꚠ', 'ꛯ'), + ('ꜗ', 'ꜟ'), + ('ꞈ', 'ꞈ'), + ('ꞏ', 'ꞏ'), + ('ꟷ', 'ꟷ'), + ('ꟻ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꧠ', 'ꧤ'), + ('ꧦ', 'ꧯ'), + ('ꧺ', 'ꧾ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꩠ', 'ꩶ'), + ('ꩺ', 'ꩺ'), + ('ꩾ', 'ꪯ'), + ('ꪱ', 'ꪱ'), + ('ꪵ', 'ꪶ'), + ('ꪹ', 'ꪽ'), + ('ꫀ', 'ꫀ'), + ('ꫂ', 'ꫂ'), + ('ꫛ', 'ꫝ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꯀ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('豈', '舘'), + ('並', '龎'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('ヲ', 'ン'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐑐', '𐒝'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞁', '𐞂'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑜀', '𑜚'), + ('𑝀', '𑝆'), + ('𑠀', '𑠫'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𗀀', '𘟷'), + ('𘠀', '𘳕'), + ('𘴀', '𘴈'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛄢'), + ('𛄲', '𛄲'), + ('𛅐', '𛅒'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), + ('𛅰', '𛋻'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝼊', '𝼊'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('𠀀', '𪛟'), + ('𪜀', '𫜹'), + ('𫝀', '𫠝'), + ('𫠠', '𬺡'), + ('𬺰', '𮯠'), + ('丽', '𪘀'), + ('𰀀', '𱍊'), + ('𱍐', '𲎯'), +]; + +pub const SCONTINUE: &'static [(char, char)] = &[ + (',', '-'), + (':', ':'), + ('՝', '՝'), + ('،', '؍'), + ('߸', '߸'), + ('᠂', '᠂'), + ('᠈', '᠈'), + ('–', '—'), + ('、', '、'), + ('︐', '︑'), + ('︓', '︓'), + ('︱', '︲'), + ('﹐', '﹑'), + ('﹕', '﹕'), + ('﹘', '﹘'), + ('﹣', '﹣'), + (',', '-'), + (':', ':'), + ('、', '、'), +]; + +pub const STERM: &'static [(char, char)] = &[ + ('!', '!'), + ('?', '?'), + ('։', '։'), + ('؝', '؟'), + ('۔', '۔'), + ('܀', '܂'), + ('߹', '߹'), + ('࠷', '࠷'), + ('࠹', '࠹'), + ('࠽', '࠾'), + ('।', '॥'), + ('၊', '။'), + ('።', '።'), + ('፧', '፨'), + ('᙮', '᙮'), + ('᜵', '᜶'), + ('᠃', '᠃'), + ('᠉', '᠉'), + ('᥄', '᥅'), + ('᪨', '᪫'), + ('᭚', '᭛'), + ('᭞', '᭟'), + ('᭽', '᭾'), + ('᰻', '᰼'), + ('᱾', '᱿'), + ('‼', '‽'), + ('⁇', '⁉'), + ('⸮', '⸮'), + ('⸼', '⸼'), + ('⹓', '⹔'), + ('。', '。'), + ('꓿', '꓿'), + ('꘎', '꘏'), + ('꛳', '꛳'), + ('꛷', '꛷'), + ('꡶', '꡷'), + ('꣎', '꣏'), + ('꤯', '꤯'), + ('꧈', '꧉'), + ('꩝', '꩟'), + ('꫰', '꫱'), + ('꯫', '꯫'), + ('﹖', '﹗'), + ('!', '!'), + ('?', '?'), + ('。', '。'), + ('𐩖', '𐩗'), + ('𐽕', '𐽙'), + ('𐾆', '𐾉'), + ('𑁇', '𑁈'), + ('𑂾', '𑃁'), + ('𑅁', '𑅃'), + ('𑇅', '𑇆'), + ('𑇍', '𑇍'), + ('𑇞', '𑇟'), + ('𑈸', '𑈹'), + ('𑈻', '𑈼'), + ('𑊩', '𑊩'), + ('𑑋', '𑑌'), + ('𑗂', '𑗃'), + ('𑗉', '𑗗'), + ('𑙁', '𑙂'), + ('𑜼', '𑜾'), + ('𑥄', '𑥄'), + ('𑥆', '𑥆'), + ('𑩂', '𑩃'), + ('𑪛', '𑪜'), + ('𑱁', '𑱂'), + ('𑻷', '𑻸'), + ('𑽃', '𑽄'), + ('𖩮', '𖩯'), + ('𖫵', '𖫵'), + ('𖬷', '𖬸'), + ('𖭄', '𖭄'), + ('𖺘', '𖺘'), + ('𛲟', '𛲟'), + ('𝪈', '𝪈'), +]; + +pub const SEP: &'static [(char, char)] = + &[('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; + +pub const SP: &'static [(char, char)] = &[ + ('\t', '\t'), + ('\u{b}', '\u{c}'), + (' ', ' '), + ('\u{a0}', '\u{a0}'), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{200a}'), + ('\u{202f}', '\u{202f}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const UPPER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('À', 'Ö'), + ('Ø', 'Þ'), + ('Ā', 'Ā'), + ('Ă', 'Ă'), + ('Ą', 'Ą'), + ('Ć', 'Ć'), + ('Ĉ', 'Ĉ'), + ('Ċ', 'Ċ'), + ('Č', 'Č'), + ('Ď', 'Ď'), + ('Đ', 'Đ'), + ('Ē', 'Ē'), + ('Ĕ', 'Ĕ'), + ('Ė', 'Ė'), + ('Ę', 'Ę'), + ('Ě', 'Ě'), + ('Ĝ', 'Ĝ'), + ('Ğ', 'Ğ'), + ('Ġ', 'Ġ'), + ('Ģ', 'Ģ'), + ('Ĥ', 'Ĥ'), + ('Ħ', 'Ħ'), + ('Ĩ', 'Ĩ'), + ('Ī', 'Ī'), + ('Ĭ', 'Ĭ'), + ('Į', 'Į'), + ('İ', 'İ'), + ('IJ', 'IJ'), + ('Ĵ', 'Ĵ'), + ('Ķ', 'Ķ'), + ('Ĺ', 'Ĺ'), + ('Ļ', 'Ļ'), + ('Ľ', 'Ľ'), + ('Ŀ', 'Ŀ'), + ('Ł', 'Ł'), + ('Ń', 'Ń'), + ('Ņ', 'Ņ'), + ('Ň', 'Ň'), + ('Ŋ', 'Ŋ'), + ('Ō', 'Ō'), + ('Ŏ', 'Ŏ'), + ('Ő', 'Ő'), + ('Œ', 'Œ'), + ('Ŕ', 'Ŕ'), + ('Ŗ', 'Ŗ'), + ('Ř', 'Ř'), + ('Ś', 'Ś'), + ('Ŝ', 'Ŝ'), + ('Ş', 'Ş'), + ('Š', 'Š'), + ('Ţ', 'Ţ'), + ('Ť', 'Ť'), + ('Ŧ', 'Ŧ'), + ('Ũ', 'Ũ'), + ('Ū', 'Ū'), + ('Ŭ', 'Ŭ'), + ('Ů', 'Ů'), + ('Ű', 'Ű'), + ('Ų', 'Ų'), + ('Ŵ', 'Ŵ'), + ('Ŷ', 'Ŷ'), + ('Ÿ', 'Ź'), + ('Ż', 'Ż'), + ('Ž', 'Ž'), + ('Ɓ', 'Ƃ'), + ('Ƅ', 'Ƅ'), + ('Ɔ', 'Ƈ'), + ('Ɖ', 'Ƌ'), + ('Ǝ', 'Ƒ'), + ('Ɠ', 'Ɣ'), + ('Ɩ', 'Ƙ'), + ('Ɯ', 'Ɲ'), + ('Ɵ', 'Ơ'), + ('Ƣ', 'Ƣ'), + ('Ƥ', 'Ƥ'), + ('Ʀ', 'Ƨ'), + ('Ʃ', 'Ʃ'), + ('Ƭ', 'Ƭ'), + ('Ʈ', 'Ư'), + ('Ʊ', 'Ƴ'), + ('Ƶ', 'Ƶ'), + ('Ʒ', 'Ƹ'), + ('Ƽ', 'Ƽ'), + ('DŽ', 'Dž'), + ('LJ', 'Lj'), + ('NJ', 'Nj'), + ('Ǎ', 'Ǎ'), + ('Ǐ', 'Ǐ'), + ('Ǒ', 'Ǒ'), + ('Ǔ', 'Ǔ'), + ('Ǖ', 'Ǖ'), + ('Ǘ', 'Ǘ'), + ('Ǚ', 'Ǚ'), + ('Ǜ', 'Ǜ'), + ('Ǟ', 'Ǟ'), + ('Ǡ', 'Ǡ'), + ('Ǣ', 'Ǣ'), + ('Ǥ', 'Ǥ'), + ('Ǧ', 'Ǧ'), + ('Ǩ', 'Ǩ'), + ('Ǫ', 'Ǫ'), + ('Ǭ', 'Ǭ'), + ('Ǯ', 'Ǯ'), + ('DZ', 'Dz'), + ('Ǵ', 'Ǵ'), + ('Ƕ', 'Ǹ'), + ('Ǻ', 'Ǻ'), + ('Ǽ', 'Ǽ'), + ('Ǿ', 'Ǿ'), + ('Ȁ', 'Ȁ'), + ('Ȃ', 'Ȃ'), + ('Ȅ', 'Ȅ'), + ('Ȇ', 'Ȇ'), + ('Ȉ', 'Ȉ'), + ('Ȋ', 'Ȋ'), + ('Ȍ', 'Ȍ'), + ('Ȏ', 'Ȏ'), + ('Ȑ', 'Ȑ'), + ('Ȓ', 'Ȓ'), + ('Ȕ', 'Ȕ'), + ('Ȗ', 'Ȗ'), + ('Ș', 'Ș'), + ('Ț', 'Ț'), + ('Ȝ', 'Ȝ'), + ('Ȟ', 'Ȟ'), + ('Ƞ', 'Ƞ'), + ('Ȣ', 'Ȣ'), + ('Ȥ', 'Ȥ'), + ('Ȧ', 'Ȧ'), + ('Ȩ', 'Ȩ'), + ('Ȫ', 'Ȫ'), + ('Ȭ', 'Ȭ'), + ('Ȯ', 'Ȯ'), + ('Ȱ', 'Ȱ'), + ('Ȳ', 'Ȳ'), + ('Ⱥ', 'Ȼ'), + ('Ƚ', 'Ⱦ'), + ('Ɂ', 'Ɂ'), + ('Ƀ', 'Ɇ'), + ('Ɉ', 'Ɉ'), + ('Ɋ', 'Ɋ'), + ('Ɍ', 'Ɍ'), + ('Ɏ', 'Ɏ'), + ('Ͱ', 'Ͱ'), + ('Ͳ', 'Ͳ'), + ('Ͷ', 'Ͷ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ώ'), + ('Α', 'Ρ'), + ('Σ', 'Ϋ'), + ('Ϗ', 'Ϗ'), + ('ϒ', 'ϔ'), + ('Ϙ', 'Ϙ'), + ('Ϛ', 'Ϛ'), + ('Ϝ', 'Ϝ'), + ('Ϟ', 'Ϟ'), + ('Ϡ', 'Ϡ'), + ('Ϣ', 'Ϣ'), + ('Ϥ', 'Ϥ'), + ('Ϧ', 'Ϧ'), + ('Ϩ', 'Ϩ'), + ('Ϫ', 'Ϫ'), + ('Ϭ', 'Ϭ'), + ('Ϯ', 'Ϯ'), + ('ϴ', 'ϴ'), + ('Ϸ', 'Ϸ'), + ('Ϲ', 'Ϻ'), + ('Ͻ', 'Я'), + ('Ѡ', 'Ѡ'), + ('Ѣ', 'Ѣ'), + ('Ѥ', 'Ѥ'), + ('Ѧ', 'Ѧ'), + ('Ѩ', 'Ѩ'), + ('Ѫ', 'Ѫ'), + ('Ѭ', 'Ѭ'), + ('Ѯ', 'Ѯ'), + ('Ѱ', 'Ѱ'), + ('Ѳ', 'Ѳ'), + ('Ѵ', 'Ѵ'), + ('Ѷ', 'Ѷ'), + ('Ѹ', 'Ѹ'), + ('Ѻ', 'Ѻ'), + ('Ѽ', 'Ѽ'), + ('Ѿ', 'Ѿ'), + ('Ҁ', 'Ҁ'), + ('Ҋ', 'Ҋ'), + ('Ҍ', 'Ҍ'), + ('Ҏ', 'Ҏ'), + ('Ґ', 'Ґ'), + ('Ғ', 'Ғ'), + ('Ҕ', 'Ҕ'), + ('Җ', 'Җ'), + ('Ҙ', 'Ҙ'), + ('Қ', 'Қ'), + ('Ҝ', 'Ҝ'), + ('Ҟ', 'Ҟ'), + ('Ҡ', 'Ҡ'), + ('Ң', 'Ң'), + ('Ҥ', 'Ҥ'), + ('Ҧ', 'Ҧ'), + ('Ҩ', 'Ҩ'), + ('Ҫ', 'Ҫ'), + ('Ҭ', 'Ҭ'), + ('Ү', 'Ү'), + ('Ұ', 'Ұ'), + ('Ҳ', 'Ҳ'), + ('Ҵ', 'Ҵ'), + ('Ҷ', 'Ҷ'), + ('Ҹ', 'Ҹ'), + ('Һ', 'Һ'), + ('Ҽ', 'Ҽ'), + ('Ҿ', 'Ҿ'), + ('Ӏ', 'Ӂ'), + ('Ӄ', 'Ӄ'), + ('Ӆ', 'Ӆ'), + ('Ӈ', 'Ӈ'), + ('Ӊ', 'Ӊ'), + ('Ӌ', 'Ӌ'), + ('Ӎ', 'Ӎ'), + ('Ӑ', 'Ӑ'), + ('Ӓ', 'Ӓ'), + ('Ӕ', 'Ӕ'), + ('Ӗ', 'Ӗ'), + ('Ә', 'Ә'), + ('Ӛ', 'Ӛ'), + ('Ӝ', 'Ӝ'), + ('Ӟ', 'Ӟ'), + ('Ӡ', 'Ӡ'), + ('Ӣ', 'Ӣ'), + ('Ӥ', 'Ӥ'), + ('Ӧ', 'Ӧ'), + ('Ө', 'Ө'), + ('Ӫ', 'Ӫ'), + ('Ӭ', 'Ӭ'), + ('Ӯ', 'Ӯ'), + ('Ӱ', 'Ӱ'), + ('Ӳ', 'Ӳ'), + ('Ӵ', 'Ӵ'), + ('Ӷ', 'Ӷ'), + ('Ӹ', 'Ӹ'), + ('Ӻ', 'Ӻ'), + ('Ӽ', 'Ӽ'), + ('Ӿ', 'Ӿ'), + ('Ԁ', 'Ԁ'), + ('Ԃ', 'Ԃ'), + ('Ԅ', 'Ԅ'), + ('Ԇ', 'Ԇ'), + ('Ԉ', 'Ԉ'), + ('Ԋ', 'Ԋ'), + ('Ԍ', 'Ԍ'), + ('Ԏ', 'Ԏ'), + ('Ԑ', 'Ԑ'), + ('Ԓ', 'Ԓ'), + ('Ԕ', 'Ԕ'), + ('Ԗ', 'Ԗ'), + ('Ԙ', 'Ԙ'), + ('Ԛ', 'Ԛ'), + ('Ԝ', 'Ԝ'), + ('Ԟ', 'Ԟ'), + ('Ԡ', 'Ԡ'), + ('Ԣ', 'Ԣ'), + ('Ԥ', 'Ԥ'), + ('Ԧ', 'Ԧ'), + ('Ԩ', 'Ԩ'), + ('Ԫ', 'Ԫ'), + ('Ԭ', 'Ԭ'), + ('Ԯ', 'Ԯ'), + ('Ա', 'Ֆ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('Ꭰ', 'Ᏽ'), + ('Ḁ', 'Ḁ'), + ('Ḃ', 'Ḃ'), + ('Ḅ', 'Ḅ'), + ('Ḇ', 'Ḇ'), + ('Ḉ', 'Ḉ'), + ('Ḋ', 'Ḋ'), + ('Ḍ', 'Ḍ'), + ('Ḏ', 'Ḏ'), + ('Ḑ', 'Ḑ'), + ('Ḓ', 'Ḓ'), + ('Ḕ', 'Ḕ'), + ('Ḗ', 'Ḗ'), + ('Ḙ', 'Ḙ'), + ('Ḛ', 'Ḛ'), + ('Ḝ', 'Ḝ'), + ('Ḟ', 'Ḟ'), + ('Ḡ', 'Ḡ'), + ('Ḣ', 'Ḣ'), + ('Ḥ', 'Ḥ'), + ('Ḧ', 'Ḧ'), + ('Ḩ', 'Ḩ'), + ('Ḫ', 'Ḫ'), + ('Ḭ', 'Ḭ'), + ('Ḯ', 'Ḯ'), + ('Ḱ', 'Ḱ'), + ('Ḳ', 'Ḳ'), + ('Ḵ', 'Ḵ'), + ('Ḷ', 'Ḷ'), + ('Ḹ', 'Ḹ'), + ('Ḻ', 'Ḻ'), + ('Ḽ', 'Ḽ'), + ('Ḿ', 'Ḿ'), + ('Ṁ', 'Ṁ'), + ('Ṃ', 'Ṃ'), + ('Ṅ', 'Ṅ'), + ('Ṇ', 'Ṇ'), + ('Ṉ', 'Ṉ'), + ('Ṋ', 'Ṋ'), + ('Ṍ', 'Ṍ'), + ('Ṏ', 'Ṏ'), + ('Ṑ', 'Ṑ'), + ('Ṓ', 'Ṓ'), + ('Ṕ', 'Ṕ'), + ('Ṗ', 'Ṗ'), + ('Ṙ', 'Ṙ'), + ('Ṛ', 'Ṛ'), + ('Ṝ', 'Ṝ'), + ('Ṟ', 'Ṟ'), + ('Ṡ', 'Ṡ'), + ('Ṣ', 'Ṣ'), + ('Ṥ', 'Ṥ'), + ('Ṧ', 'Ṧ'), + ('Ṩ', 'Ṩ'), + ('Ṫ', 'Ṫ'), + ('Ṭ', 'Ṭ'), + ('Ṯ', 'Ṯ'), + ('Ṱ', 'Ṱ'), + ('Ṳ', 'Ṳ'), + ('Ṵ', 'Ṵ'), + ('Ṷ', 'Ṷ'), + ('Ṹ', 'Ṹ'), + ('Ṻ', 'Ṻ'), + ('Ṽ', 'Ṽ'), + ('Ṿ', 'Ṿ'), + ('Ẁ', 'Ẁ'), + ('Ẃ', 'Ẃ'), + ('Ẅ', 'Ẅ'), + ('Ẇ', 'Ẇ'), + ('Ẉ', 'Ẉ'), + ('Ẋ', 'Ẋ'), + ('Ẍ', 'Ẍ'), + ('Ẏ', 'Ẏ'), + ('Ẑ', 'Ẑ'), + ('Ẓ', 'Ẓ'), + ('Ẕ', 'Ẕ'), + ('ẞ', 'ẞ'), + ('Ạ', 'Ạ'), + ('Ả', 'Ả'), + ('Ấ', 'Ấ'), + ('Ầ', 'Ầ'), + ('Ẩ', 'Ẩ'), + ('Ẫ', 'Ẫ'), + ('Ậ', 'Ậ'), + ('Ắ', 'Ắ'), + ('Ằ', 'Ằ'), + ('Ẳ', 'Ẳ'), + ('Ẵ', 'Ẵ'), + ('Ặ', 'Ặ'), + ('Ẹ', 'Ẹ'), + ('Ẻ', 'Ẻ'), + ('Ẽ', 'Ẽ'), + ('Ế', 'Ế'), + ('Ề', 'Ề'), + ('Ể', 'Ể'), + ('Ễ', 'Ễ'), + ('Ệ', 'Ệ'), + ('Ỉ', 'Ỉ'), + ('Ị', 'Ị'), + ('Ọ', 'Ọ'), + ('Ỏ', 'Ỏ'), + ('Ố', 'Ố'), + ('Ồ', 'Ồ'), + ('Ổ', 'Ổ'), + ('Ỗ', 'Ỗ'), + ('Ộ', 'Ộ'), + ('Ớ', 'Ớ'), + ('Ờ', 'Ờ'), + ('Ở', 'Ở'), + ('Ỡ', 'Ỡ'), + ('Ợ', 'Ợ'), + ('Ụ', 'Ụ'), + ('Ủ', 'Ủ'), + ('Ứ', 'Ứ'), + ('Ừ', 'Ừ'), + ('Ử', 'Ử'), + ('Ữ', 'Ữ'), + ('Ự', 'Ự'), + ('Ỳ', 'Ỳ'), + ('Ỵ', 'Ỵ'), + ('Ỷ', 'Ỷ'), + ('Ỹ', 'Ỹ'), + ('Ỻ', 'Ỻ'), + ('Ỽ', 'Ỽ'), + ('Ỿ', 'Ỿ'), + ('Ἀ', 'Ἇ'), + ('Ἐ', 'Ἕ'), + ('Ἠ', 'Ἧ'), + ('Ἰ', 'Ἷ'), + ('Ὀ', 'Ὅ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'Ὗ'), + ('Ὠ', 'Ὧ'), + ('ᾈ', 'ᾏ'), + ('ᾘ', 'ᾟ'), + ('ᾨ', 'ᾯ'), + ('Ᾰ', 'ᾼ'), + ('Ὲ', 'ῌ'), + ('Ῐ', 'Ί'), + ('Ῠ', 'Ῥ'), + ('Ὸ', 'ῼ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℋ', 'ℍ'), + ('ℐ', 'ℒ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℰ', 'ℳ'), + ('ℾ', 'ℿ'), + ('ⅅ', 'ⅅ'), + ('Ⅰ', 'Ⅿ'), + ('Ↄ', 'Ↄ'), + ('Ⓐ', 'Ⓩ'), + ('Ⰰ', 'Ⱟ'), + ('Ⱡ', 'Ⱡ'), + ('Ɫ', 'Ɽ'), + ('Ⱨ', 'Ⱨ'), + ('Ⱪ', 'Ⱪ'), + ('Ⱬ', 'Ⱬ'), + ('Ɑ', 'Ɒ'), + ('Ⱳ', 'Ⱳ'), + ('Ⱶ', 'Ⱶ'), + ('Ȿ', 'Ⲁ'), + ('Ⲃ', 'Ⲃ'), + ('Ⲅ', 'Ⲅ'), + ('Ⲇ', 'Ⲇ'), + ('Ⲉ', 'Ⲉ'), + ('Ⲋ', 'Ⲋ'), + ('Ⲍ', 'Ⲍ'), + ('Ⲏ', 'Ⲏ'), + ('Ⲑ', 'Ⲑ'), + ('Ⲓ', 'Ⲓ'), + ('Ⲕ', 'Ⲕ'), + ('Ⲗ', 'Ⲗ'), + ('Ⲙ', 'Ⲙ'), + ('Ⲛ', 'Ⲛ'), + ('Ⲝ', 'Ⲝ'), + ('Ⲟ', 'Ⲟ'), + ('Ⲡ', 'Ⲡ'), + ('Ⲣ', 'Ⲣ'), + ('Ⲥ', 'Ⲥ'), + ('Ⲧ', 'Ⲧ'), + ('Ⲩ', 'Ⲩ'), + ('Ⲫ', 'Ⲫ'), + ('Ⲭ', 'Ⲭ'), + ('Ⲯ', 'Ⲯ'), + ('Ⲱ', 'Ⲱ'), + ('Ⲳ', 'Ⲳ'), + ('Ⲵ', 'Ⲵ'), + ('Ⲷ', 'Ⲷ'), + ('Ⲹ', 'Ⲹ'), + ('Ⲻ', 'Ⲻ'), + ('Ⲽ', 'Ⲽ'), + ('Ⲿ', 'Ⲿ'), + ('Ⳁ', 'Ⳁ'), + ('Ⳃ', 'Ⳃ'), + ('Ⳅ', 'Ⳅ'), + ('Ⳇ', 'Ⳇ'), + ('Ⳉ', 'Ⳉ'), + ('Ⳋ', 'Ⳋ'), + ('Ⳍ', 'Ⳍ'), + ('Ⳏ', 'Ⳏ'), + ('Ⳑ', 'Ⳑ'), + ('Ⳓ', 'Ⳓ'), + ('Ⳕ', 'Ⳕ'), + ('Ⳗ', 'Ⳗ'), + ('Ⳙ', 'Ⳙ'), + ('Ⳛ', 'Ⳛ'), + ('Ⳝ', 'Ⳝ'), + ('Ⳟ', 'Ⳟ'), + ('Ⳡ', 'Ⳡ'), + ('Ⳣ', 'Ⳣ'), + ('Ⳬ', 'Ⳬ'), + ('Ⳮ', 'Ⳮ'), + ('Ⳳ', 'Ⳳ'), + ('Ꙁ', 'Ꙁ'), + ('Ꙃ', 'Ꙃ'), + ('Ꙅ', 'Ꙅ'), + ('Ꙇ', 'Ꙇ'), + ('Ꙉ', 'Ꙉ'), + ('Ꙋ', 'Ꙋ'), + ('Ꙍ', 'Ꙍ'), + ('Ꙏ', 'Ꙏ'), + ('Ꙑ', 'Ꙑ'), + ('Ꙓ', 'Ꙓ'), + ('Ꙕ', 'Ꙕ'), + ('Ꙗ', 'Ꙗ'), + ('Ꙙ', 'Ꙙ'), + ('Ꙛ', 'Ꙛ'), + ('Ꙝ', 'Ꙝ'), + ('Ꙟ', 'Ꙟ'), + ('Ꙡ', 'Ꙡ'), + ('Ꙣ', 'Ꙣ'), + ('Ꙥ', 'Ꙥ'), + ('Ꙧ', 'Ꙧ'), + ('Ꙩ', 'Ꙩ'), + ('Ꙫ', 'Ꙫ'), + ('Ꙭ', 'Ꙭ'), + ('Ꚁ', 'Ꚁ'), + ('Ꚃ', 'Ꚃ'), + ('Ꚅ', 'Ꚅ'), + ('Ꚇ', 'Ꚇ'), + ('Ꚉ', 'Ꚉ'), + ('Ꚋ', 'Ꚋ'), + ('Ꚍ', 'Ꚍ'), + ('Ꚏ', 'Ꚏ'), + ('Ꚑ', 'Ꚑ'), + ('Ꚓ', 'Ꚓ'), + ('Ꚕ', 'Ꚕ'), + ('Ꚗ', 'Ꚗ'), + ('Ꚙ', 'Ꚙ'), + ('Ꚛ', 'Ꚛ'), + ('Ꜣ', 'Ꜣ'), + ('Ꜥ', 'Ꜥ'), + ('Ꜧ', 'Ꜧ'), + ('Ꜩ', 'Ꜩ'), + ('Ꜫ', 'Ꜫ'), + ('Ꜭ', 'Ꜭ'), + ('Ꜯ', 'Ꜯ'), + ('Ꜳ', 'Ꜳ'), + ('Ꜵ', 'Ꜵ'), + ('Ꜷ', 'Ꜷ'), + ('Ꜹ', 'Ꜹ'), + ('Ꜻ', 'Ꜻ'), + ('Ꜽ', 'Ꜽ'), + ('Ꜿ', 'Ꜿ'), + ('Ꝁ', 'Ꝁ'), + ('Ꝃ', 'Ꝃ'), + ('Ꝅ', 'Ꝅ'), + ('Ꝇ', 'Ꝇ'), + ('Ꝉ', 'Ꝉ'), + ('Ꝋ', 'Ꝋ'), + ('Ꝍ', 'Ꝍ'), + ('Ꝏ', 'Ꝏ'), + ('Ꝑ', 'Ꝑ'), + ('Ꝓ', 'Ꝓ'), + ('Ꝕ', 'Ꝕ'), + ('Ꝗ', 'Ꝗ'), + ('Ꝙ', 'Ꝙ'), + ('Ꝛ', 'Ꝛ'), + ('Ꝝ', 'Ꝝ'), + ('Ꝟ', 'Ꝟ'), + ('Ꝡ', 'Ꝡ'), + ('Ꝣ', 'Ꝣ'), + ('Ꝥ', 'Ꝥ'), + ('Ꝧ', 'Ꝧ'), + ('Ꝩ', 'Ꝩ'), + ('Ꝫ', 'Ꝫ'), + ('Ꝭ', 'Ꝭ'), + ('Ꝯ', 'Ꝯ'), + ('Ꝺ', 'Ꝺ'), + ('Ꝼ', 'Ꝼ'), + ('Ᵹ', 'Ꝿ'), + ('Ꞁ', 'Ꞁ'), + ('Ꞃ', 'Ꞃ'), + ('Ꞅ', 'Ꞅ'), + ('Ꞇ', 'Ꞇ'), + ('Ꞌ', 'Ꞌ'), + ('Ɥ', 'Ɥ'), + ('Ꞑ', 'Ꞑ'), + ('Ꞓ', 'Ꞓ'), + ('Ꞗ', 'Ꞗ'), + ('Ꞙ', 'Ꞙ'), + ('Ꞛ', 'Ꞛ'), + ('Ꞝ', 'Ꞝ'), + ('Ꞟ', 'Ꞟ'), + ('Ꞡ', 'Ꞡ'), + ('Ꞣ', 'Ꞣ'), + ('Ꞥ', 'Ꞥ'), + ('Ꞧ', 'Ꞧ'), + ('Ꞩ', 'Ꞩ'), + ('Ɦ', 'Ɪ'), + ('Ʞ', 'Ꞵ'), + ('Ꞷ', 'Ꞷ'), + ('Ꞹ', 'Ꞹ'), + ('Ꞻ', 'Ꞻ'), + ('Ꞽ', 'Ꞽ'), + ('Ꞿ', 'Ꞿ'), + ('Ꟁ', 'Ꟁ'), + ('Ꟃ', 'Ꟃ'), + ('Ꞔ', 'Ꟈ'), + ('Ꟊ', 'Ꟊ'), + ('Ꟑ', 'Ꟑ'), + ('Ꟗ', 'Ꟗ'), + ('Ꟙ', 'Ꟙ'), + ('Ꟶ', 'Ꟶ'), + ('A', 'Z'), + ('𐐀', '𐐧'), + ('𐒰', '𐓓'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐲀', '𐲲'), + ('𑢠', '𑢿'), + ('𖹀', '𖹟'), + ('𝐀', '𝐙'), + ('𝐴', '𝑍'), + ('𝑨', '𝒁'), + ('𝒜', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒵'), + ('𝓐', '𝓩'), + ('𝔄', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔸', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕬', '𝖅'), + ('𝖠', '𝖹'), + ('𝗔', '𝗭'), + ('𝘈', '𝘡'), + ('𝘼', '𝙕'), + ('𝙰', '𝚉'), + ('𝚨', '𝛀'), + ('𝛢', '𝛺'), + ('𝜜', '𝜴'), + ('𝝖', '𝝮'), + ('𝞐', '𝞨'), + ('𝟊', '𝟊'), + ('𞤀', '𞤡'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; diff --git a/vendor/regex-syntax/src/unicode_tables/word_break.rs b/vendor/regex-syntax/src/unicode_tables/word_break.rs new file mode 100644 index 0000000000..c0714956fe --- /dev/null +++ b/vendor/regex-syntax/src/unicode_tables/word_break.rs @@ -0,0 +1,1120 @@ +// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: +// +// ucd-generate word-break ucd-15.0.0 --chars +// +// Unicode version: 15.0.0. +// +// ucd-generate 0.2.14 is available on crates.io. + +pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ + ("ALetter", ALETTER), + ("CR", CR), + ("Double_Quote", DOUBLE_QUOTE), + ("Extend", EXTEND), + ("ExtendNumLet", EXTENDNUMLET), + ("Format", FORMAT), + ("Hebrew_Letter", HEBREW_LETTER), + ("Katakana", KATAKANA), + ("LF", LF), + ("MidLetter", MIDLETTER), + ("MidNum", MIDNUM), + ("MidNumLet", MIDNUMLET), + ("Newline", NEWLINE), + ("Numeric", NUMERIC), + ("Regional_Indicator", REGIONAL_INDICATOR), + ("Single_Quote", SINGLE_QUOTE), + ("WSegSpace", WSEGSPACE), + ("ZWJ", ZWJ), +]; + +pub const ALETTER: &'static [(char, char)] = &[ + ('A', 'Z'), + ('a', 'z'), + ('ª', 'ª'), + ('µ', 'µ'), + ('º', 'º'), + ('À', 'Ö'), + ('Ø', 'ö'), + ('ø', '˗'), + ('˞', '˿'), + ('Ͱ', 'ʹ'), + ('Ͷ', 'ͷ'), + ('ͺ', 'ͽ'), + ('Ϳ', 'Ϳ'), + ('Ά', 'Ά'), + ('Έ', 'Ί'), + ('Ό', 'Ό'), + ('Ύ', 'Ρ'), + ('Σ', 'ϵ'), + ('Ϸ', 'ҁ'), + ('Ҋ', 'ԯ'), + ('Ա', 'Ֆ'), + ('ՙ', '՜'), + ('՞', '՞'), + ('ՠ', 'ֈ'), + ('֊', '֊'), + ('׳', '׳'), + ('ؠ', 'ي'), + ('ٮ', 'ٯ'), + ('ٱ', 'ۓ'), + ('ە', 'ە'), + ('ۥ', 'ۦ'), + ('ۮ', 'ۯ'), + ('ۺ', 'ۼ'), + ('ۿ', 'ۿ'), + ('ܐ', 'ܐ'), + ('ܒ', 'ܯ'), + ('ݍ', 'ޥ'), + ('ޱ', 'ޱ'), + ('ߊ', 'ߪ'), + ('ߴ', 'ߵ'), + ('ߺ', 'ߺ'), + ('ࠀ', 'ࠕ'), + ('ࠚ', 'ࠚ'), + ('ࠤ', 'ࠤ'), + ('ࠨ', 'ࠨ'), + ('ࡀ', 'ࡘ'), + ('ࡠ', 'ࡪ'), + ('ࡰ', 'ࢇ'), + ('ࢉ', 'ࢎ'), + ('ࢠ', 'ࣉ'), + ('ऄ', 'ह'), + ('ऽ', 'ऽ'), + ('ॐ', 'ॐ'), + ('क़', 'ॡ'), + ('ॱ', 'ঀ'), + ('অ', 'ঌ'), + ('এ', 'ঐ'), + ('ও', 'ন'), + ('প', 'র'), + ('ল', 'ল'), + ('শ', 'হ'), + ('ঽ', 'ঽ'), + ('ৎ', 'ৎ'), + ('ড়', 'ঢ়'), + ('য়', 'ৡ'), + ('ৰ', 'ৱ'), + ('ৼ', 'ৼ'), + ('ਅ', 'ਊ'), + ('ਏ', 'ਐ'), + ('ਓ', 'ਨ'), + ('ਪ', 'ਰ'), + ('ਲ', 'ਲ਼'), + ('ਵ', 'ਸ਼'), + ('ਸ', 'ਹ'), + ('ਖ਼', 'ੜ'), + ('ਫ਼', 'ਫ਼'), + ('ੲ', 'ੴ'), + ('અ', 'ઍ'), + ('એ', 'ઑ'), + ('ઓ', 'ન'), + ('પ', 'ર'), + ('લ', 'ળ'), + ('વ', 'હ'), + ('ઽ', 'ઽ'), + ('ૐ', 'ૐ'), + ('ૠ', 'ૡ'), + ('ૹ', 'ૹ'), + ('ଅ', 'ଌ'), + ('ଏ', 'ଐ'), + ('ଓ', 'ନ'), + ('ପ', 'ର'), + ('ଲ', 'ଳ'), + ('ଵ', 'ହ'), + ('ଽ', 'ଽ'), + ('ଡ଼', 'ଢ଼'), + ('ୟ', 'ୡ'), + ('ୱ', 'ୱ'), + ('ஃ', 'ஃ'), + ('அ', 'ஊ'), + ('எ', 'ஐ'), + ('ஒ', 'க'), + ('ங', 'ச'), + ('ஜ', 'ஜ'), + ('ஞ', 'ட'), + ('ண', 'த'), + ('ந', 'ப'), + ('ம', 'ஹ'), + ('ௐ', 'ௐ'), + ('అ', 'ఌ'), + ('ఎ', 'ఐ'), + ('ఒ', 'న'), + ('ప', 'హ'), + ('ఽ', 'ఽ'), + ('ౘ', 'ౚ'), + ('ౝ', 'ౝ'), + ('ౠ', 'ౡ'), + ('ಀ', 'ಀ'), + ('ಅ', 'ಌ'), + ('ಎ', 'ಐ'), + ('ಒ', 'ನ'), + ('ಪ', 'ಳ'), + ('ವ', 'ಹ'), + ('ಽ', 'ಽ'), + ('ೝ', 'ೞ'), + ('ೠ', 'ೡ'), + ('ೱ', 'ೲ'), + ('ഄ', 'ഌ'), + ('എ', 'ഐ'), + ('ഒ', 'ഺ'), + ('ഽ', 'ഽ'), + ('ൎ', 'ൎ'), + ('ൔ', 'ൖ'), + ('ൟ', 'ൡ'), + ('ൺ', 'ൿ'), + ('අ', 'ඖ'), + ('ක', 'න'), + ('ඳ', 'ර'), + ('ල', 'ල'), + ('ව', 'ෆ'), + ('ༀ', 'ༀ'), + ('ཀ', 'ཇ'), + ('ཉ', 'ཬ'), + ('ྈ', 'ྌ'), + ('Ⴀ', 'Ⴥ'), + ('Ⴧ', 'Ⴧ'), + ('Ⴭ', 'Ⴭ'), + ('ა', 'ჺ'), + ('ჼ', 'ቈ'), + ('ቊ', 'ቍ'), + ('ቐ', 'ቖ'), + ('ቘ', 'ቘ'), + ('ቚ', 'ቝ'), + ('በ', 'ኈ'), + ('ኊ', 'ኍ'), + ('ነ', 'ኰ'), + ('ኲ', 'ኵ'), + ('ኸ', 'ኾ'), + ('ዀ', 'ዀ'), + ('ዂ', 'ዅ'), + ('ወ', 'ዖ'), + ('ዘ', 'ጐ'), + ('ጒ', 'ጕ'), + ('ጘ', 'ፚ'), + ('ᎀ', 'ᎏ'), + ('Ꭰ', 'Ᏽ'), + ('ᏸ', 'ᏽ'), + ('ᐁ', 'ᙬ'), + ('ᙯ', 'ᙿ'), + ('ᚁ', 'ᚚ'), + ('ᚠ', 'ᛪ'), + ('ᛮ', 'ᛸ'), + ('ᜀ', 'ᜑ'), + ('ᜟ', 'ᜱ'), + ('ᝀ', 'ᝑ'), + ('ᝠ', 'ᝬ'), + ('ᝮ', 'ᝰ'), + ('ᠠ', 'ᡸ'), + ('ᢀ', 'ᢄ'), + ('ᢇ', 'ᢨ'), + ('ᢪ', 'ᢪ'), + ('ᢰ', 'ᣵ'), + ('ᤀ', 'ᤞ'), + ('ᨀ', 'ᨖ'), + ('ᬅ', 'ᬳ'), + ('ᭅ', 'ᭌ'), + ('ᮃ', 'ᮠ'), + ('ᮮ', 'ᮯ'), + ('ᮺ', 'ᯥ'), + ('ᰀ', 'ᰣ'), + ('ᱍ', 'ᱏ'), + ('ᱚ', 'ᱽ'), + ('ᲀ', 'ᲈ'), + ('Ა', 'Ჺ'), + ('Ჽ', 'Ჿ'), + ('ᳩ', 'ᳬ'), + ('ᳮ', 'ᳳ'), + ('ᳵ', 'ᳶ'), + ('ᳺ', 'ᳺ'), + ('ᴀ', 'ᶿ'), + ('Ḁ', 'ἕ'), + ('Ἐ', 'Ἕ'), + ('ἠ', 'ὅ'), + ('Ὀ', 'Ὅ'), + ('ὐ', 'ὗ'), + ('Ὑ', 'Ὑ'), + ('Ὓ', 'Ὓ'), + ('Ὕ', 'Ὕ'), + ('Ὗ', 'ώ'), + ('ᾀ', 'ᾴ'), + ('ᾶ', 'ᾼ'), + ('ι', 'ι'), + ('ῂ', 'ῄ'), + ('ῆ', 'ῌ'), + ('ῐ', 'ΐ'), + ('ῖ', 'Ί'), + ('ῠ', 'Ῥ'), + ('ῲ', 'ῴ'), + ('ῶ', 'ῼ'), + ('ⁱ', 'ⁱ'), + ('ⁿ', 'ⁿ'), + ('ₐ', 'ₜ'), + ('ℂ', 'ℂ'), + ('ℇ', 'ℇ'), + ('ℊ', 'ℓ'), + ('ℕ', 'ℕ'), + ('ℙ', 'ℝ'), + ('ℤ', 'ℤ'), + ('Ω', 'Ω'), + ('ℨ', 'ℨ'), + ('K', 'ℭ'), + ('ℯ', 'ℹ'), + ('ℼ', 'ℿ'), + ('ⅅ', 'ⅉ'), + ('ⅎ', 'ⅎ'), + ('Ⅰ', 'ↈ'), + ('Ⓐ', 'ⓩ'), + ('Ⰰ', 'ⳤ'), + ('Ⳬ', 'ⳮ'), + ('Ⳳ', 'ⳳ'), + ('ⴀ', 'ⴥ'), + ('ⴧ', 'ⴧ'), + ('ⴭ', 'ⴭ'), + ('ⴰ', 'ⵧ'), + ('ⵯ', 'ⵯ'), + ('ⶀ', 'ⶖ'), + ('ⶠ', 'ⶦ'), + ('ⶨ', 'ⶮ'), + ('ⶰ', 'ⶶ'), + ('ⶸ', 'ⶾ'), + ('ⷀ', 'ⷆ'), + ('ⷈ', 'ⷎ'), + ('ⷐ', 'ⷖ'), + ('ⷘ', 'ⷞ'), + ('ⸯ', 'ⸯ'), + ('々', '々'), + ('〻', '〼'), + ('ㄅ', 'ㄯ'), + ('ㄱ', 'ㆎ'), + ('ㆠ', 'ㆿ'), + ('ꀀ', 'ꒌ'), + ('ꓐ', 'ꓽ'), + ('ꔀ', 'ꘌ'), + ('ꘐ', 'ꘟ'), + ('ꘪ', 'ꘫ'), + ('Ꙁ', 'ꙮ'), + ('ꙿ', 'ꚝ'), + ('ꚠ', 'ꛯ'), + ('꜈', 'ꟊ'), + ('Ꟑ', 'ꟑ'), + ('ꟓ', 'ꟓ'), + ('ꟕ', 'ꟙ'), + ('ꟲ', 'ꠁ'), + ('ꠃ', 'ꠅ'), + ('ꠇ', 'ꠊ'), + ('ꠌ', 'ꠢ'), + ('ꡀ', 'ꡳ'), + ('ꢂ', 'ꢳ'), + ('ꣲ', 'ꣷ'), + ('ꣻ', 'ꣻ'), + ('ꣽ', 'ꣾ'), + ('ꤊ', 'ꤥ'), + ('ꤰ', 'ꥆ'), + ('ꥠ', 'ꥼ'), + ('ꦄ', 'ꦲ'), + ('ꧏ', 'ꧏ'), + ('ꨀ', 'ꨨ'), + ('ꩀ', 'ꩂ'), + ('ꩄ', 'ꩋ'), + ('ꫠ', 'ꫪ'), + ('ꫲ', 'ꫴ'), + ('ꬁ', 'ꬆ'), + ('ꬉ', 'ꬎ'), + ('ꬑ', 'ꬖ'), + ('ꬠ', 'ꬦ'), + ('ꬨ', 'ꬮ'), + ('ꬰ', 'ꭩ'), + ('ꭰ', 'ꯢ'), + ('가', '힣'), + ('ힰ', 'ퟆ'), + ('ퟋ', 'ퟻ'), + ('ff', 'st'), + ('ﬓ', 'ﬗ'), + ('ﭐ', 'ﮱ'), + ('ﯓ', 'ﴽ'), + ('ﵐ', 'ﶏ'), + ('ﶒ', 'ﷇ'), + ('ﷰ', 'ﷻ'), + ('ﹰ', 'ﹴ'), + ('ﹶ', 'ﻼ'), + ('A', 'Z'), + ('a', 'z'), + ('ᅠ', 'ᄒ'), + ('ᅡ', 'ᅦ'), + ('ᅧ', 'ᅬ'), + ('ᅭ', 'ᅲ'), + ('ᅳ', 'ᅵ'), + ('𐀀', '𐀋'), + ('𐀍', '𐀦'), + ('𐀨', '𐀺'), + ('𐀼', '𐀽'), + ('𐀿', '𐁍'), + ('𐁐', '𐁝'), + ('𐂀', '𐃺'), + ('𐅀', '𐅴'), + ('𐊀', '𐊜'), + ('𐊠', '𐋐'), + ('𐌀', '𐌟'), + ('𐌭', '𐍊'), + ('𐍐', '𐍵'), + ('𐎀', '𐎝'), + ('𐎠', '𐏃'), + ('𐏈', '𐏏'), + ('𐏑', '𐏕'), + ('𐐀', '𐒝'), + ('𐒰', '𐓓'), + ('𐓘', '𐓻'), + ('𐔀', '𐔧'), + ('𐔰', '𐕣'), + ('𐕰', '𐕺'), + ('𐕼', '𐖊'), + ('𐖌', '𐖒'), + ('𐖔', '𐖕'), + ('𐖗', '𐖡'), + ('𐖣', '𐖱'), + ('𐖳', '𐖹'), + ('𐖻', '𐖼'), + ('𐘀', '𐜶'), + ('𐝀', '𐝕'), + ('𐝠', '𐝧'), + ('𐞀', '𐞅'), + ('𐞇', '𐞰'), + ('𐞲', '𐞺'), + ('𐠀', '𐠅'), + ('𐠈', '𐠈'), + ('𐠊', '𐠵'), + ('𐠷', '𐠸'), + ('𐠼', '𐠼'), + ('𐠿', '𐡕'), + ('𐡠', '𐡶'), + ('𐢀', '𐢞'), + ('𐣠', '𐣲'), + ('𐣴', '𐣵'), + ('𐤀', '𐤕'), + ('𐤠', '𐤹'), + ('𐦀', '𐦷'), + ('𐦾', '𐦿'), + ('𐨀', '𐨀'), + ('𐨐', '𐨓'), + ('𐨕', '𐨗'), + ('𐨙', '𐨵'), + ('𐩠', '𐩼'), + ('𐪀', '𐪜'), + ('𐫀', '𐫇'), + ('𐫉', '𐫤'), + ('𐬀', '𐬵'), + ('𐭀', '𐭕'), + ('𐭠', '𐭲'), + ('𐮀', '𐮑'), + ('𐰀', '𐱈'), + ('𐲀', '𐲲'), + ('𐳀', '𐳲'), + ('𐴀', '𐴣'), + ('𐺀', '𐺩'), + ('𐺰', '𐺱'), + ('𐼀', '𐼜'), + ('𐼧', '𐼧'), + ('𐼰', '𐽅'), + ('𐽰', '𐾁'), + ('𐾰', '𐿄'), + ('𐿠', '𐿶'), + ('𑀃', '𑀷'), + ('𑁱', '𑁲'), + ('𑁵', '𑁵'), + ('𑂃', '𑂯'), + ('𑃐', '𑃨'), + ('𑄃', '𑄦'), + ('𑅄', '𑅄'), + ('𑅇', '𑅇'), + ('𑅐', '𑅲'), + ('𑅶', '𑅶'), + ('𑆃', '𑆲'), + ('𑇁', '𑇄'), + ('𑇚', '𑇚'), + ('𑇜', '𑇜'), + ('𑈀', '𑈑'), + ('𑈓', '𑈫'), + ('𑈿', '𑉀'), + ('𑊀', '𑊆'), + ('𑊈', '𑊈'), + ('𑊊', '𑊍'), + ('𑊏', '𑊝'), + ('𑊟', '𑊨'), + ('𑊰', '𑋞'), + ('𑌅', '𑌌'), + ('𑌏', '𑌐'), + ('𑌓', '𑌨'), + ('𑌪', '𑌰'), + ('𑌲', '𑌳'), + ('𑌵', '𑌹'), + ('𑌽', '𑌽'), + ('𑍐', '𑍐'), + ('𑍝', '𑍡'), + ('𑐀', '𑐴'), + ('𑑇', '𑑊'), + ('𑑟', '𑑡'), + ('𑒀', '𑒯'), + ('𑓄', '𑓅'), + ('𑓇', '𑓇'), + ('𑖀', '𑖮'), + ('𑗘', '𑗛'), + ('𑘀', '𑘯'), + ('𑙄', '𑙄'), + ('𑚀', '𑚪'), + ('𑚸', '𑚸'), + ('𑠀', '𑠫'), + ('𑢠', '𑣟'), + ('𑣿', '𑤆'), + ('𑤉', '𑤉'), + ('𑤌', '𑤓'), + ('𑤕', '𑤖'), + ('𑤘', '𑤯'), + ('𑤿', '𑤿'), + ('𑥁', '𑥁'), + ('𑦠', '𑦧'), + ('𑦪', '𑧐'), + ('𑧡', '𑧡'), + ('𑧣', '𑧣'), + ('𑨀', '𑨀'), + ('𑨋', '𑨲'), + ('𑨺', '𑨺'), + ('𑩐', '𑩐'), + ('𑩜', '𑪉'), + ('𑪝', '𑪝'), + ('𑪰', '𑫸'), + ('𑰀', '𑰈'), + ('𑰊', '𑰮'), + ('𑱀', '𑱀'), + ('𑱲', '𑲏'), + ('𑴀', '𑴆'), + ('𑴈', '𑴉'), + ('𑴋', '𑴰'), + ('𑵆', '𑵆'), + ('𑵠', '𑵥'), + ('𑵧', '𑵨'), + ('𑵪', '𑶉'), + ('𑶘', '𑶘'), + ('𑻠', '𑻲'), + ('𑼂', '𑼂'), + ('𑼄', '𑼐'), + ('𑼒', '𑼳'), + ('𑾰', '𑾰'), + ('𒀀', '𒎙'), + ('𒐀', '𒑮'), + ('𒒀', '𒕃'), + ('𒾐', '𒿰'), + ('𓀀', '𓐯'), + ('𓑁', '𓑆'), + ('𔐀', '𔙆'), + ('𖠀', '𖨸'), + ('𖩀', '𖩞'), + ('𖩰', '𖪾'), + ('𖫐', '𖫭'), + ('𖬀', '𖬯'), + ('𖭀', '𖭃'), + ('𖭣', '𖭷'), + ('𖭽', '𖮏'), + ('𖹀', '𖹿'), + ('𖼀', '𖽊'), + ('𖽐', '𖽐'), + ('𖾓', '𖾟'), + ('𖿠', '𖿡'), + ('𖿣', '𖿣'), + ('𛰀', '𛱪'), + ('𛱰', '𛱼'), + ('𛲀', '𛲈'), + ('𛲐', '𛲙'), + ('𝐀', '𝑔'), + ('𝑖', '𝒜'), + ('𝒞', '𝒟'), + ('𝒢', '𝒢'), + ('𝒥', '𝒦'), + ('𝒩', '𝒬'), + ('𝒮', '𝒹'), + ('𝒻', '𝒻'), + ('𝒽', '𝓃'), + ('𝓅', '𝔅'), + ('𝔇', '𝔊'), + ('𝔍', '𝔔'), + ('𝔖', '𝔜'), + ('𝔞', '𝔹'), + ('𝔻', '𝔾'), + ('𝕀', '𝕄'), + ('𝕆', '𝕆'), + ('𝕊', '𝕐'), + ('𝕒', '𝚥'), + ('𝚨', '𝛀'), + ('𝛂', '𝛚'), + ('𝛜', '𝛺'), + ('𝛼', '𝜔'), + ('𝜖', '𝜴'), + ('𝜶', '𝝎'), + ('𝝐', '𝝮'), + ('𝝰', '𝞈'), + ('𝞊', '𝞨'), + ('𝞪', '𝟂'), + ('𝟄', '𝟋'), + ('𝼀', '𝼞'), + ('𝼥', '𝼪'), + ('𞀰', '𞁭'), + ('𞄀', '𞄬'), + ('𞄷', '𞄽'), + ('𞅎', '𞅎'), + ('𞊐', '𞊭'), + ('𞋀', '𞋫'), + ('𞓐', '𞓫'), + ('𞟠', '𞟦'), + ('𞟨', '𞟫'), + ('𞟭', '𞟮'), + ('𞟰', '𞟾'), + ('𞠀', '𞣄'), + ('𞤀', '𞥃'), + ('𞥋', '𞥋'), + ('𞸀', '𞸃'), + ('𞸅', '𞸟'), + ('𞸡', '𞸢'), + ('𞸤', '𞸤'), + ('𞸧', '𞸧'), + ('𞸩', '𞸲'), + ('𞸴', '𞸷'), + ('𞸹', '𞸹'), + ('𞸻', '𞸻'), + ('𞹂', '𞹂'), + ('𞹇', '𞹇'), + ('𞹉', '𞹉'), + ('𞹋', '𞹋'), + ('𞹍', '𞹏'), + ('𞹑', '𞹒'), + ('𞹔', '𞹔'), + ('𞹗', '𞹗'), + ('𞹙', '𞹙'), + ('𞹛', '𞹛'), + ('𞹝', '𞹝'), + ('𞹟', '𞹟'), + ('𞹡', '𞹢'), + ('𞹤', '𞹤'), + ('𞹧', '𞹪'), + ('𞹬', '𞹲'), + ('𞹴', '𞹷'), + ('𞹹', '𞹼'), + ('𞹾', '𞹾'), + ('𞺀', '𞺉'), + ('𞺋', '𞺛'), + ('𞺡', '𞺣'), + ('𞺥', '𞺩'), + ('𞺫', '𞺻'), + ('🄰', '🅉'), + ('🅐', '🅩'), + ('🅰', '🆉'), +]; + +pub const CR: &'static [(char, char)] = &[('\r', '\r')]; + +pub const DOUBLE_QUOTE: &'static [(char, char)] = &[('"', '"')]; + +pub const EXTEND: &'static [(char, char)] = &[ + ('\u{300}', '\u{36f}'), + ('\u{483}', '\u{489}'), + ('\u{591}', '\u{5bd}'), + ('\u{5bf}', '\u{5bf}'), + ('\u{5c1}', '\u{5c2}'), + ('\u{5c4}', '\u{5c5}'), + ('\u{5c7}', '\u{5c7}'), + ('\u{610}', '\u{61a}'), + ('\u{64b}', '\u{65f}'), + ('\u{670}', '\u{670}'), + ('\u{6d6}', '\u{6dc}'), + ('\u{6df}', '\u{6e4}'), + ('\u{6e7}', '\u{6e8}'), + ('\u{6ea}', '\u{6ed}'), + ('\u{711}', '\u{711}'), + ('\u{730}', '\u{74a}'), + ('\u{7a6}', '\u{7b0}'), + ('\u{7eb}', '\u{7f3}'), + ('\u{7fd}', '\u{7fd}'), + ('\u{816}', '\u{819}'), + ('\u{81b}', '\u{823}'), + ('\u{825}', '\u{827}'), + ('\u{829}', '\u{82d}'), + ('\u{859}', '\u{85b}'), + ('\u{898}', '\u{89f}'), + ('\u{8ca}', '\u{8e1}'), + ('\u{8e3}', 'ः'), + ('\u{93a}', '\u{93c}'), + ('ा', 'ॏ'), + ('\u{951}', '\u{957}'), + ('\u{962}', '\u{963}'), + ('\u{981}', 'ঃ'), + ('\u{9bc}', '\u{9bc}'), + ('\u{9be}', '\u{9c4}'), + ('ে', 'ৈ'), + ('ো', '\u{9cd}'), + ('\u{9d7}', '\u{9d7}'), + ('\u{9e2}', '\u{9e3}'), + ('\u{9fe}', '\u{9fe}'), + ('\u{a01}', 'ਃ'), + ('\u{a3c}', '\u{a3c}'), + ('ਾ', '\u{a42}'), + ('\u{a47}', '\u{a48}'), + ('\u{a4b}', '\u{a4d}'), + ('\u{a51}', '\u{a51}'), + ('\u{a70}', '\u{a71}'), + ('\u{a75}', '\u{a75}'), + ('\u{a81}', 'ઃ'), + ('\u{abc}', '\u{abc}'), + ('ા', '\u{ac5}'), + ('\u{ac7}', 'ૉ'), + ('ો', '\u{acd}'), + ('\u{ae2}', '\u{ae3}'), + ('\u{afa}', '\u{aff}'), + ('\u{b01}', 'ଃ'), + ('\u{b3c}', '\u{b3c}'), + ('\u{b3e}', '\u{b44}'), + ('େ', 'ୈ'), + ('ୋ', '\u{b4d}'), + ('\u{b55}', '\u{b57}'), + ('\u{b62}', '\u{b63}'), + ('\u{b82}', '\u{b82}'), + ('\u{bbe}', 'ூ'), + ('ெ', 'ை'), + ('ொ', '\u{bcd}'), + ('\u{bd7}', '\u{bd7}'), + ('\u{c00}', '\u{c04}'), + ('\u{c3c}', '\u{c3c}'), + ('\u{c3e}', 'ౄ'), + ('\u{c46}', '\u{c48}'), + ('\u{c4a}', '\u{c4d}'), + ('\u{c55}', '\u{c56}'), + ('\u{c62}', '\u{c63}'), + ('\u{c81}', 'ಃ'), + ('\u{cbc}', '\u{cbc}'), + ('ಾ', 'ೄ'), + ('\u{cc6}', 'ೈ'), + ('ೊ', '\u{ccd}'), + ('\u{cd5}', '\u{cd6}'), + ('\u{ce2}', '\u{ce3}'), + ('ೳ', 'ೳ'), + ('\u{d00}', 'ഃ'), + ('\u{d3b}', '\u{d3c}'), + ('\u{d3e}', '\u{d44}'), + ('െ', 'ൈ'), + ('ൊ', '\u{d4d}'), + ('\u{d57}', '\u{d57}'), + ('\u{d62}', '\u{d63}'), + ('\u{d81}', 'ඃ'), + ('\u{dca}', '\u{dca}'), + ('\u{dcf}', '\u{dd4}'), + ('\u{dd6}', '\u{dd6}'), + ('ෘ', '\u{ddf}'), + ('ෲ', 'ෳ'), + ('\u{e31}', '\u{e31}'), + ('\u{e34}', '\u{e3a}'), + ('\u{e47}', '\u{e4e}'), + ('\u{eb1}', '\u{eb1}'), + ('\u{eb4}', '\u{ebc}'), + ('\u{ec8}', '\u{ece}'), + ('\u{f18}', '\u{f19}'), + ('\u{f35}', '\u{f35}'), + ('\u{f37}', '\u{f37}'), + ('\u{f39}', '\u{f39}'), + ('༾', '༿'), + ('\u{f71}', '\u{f84}'), + ('\u{f86}', '\u{f87}'), + ('\u{f8d}', '\u{f97}'), + ('\u{f99}', '\u{fbc}'), + ('\u{fc6}', '\u{fc6}'), + ('ါ', '\u{103e}'), + ('ၖ', '\u{1059}'), + ('\u{105e}', '\u{1060}'), + ('ၢ', 'ၤ'), + ('ၧ', 'ၭ'), + ('\u{1071}', '\u{1074}'), + ('\u{1082}', '\u{108d}'), + ('ႏ', 'ႏ'), + ('ႚ', '\u{109d}'), + ('\u{135d}', '\u{135f}'), + ('\u{1712}', '᜕'), + ('\u{1732}', '᜴'), + ('\u{1752}', '\u{1753}'), + ('\u{1772}', '\u{1773}'), + ('\u{17b4}', '\u{17d3}'), + ('\u{17dd}', '\u{17dd}'), + ('\u{180b}', '\u{180d}'), + ('\u{180f}', '\u{180f}'), + ('\u{1885}', '\u{1886}'), + ('\u{18a9}', '\u{18a9}'), + ('\u{1920}', 'ᤫ'), + ('ᤰ', '\u{193b}'), + ('\u{1a17}', '\u{1a1b}'), + ('ᩕ', '\u{1a5e}'), + ('\u{1a60}', '\u{1a7c}'), + ('\u{1a7f}', '\u{1a7f}'), + ('\u{1ab0}', '\u{1ace}'), + ('\u{1b00}', 'ᬄ'), + ('\u{1b34}', '᭄'), + ('\u{1b6b}', '\u{1b73}'), + ('\u{1b80}', 'ᮂ'), + ('ᮡ', '\u{1bad}'), + ('\u{1be6}', '᯳'), + ('ᰤ', '\u{1c37}'), + ('\u{1cd0}', '\u{1cd2}'), + ('\u{1cd4}', '\u{1ce8}'), + ('\u{1ced}', '\u{1ced}'), + ('\u{1cf4}', '\u{1cf4}'), + ('᳷', '\u{1cf9}'), + ('\u{1dc0}', '\u{1dff}'), + ('\u{200c}', '\u{200c}'), + ('\u{20d0}', '\u{20f0}'), + ('\u{2cef}', '\u{2cf1}'), + ('\u{2d7f}', '\u{2d7f}'), + ('\u{2de0}', '\u{2dff}'), + ('\u{302a}', '\u{302f}'), + ('\u{3099}', '\u{309a}'), + ('\u{a66f}', '\u{a672}'), + ('\u{a674}', '\u{a67d}'), + ('\u{a69e}', '\u{a69f}'), + ('\u{a6f0}', '\u{a6f1}'), + ('\u{a802}', '\u{a802}'), + ('\u{a806}', '\u{a806}'), + ('\u{a80b}', '\u{a80b}'), + ('ꠣ', 'ꠧ'), + ('\u{a82c}', '\u{a82c}'), + ('ꢀ', 'ꢁ'), + ('ꢴ', '\u{a8c5}'), + ('\u{a8e0}', '\u{a8f1}'), + ('\u{a8ff}', '\u{a8ff}'), + ('\u{a926}', '\u{a92d}'), + ('\u{a947}', '꥓'), + ('\u{a980}', 'ꦃ'), + ('\u{a9b3}', '꧀'), + ('\u{a9e5}', '\u{a9e5}'), + ('\u{aa29}', '\u{aa36}'), + ('\u{aa43}', '\u{aa43}'), + ('\u{aa4c}', 'ꩍ'), + ('ꩻ', 'ꩽ'), + ('\u{aab0}', '\u{aab0}'), + ('\u{aab2}', '\u{aab4}'), + ('\u{aab7}', '\u{aab8}'), + ('\u{aabe}', '\u{aabf}'), + ('\u{aac1}', '\u{aac1}'), + ('ꫫ', 'ꫯ'), + ('ꫵ', '\u{aaf6}'), + ('ꯣ', 'ꯪ'), + ('꯬', '\u{abed}'), + ('\u{fb1e}', '\u{fb1e}'), + ('\u{fe00}', '\u{fe0f}'), + ('\u{fe20}', '\u{fe2f}'), + ('\u{ff9e}', '\u{ff9f}'), + ('\u{101fd}', '\u{101fd}'), + ('\u{102e0}', '\u{102e0}'), + ('\u{10376}', '\u{1037a}'), + ('\u{10a01}', '\u{10a03}'), + ('\u{10a05}', '\u{10a06}'), + ('\u{10a0c}', '\u{10a0f}'), + ('\u{10a38}', '\u{10a3a}'), + ('\u{10a3f}', '\u{10a3f}'), + ('\u{10ae5}', '\u{10ae6}'), + ('\u{10d24}', '\u{10d27}'), + ('\u{10eab}', '\u{10eac}'), + ('\u{10efd}', '\u{10eff}'), + ('\u{10f46}', '\u{10f50}'), + ('\u{10f82}', '\u{10f85}'), + ('𑀀', '𑀂'), + ('\u{11038}', '\u{11046}'), + ('\u{11070}', '\u{11070}'), + ('\u{11073}', '\u{11074}'), + ('\u{1107f}', '𑂂'), + ('𑂰', '\u{110ba}'), + ('\u{110c2}', '\u{110c2}'), + ('\u{11100}', '\u{11102}'), + ('\u{11127}', '\u{11134}'), + ('𑅅', '𑅆'), + ('\u{11173}', '\u{11173}'), + ('\u{11180}', '𑆂'), + ('𑆳', '𑇀'), + ('\u{111c9}', '\u{111cc}'), + ('𑇎', '\u{111cf}'), + ('𑈬', '\u{11237}'), + ('\u{1123e}', '\u{1123e}'), + ('\u{11241}', '\u{11241}'), + ('\u{112df}', '\u{112ea}'), + ('\u{11300}', '𑌃'), + ('\u{1133b}', '\u{1133c}'), + ('\u{1133e}', '𑍄'), + ('𑍇', '𑍈'), + ('𑍋', '𑍍'), + ('\u{11357}', '\u{11357}'), + ('𑍢', '𑍣'), + ('\u{11366}', '\u{1136c}'), + ('\u{11370}', '\u{11374}'), + ('𑐵', '\u{11446}'), + ('\u{1145e}', '\u{1145e}'), + ('\u{114b0}', '\u{114c3}'), + ('\u{115af}', '\u{115b5}'), + ('𑖸', '\u{115c0}'), + ('\u{115dc}', '\u{115dd}'), + ('𑘰', '\u{11640}'), + ('\u{116ab}', '\u{116b7}'), + ('\u{1171d}', '\u{1172b}'), + ('𑠬', '\u{1183a}'), + ('\u{11930}', '𑤵'), + ('𑤷', '𑤸'), + ('\u{1193b}', '\u{1193e}'), + ('𑥀', '𑥀'), + ('𑥂', '\u{11943}'), + ('𑧑', '\u{119d7}'), + ('\u{119da}', '\u{119e0}'), + ('𑧤', '𑧤'), + ('\u{11a01}', '\u{11a0a}'), + ('\u{11a33}', '𑨹'), + ('\u{11a3b}', '\u{11a3e}'), + ('\u{11a47}', '\u{11a47}'), + ('\u{11a51}', '\u{11a5b}'), + ('\u{11a8a}', '\u{11a99}'), + ('𑰯', '\u{11c36}'), + ('\u{11c38}', '\u{11c3f}'), + ('\u{11c92}', '\u{11ca7}'), + ('𑲩', '\u{11cb6}'), + ('\u{11d31}', '\u{11d36}'), + ('\u{11d3a}', '\u{11d3a}'), + ('\u{11d3c}', '\u{11d3d}'), + ('\u{11d3f}', '\u{11d45}'), + ('\u{11d47}', '\u{11d47}'), + ('𑶊', '𑶎'), + ('\u{11d90}', '\u{11d91}'), + ('𑶓', '\u{11d97}'), + ('\u{11ef3}', '𑻶'), + ('\u{11f00}', '\u{11f01}'), + ('𑼃', '𑼃'), + ('𑼴', '\u{11f3a}'), + ('𑼾', '\u{11f42}'), + ('\u{13440}', '\u{13440}'), + ('\u{13447}', '\u{13455}'), + ('\u{16af0}', '\u{16af4}'), + ('\u{16b30}', '\u{16b36}'), + ('\u{16f4f}', '\u{16f4f}'), + ('𖽑', '𖾇'), + ('\u{16f8f}', '\u{16f92}'), + ('\u{16fe4}', '\u{16fe4}'), + ('𖿰', '𖿱'), + ('\u{1bc9d}', '\u{1bc9e}'), + ('\u{1cf00}', '\u{1cf2d}'), + ('\u{1cf30}', '\u{1cf46}'), + ('\u{1d165}', '\u{1d169}'), + ('𝅭', '\u{1d172}'), + ('\u{1d17b}', '\u{1d182}'), + ('\u{1d185}', '\u{1d18b}'), + ('\u{1d1aa}', '\u{1d1ad}'), + ('\u{1d242}', '\u{1d244}'), + ('\u{1da00}', '\u{1da36}'), + ('\u{1da3b}', '\u{1da6c}'), + ('\u{1da75}', '\u{1da75}'), + ('\u{1da84}', '\u{1da84}'), + ('\u{1da9b}', '\u{1da9f}'), + ('\u{1daa1}', '\u{1daaf}'), + ('\u{1e000}', '\u{1e006}'), + ('\u{1e008}', '\u{1e018}'), + ('\u{1e01b}', '\u{1e021}'), + ('\u{1e023}', '\u{1e024}'), + ('\u{1e026}', '\u{1e02a}'), + ('\u{1e08f}', '\u{1e08f}'), + ('\u{1e130}', '\u{1e136}'), + ('\u{1e2ae}', '\u{1e2ae}'), + ('\u{1e2ec}', '\u{1e2ef}'), + ('\u{1e4ec}', '\u{1e4ef}'), + ('\u{1e8d0}', '\u{1e8d6}'), + ('\u{1e944}', '\u{1e94a}'), + ('🏻', '🏿'), + ('\u{e0020}', '\u{e007f}'), + ('\u{e0100}', '\u{e01ef}'), +]; + +pub const EXTENDNUMLET: &'static [(char, char)] = &[ + ('_', '_'), + ('\u{202f}', '\u{202f}'), + ('‿', '⁀'), + ('⁔', '⁔'), + ('︳', '︴'), + ('﹍', '﹏'), + ('_', '_'), +]; + +pub const FORMAT: &'static [(char, char)] = &[ + ('\u{ad}', '\u{ad}'), + ('\u{600}', '\u{605}'), + ('\u{61c}', '\u{61c}'), + ('\u{6dd}', '\u{6dd}'), + ('\u{70f}', '\u{70f}'), + ('\u{890}', '\u{891}'), + ('\u{8e2}', '\u{8e2}'), + ('\u{180e}', '\u{180e}'), + ('\u{200e}', '\u{200f}'), + ('\u{202a}', '\u{202e}'), + ('\u{2060}', '\u{2064}'), + ('\u{2066}', '\u{206f}'), + ('\u{feff}', '\u{feff}'), + ('\u{fff9}', '\u{fffb}'), + ('\u{110bd}', '\u{110bd}'), + ('\u{110cd}', '\u{110cd}'), + ('\u{13430}', '\u{1343f}'), + ('\u{1bca0}', '\u{1bca3}'), + ('\u{1d173}', '\u{1d17a}'), + ('\u{e0001}', '\u{e0001}'), +]; + +pub const HEBREW_LETTER: &'static [(char, char)] = &[ + ('א', 'ת'), + ('ׯ', 'ײ'), + ('יִ', 'יִ'), + ('ײַ', 'ﬨ'), + ('שׁ', 'זּ'), + ('טּ', 'לּ'), + ('מּ', 'מּ'), + ('נּ', 'סּ'), + ('ףּ', 'פּ'), + ('צּ', 'ﭏ'), +]; + +pub const KATAKANA: &'static [(char, char)] = &[ + ('〱', '〵'), + ('゛', '゜'), + ('゠', 'ヺ'), + ('ー', 'ヿ'), + ('ㇰ', 'ㇿ'), + ('㋐', '㋾'), + ('㌀', '㍗'), + ('ヲ', 'ン'), + ('𚿰', '𚿳'), + ('𚿵', '𚿻'), + ('𚿽', '𚿾'), + ('𛀀', '𛀀'), + ('𛄠', '𛄢'), + ('𛅕', '𛅕'), + ('𛅤', '𛅧'), +]; + +pub const LF: &'static [(char, char)] = &[('\n', '\n')]; + +pub const MIDLETTER: &'static [(char, char)] = &[ + (':', ':'), + ('·', '·'), + ('·', '·'), + ('՟', '՟'), + ('״', '״'), + ('‧', '‧'), + ('︓', '︓'), + ('﹕', '﹕'), + (':', ':'), +]; + +pub const MIDNUM: &'static [(char, char)] = &[ + (',', ','), + (';', ';'), + (';', ';'), + ('։', '։'), + ('،', '؍'), + ('٬', '٬'), + ('߸', '߸'), + ('⁄', '⁄'), + ('︐', '︐'), + ('︔', '︔'), + ('﹐', '﹐'), + ('﹔', '﹔'), + (',', ','), + (';', ';'), +]; + +pub const MIDNUMLET: &'static [(char, char)] = &[ + ('.', '.'), + ('‘', '’'), + ('․', '․'), + ('﹒', '﹒'), + (''', '''), + ('.', '.'), +]; + +pub const NEWLINE: &'static [(char, char)] = + &[('\u{b}', '\u{c}'), ('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; + +pub const NUMERIC: &'static [(char, char)] = &[ + ('0', '9'), + ('٠', '٩'), + ('٫', '٫'), + ('۰', '۹'), + ('߀', '߉'), + ('०', '९'), + ('০', '৯'), + ('੦', '੯'), + ('૦', '૯'), + ('୦', '୯'), + ('௦', '௯'), + ('౦', '౯'), + ('೦', '೯'), + ('൦', '൯'), + ('෦', '෯'), + ('๐', '๙'), + ('໐', '໙'), + ('༠', '༩'), + ('၀', '၉'), + ('႐', '႙'), + ('០', '៩'), + ('᠐', '᠙'), + ('᥆', '᥏'), + ('᧐', '᧙'), + ('᪀', '᪉'), + ('᪐', '᪙'), + ('᭐', '᭙'), + ('᮰', '᮹'), + ('᱀', '᱉'), + ('᱐', '᱙'), + ('꘠', '꘩'), + ('꣐', '꣙'), + ('꤀', '꤉'), + ('꧐', '꧙'), + ('꧰', '꧹'), + ('꩐', '꩙'), + ('꯰', '꯹'), + ('0', '9'), + ('𐒠', '𐒩'), + ('𐴰', '𐴹'), + ('𑁦', '𑁯'), + ('𑃰', '𑃹'), + ('𑄶', '𑄿'), + ('𑇐', '𑇙'), + ('𑋰', '𑋹'), + ('𑑐', '𑑙'), + ('𑓐', '𑓙'), + ('𑙐', '𑙙'), + ('𑛀', '𑛉'), + ('𑜰', '𑜹'), + ('𑣠', '𑣩'), + ('𑥐', '𑥙'), + ('𑱐', '𑱙'), + ('𑵐', '𑵙'), + ('𑶠', '𑶩'), + ('𑽐', '𑽙'), + ('𖩠', '𖩩'), + ('𖫀', '𖫉'), + ('𖭐', '𖭙'), + ('𝟎', '𝟿'), + ('𞅀', '𞅉'), + ('𞋰', '𞋹'), + ('𞓰', '𞓹'), + ('𞥐', '𞥙'), + ('🯰', '🯹'), +]; + +pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; + +pub const SINGLE_QUOTE: &'static [(char, char)] = &[('\'', '\'')]; + +pub const WSEGSPACE: &'static [(char, char)] = &[ + (' ', ' '), + ('\u{1680}', '\u{1680}'), + ('\u{2000}', '\u{2006}'), + ('\u{2008}', '\u{200a}'), + ('\u{205f}', '\u{205f}'), + ('\u{3000}', '\u{3000}'), +]; + +pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; diff --git a/vendor/regex-syntax/src/utf8.rs b/vendor/regex-syntax/src/utf8.rs new file mode 100644 index 0000000000..b9c8655320 --- /dev/null +++ b/vendor/regex-syntax/src/utf8.rs @@ -0,0 +1,587 @@ +/*! +Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. + +This is sub-module is useful for constructing byte based automatons that need +to embed UTF-8 decoding. The most common use of this module is in conjunction +with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type. + +See the documentation on the `Utf8Sequences` iterator for more details and +an example. + +# Wait, what is this? + +This is simplest to explain with an example. Let's say you wanted to test +whether a particular byte sequence was a Cyrillic character. One possible +scalar value range is `[0400-04FF]`. The set of allowed bytes for this +range can be expressed as a sequence of byte ranges: + +```text +[D0-D3][80-BF] +``` + +This is simple enough: simply encode the boundaries, `0400` encodes to +`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each +corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. + +However, what if you wanted to add the Cyrillic Supplementary characters to +your range? Your range might then become `[0400-052F]`. The same procedure +as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges +you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, +this isn't quite correct because this range doesn't capture many characters, +for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). + +Instead, you need multiple sequences of byte ranges: + +```text +[D0-D3][80-BF] # matches codepoints 0400-04FF +[D4][80-AF] # matches codepoints 0500-052F +``` + +This gets even more complicated if you want bigger ranges, particularly if +they naively contain surrogate codepoints. For example, the sequence of byte +ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: + +```text +[0-7F] +[C2-DF][80-BF] +[E0][A0-BF][80-BF] +[E1-EC][80-BF][80-BF] +[ED][80-9F][80-BF] +[EE-EF][80-BF][80-BF] +``` + +Note that the byte ranges above will *not* match any erroneous encoding of +UTF-8, including encodings of surrogate codepoints. + +And, of course, for all of Unicode (`[000000-10FFFF]`): + +```text +[0-7F] +[C2-DF][80-BF] +[E0][A0-BF][80-BF] +[E1-EC][80-BF][80-BF] +[ED][80-9F][80-BF] +[EE-EF][80-BF][80-BF] +[F0][90-BF][80-BF][80-BF] +[F1-F3][80-BF][80-BF][80-BF] +[F4][80-8F][80-BF][80-BF] +``` + +This module automates the process of creating these byte ranges from ranges of +Unicode scalar values. + +# Lineage + +I got the idea and general implementation strategy from Russ Cox in his +[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. +Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). +I also got the idea from +[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), +which uses it for executing automata on their term index. +*/ + +#![deny(missing_docs)] + +use std::char; +use std::fmt; +use std::iter::FusedIterator; +use std::slice; + +const MAX_UTF8_BYTES: usize = 4; + +/// Utf8Sequence represents a sequence of byte ranges. +/// +/// To match a Utf8Sequence, a candidate byte sequence must match each +/// successive range. +/// +/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte +/// sequence `\xDD\x61` would not match because `0x61 < 0x80`. +#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] +pub enum Utf8Sequence { + /// One byte range. + One(Utf8Range), + /// Two successive byte ranges. + Two([Utf8Range; 2]), + /// Three successive byte ranges. + Three([Utf8Range; 3]), + /// Four successive byte ranges. + Four([Utf8Range; 4]), +} + +impl Utf8Sequence { + /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value + /// range. + /// + /// This assumes that `start` and `end` have the same length. + fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { + assert_eq!(start.len(), end.len()); + match start.len() { + 2 => Utf8Sequence::Two([ + Utf8Range::new(start[0], end[0]), + Utf8Range::new(start[1], end[1]), + ]), + 3 => Utf8Sequence::Three([ + Utf8Range::new(start[0], end[0]), + Utf8Range::new(start[1], end[1]), + Utf8Range::new(start[2], end[2]), + ]), + 4 => Utf8Sequence::Four([ + Utf8Range::new(start[0], end[0]), + Utf8Range::new(start[1], end[1]), + Utf8Range::new(start[2], end[2]), + Utf8Range::new(start[3], end[3]), + ]), + n => unreachable!("invalid encoded length: {}", n), + } + } + + /// Returns the underlying sequence of byte ranges as a slice. + pub fn as_slice(&self) -> &[Utf8Range] { + use self::Utf8Sequence::*; + match *self { + One(ref r) => slice::from_ref(r), + Two(ref r) => &r[..], + Three(ref r) => &r[..], + Four(ref r) => &r[..], + } + } + + /// Returns the number of byte ranges in this sequence. + /// + /// The length is guaranteed to be in the closed interval `[1, 4]`. + pub fn len(&self) -> usize { + self.as_slice().len() + } + + /// Reverses the ranges in this sequence. + /// + /// For example, if this corresponds to the following sequence: + /// + /// ```text + /// [D0-D3][80-BF] + /// ``` + /// + /// Then after reversal, it will be + /// + /// ```text + /// [80-BF][D0-D3] + /// ``` + /// + /// This is useful when one is constructing a UTF-8 automaton to match + /// character classes in reverse. + pub fn reverse(&mut self) { + match *self { + Utf8Sequence::One(_) => {} + Utf8Sequence::Two(ref mut x) => x.reverse(), + Utf8Sequence::Three(ref mut x) => x.reverse(), + Utf8Sequence::Four(ref mut x) => x.reverse(), + } + } + + /// Returns true if and only if a prefix of `bytes` matches this sequence + /// of byte ranges. + pub fn matches(&self, bytes: &[u8]) -> bool { + if bytes.len() < self.len() { + return false; + } + for (&b, r) in bytes.iter().zip(self) { + if !r.matches(b) { + return false; + } + } + true + } +} + +impl<'a> IntoIterator for &'a Utf8Sequence { + type IntoIter = slice::Iter<'a, Utf8Range>; + type Item = &'a Utf8Range; + + fn into_iter(self) -> Self::IntoIter { + self.as_slice().iter() + } +} + +impl fmt::Debug for Utf8Sequence { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::Utf8Sequence::*; + match *self { + One(ref r) => write!(f, "{:?}", r), + Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]), + Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]), + Four(ref r) => { + write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3]) + } + } + } +} + +/// A single inclusive range of UTF-8 bytes. +#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct Utf8Range { + /// Start of byte range (inclusive). + pub start: u8, + /// End of byte range (inclusive). + pub end: u8, +} + +impl Utf8Range { + fn new(start: u8, end: u8) -> Self { + Utf8Range { start, end } + } + + /// Returns true if and only if the given byte is in this range. + pub fn matches(&self, b: u8) -> bool { + self.start <= b && b <= self.end + } +} + +impl fmt::Debug for Utf8Range { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.start == self.end { + write!(f, "[{:X}]", self.start) + } else { + write!(f, "[{:X}-{:X}]", self.start, self.end) + } + } +} + +/// An iterator over ranges of matching UTF-8 byte sequences. +/// +/// The iteration represents an alternation of comprehensive byte sequences +/// that match precisely the set of UTF-8 encoded scalar values. +/// +/// A byte sequence corresponds to one of the scalar values in the range given +/// if and only if it completely matches exactly one of the sequences of byte +/// ranges produced by this iterator. +/// +/// Each sequence of byte ranges matches a unique set of bytes. That is, no two +/// sequences will match the same bytes. +/// +/// # Example +/// +/// This shows how to match an arbitrary byte sequence against a range of +/// scalar values. +/// +/// ```rust +/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence}; +/// +/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { +/// for range in seqs { +/// if range.matches(bytes) { +/// return true; +/// } +/// } +/// false +/// } +/// +/// // Test the basic multilingual plane. +/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect(); +/// +/// // UTF-8 encoding of 'a'. +/// assert!(matches(&seqs, &[0x61])); +/// // UTF-8 encoding of '☃' (`\u{2603}`). +/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); +/// // UTF-8 encoding of `\u{10348}` (outside the BMP). +/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); +/// // Tries to match against a UTF-8 encoding of a surrogate codepoint, +/// // which is invalid UTF-8, and therefore fails, despite the fact that +/// // the corresponding codepoint (0xD800) falls in the range given. +/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); +/// // And fails against plain old invalid UTF-8. +/// assert!(!matches(&seqs, &[0xFF, 0xFF])); +/// ``` +/// +/// If this example seems circuitous, that's because it is! It's meant to be +/// illustrative. In practice, you could just try to decode your byte sequence +/// and compare it with the scalar value range directly. However, this is not +/// always possible (for example, in a byte based automaton). +#[derive(Debug)] +pub struct Utf8Sequences { + range_stack: Vec, +} + +impl Utf8Sequences { + /// Create a new iterator over UTF-8 byte ranges for the scalar value range + /// given. + pub fn new(start: char, end: char) -> Self { + let mut it = Utf8Sequences { range_stack: vec![] }; + it.push(start as u32, end as u32); + it + } + + /// reset resets the scalar value range. + /// Any existing state is cleared, but resources may be reused. + /// + /// N.B. Benchmarks say that this method is dubious. + #[doc(hidden)] + pub fn reset(&mut self, start: char, end: char) { + self.range_stack.clear(); + self.push(start as u32, end as u32); + } + + fn push(&mut self, start: u32, end: u32) { + self.range_stack.push(ScalarRange { start, end }); + } +} + +struct ScalarRange { + start: u32, + end: u32, +} + +impl fmt::Debug for ScalarRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "ScalarRange({:X}, {:X})", self.start, self.end) + } +} + +impl Iterator for Utf8Sequences { + type Item = Utf8Sequence; + + fn next(&mut self) -> Option { + 'TOP: while let Some(mut r) = self.range_stack.pop() { + 'INNER: loop { + if let Some((r1, r2)) = r.split() { + self.push(r2.start, r2.end); + r.start = r1.start; + r.end = r1.end; + continue 'INNER; + } + if !r.is_valid() { + continue 'TOP; + } + for i in 1..MAX_UTF8_BYTES { + let max = max_scalar_value(i); + if r.start <= max && max < r.end { + self.push(max + 1, r.end); + r.end = max; + continue 'INNER; + } + } + if let Some(ascii_range) = r.as_ascii() { + return Some(Utf8Sequence::One(ascii_range)); + } + for i in 1..MAX_UTF8_BYTES { + let m = (1 << (6 * i)) - 1; + if (r.start & !m) != (r.end & !m) { + if (r.start & m) != 0 { + self.push((r.start | m) + 1, r.end); + r.end = r.start | m; + continue 'INNER; + } + if (r.end & m) != m { + self.push(r.end & !m, r.end); + r.end = (r.end & !m) - 1; + continue 'INNER; + } + } + } + let mut start = [0; MAX_UTF8_BYTES]; + let mut end = [0; MAX_UTF8_BYTES]; + let n = r.encode(&mut start, &mut end); + return Some(Utf8Sequence::from_encoded_range( + &start[0..n], + &end[0..n], + )); + } + } + None + } +} + +impl FusedIterator for Utf8Sequences {} + +impl ScalarRange { + /// split splits this range if it overlaps with a surrogate codepoint. + /// + /// Either or both ranges may be invalid. + fn split(&self) -> Option<(ScalarRange, ScalarRange)> { + if self.start < 0xE000 && self.end > 0xD7FF { + Some(( + ScalarRange { start: self.start, end: 0xD7FF }, + ScalarRange { start: 0xE000, end: self.end }, + )) + } else { + None + } + } + + /// is_valid returns true if and only if start <= end. + fn is_valid(&self) -> bool { + self.start <= self.end + } + + /// as_ascii returns this range as a Utf8Range if and only if all scalar + /// values in this range can be encoded as a single byte. + fn as_ascii(&self) -> Option { + if self.is_ascii() { + Some(Utf8Range::new(self.start as u8, self.end as u8)) + } else { + None + } + } + + /// is_ascii returns true if the range is ASCII only (i.e., takes a single + /// byte to encode any scalar value). + fn is_ascii(&self) -> bool { + self.is_valid() && self.end <= 0x7f + } + + /// encode writes the UTF-8 encoding of the start and end of this range + /// to the corresponding destination slices, and returns the number of + /// bytes written. + /// + /// The slices should have room for at least `MAX_UTF8_BYTES`. + fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize { + let cs = char::from_u32(self.start).unwrap(); + let ce = char::from_u32(self.end).unwrap(); + let ss = cs.encode_utf8(start); + let se = ce.encode_utf8(end); + assert_eq!(ss.len(), se.len()); + ss.len() + } +} + +fn max_scalar_value(nbytes: usize) -> u32 { + match nbytes { + 1 => 0x007F, + 2 => 0x07FF, + 3 => 0xFFFF, + 4 => 0x0010_FFFF, + _ => unreachable!("invalid UTF-8 byte sequence size"), + } +} + +#[cfg(test)] +mod tests { + use std::char; + + use crate::utf8::{Utf8Range, Utf8Sequences}; + + fn rutf8(s: u8, e: u8) -> Utf8Range { + Utf8Range::new(s, e) + } + + fn never_accepts_surrogate_codepoints(start: char, end: char) { + for cp in 0xD800..0xE000 { + let buf = encode_surrogate(cp); + for r in Utf8Sequences::new(start, end) { + if r.matches(&buf) { + panic!( + "Sequence ({:X}, {:X}) contains range {:?}, \ + which matches surrogate code point {:X} \ + with encoded bytes {:?}", + start as u32, end as u32, r, cp, buf, + ); + } + } + } + } + + #[test] + fn codepoints_no_surrogates() { + never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}'); + never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}'); + never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}'); + never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}'); + never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}'); + } + + #[test] + fn single_codepoint_one_sequence() { + // Tests that every range of scalar values that contains a single + // scalar value is recognized by one sequence of byte ranges. + for i in 0x0..=0x0010_FFFF { + let c = match char::from_u32(i) { + None => continue, + Some(c) => c, + }; + let seqs: Vec<_> = Utf8Sequences::new(c, c).collect(); + assert_eq!(seqs.len(), 1); + } + } + + #[test] + fn bmp() { + use crate::utf8::Utf8Sequence::*; + + let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::>(); + assert_eq!( + seqs, + vec![ + One(rutf8(0x0, 0x7F)), + Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]), + Three([ + rutf8(0xE0, 0xE0), + rutf8(0xA0, 0xBF), + rutf8(0x80, 0xBF) + ]), + Three([ + rutf8(0xE1, 0xEC), + rutf8(0x80, 0xBF), + rutf8(0x80, 0xBF) + ]), + Three([ + rutf8(0xED, 0xED), + rutf8(0x80, 0x9F), + rutf8(0x80, 0xBF) + ]), + Three([ + rutf8(0xEE, 0xEF), + rutf8(0x80, 0xBF), + rutf8(0x80, 0xBF) + ]), + ] + ); + } + + #[test] + fn reverse() { + use crate::utf8::Utf8Sequence::*; + + let mut s = One(rutf8(0xA, 0xB)); + s.reverse(); + assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]); + + let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]); + s.reverse(); + assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]); + + let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]); + s.reverse(); + assert_eq!( + s.as_slice(), + &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)] + ); + + let mut s = Four([ + rutf8(0xA, 0xB), + rutf8(0xB, 0xC), + rutf8(0xC, 0xD), + rutf8(0xD, 0xE), + ]); + s.reverse(); + assert_eq!( + s.as_slice(), + &[ + rutf8(0xD, 0xE), + rutf8(0xC, 0xD), + rutf8(0xB, 0xC), + rutf8(0xA, 0xB) + ] + ); + } + + fn encode_surrogate(cp: u32) -> [u8; 3] { + const TAG_CONT: u8 = 0b1000_0000; + const TAG_THREE_B: u8 = 0b1110_0000; + + assert!(0xD800 <= cp && cp < 0xE000); + let mut dst = [0; 3]; + dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B; + dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT; + dst[2] = (cp & 0x3F) as u8 | TAG_CONT; + dst + } +} diff --git a/vendor/regex-syntax/test b/vendor/regex-syntax/test new file mode 100755 index 0000000000..4b1b9fb1a9 --- /dev/null +++ b/vendor/regex-syntax/test @@ -0,0 +1,22 @@ +#!/bin/bash + +set -e + +# This is a convenience script for running a broad swath of the syntax tests. +echo "===== DEFAULT FEATURES ===" +cargo test + +features=( + unicode + unicode-age + unicode-bool + unicode-case + unicode-gencat + unicode-perl + unicode-script + unicode-segment +) +for f in "${features[@]}"; do + echo "===== FEATURE: $f ===" + cargo test --no-default-features --features "$f" +done diff --git a/vendor/regex/.cargo-checksum.json b/vendor/regex/.cargo-checksum.json new file mode 100644 index 0000000000..d6ea1df9a2 --- /dev/null +++ b/vendor/regex/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"CHANGELOG.md":"c66cc76a297a1068a3aa81d08326175be887b60fb6330c4b7922f4ad286bd144","Cargo.lock":"fa78ce7955999185e017ef812db215124b65ae8aef67b4ea9ca5bd18c50b0d35","Cargo.toml":"566dca60827f0cbafd1d920457d6d7fad9bd5b85630832207de8cc9c35cab274","HACKING.md":"17818f7a17723608f6bdbe6388ad0a913d4f96f76a16649aaf4e274b1fa0ea97","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","PERFORMANCE.md":"0d5ef3866386918dfdefb1aa9a28cfe33cb3c8ceeb79f3f8ba5b88253dd95991","README.md":"f69204a0f446047d8f4d1f3d84b75f235adb5c26477f3a37b671411bc954d14c","UNICODE.md":"a8a8399540eed000d19420135a527f400247a04572e44d124c786b870f518776","examples/regexdna-input.txt":"156a49710bb3e1ed4bc2bbb0af0f383b747b3d0281453cfff39c296124c598f8","examples/regexdna-output.txt":"35e85b19b70a893d752fd43e54e1e9da08bac43559191cea85b33387c24c4cc1","examples/shootout-regex-dna-bytes.rs":"fa2daedb4e0a05f64f33f4af62fbb0176db998e3676f8637ab684b725367a7b4","examples/shootout-regex-dna-cheat.rs":"1f871a6eaaf8372299fa3c762051112fa89a14235b03f734fc50ebd51ecaee72","examples/shootout-regex-dna-replace.rs":"32ffdf13ac6c4ce3fc32116a048e9cc682aa34cdb8e5beaf565a22addbdcd9ab","examples/shootout-regex-dna-single-cheat.rs":"809f75bf1e1917a53623eb6f1a3ce3b7d2ed98a6a1dbc0bd4853bec49a0c6f94","examples/shootout-regex-dna-single.rs":"1ab14f5703cd4be2e75a2e792e0ba1d322b9e4b14535d396805a4316d577f5bb","examples/shootout-regex-dna.rs":"20ea46ab63f91e3ac6a64e997eadd436a9cbc2f1bdade28e4512052f0e25bc34","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/backtrack.rs":"52987d80448f3d7f5d4e3545ddfc09f1f30de7602d9b5489961db4b215a377fd","src/compile.rs":"79a59be2d2db650b5a322e15e9bf1d3227944410bc780fc6089da8f4d2609b77","src/dfa.rs":"10273980d1f08aaff495e11efa240249a2b2c08a4db7c49c8d6759bc65a3b174","src/error.rs":"71c85db839514f26ee024a689061743ea94a34eb7a3291e6c2b69b45a9682d09","src/exec.rs":"4726e2c210c6adb91b25390cbc864ab71deabb17ca87a46e83acef26a194bfc2","src/expand.rs":"71220309a3bac797f55129f49e79c03e96efec894ea338c735b78695367e04ca","src/find_byte.rs":"b387247b77e3269f057c3399aefe5a815032c3af918c876f80eb4b282e4eb95e","src/freqs.rs":"255555f3d95b08a5bb3bc2f38d5a06cc100a39c0f0127fe4f50c33afa1cadc65","src/input.rs":"13f49c1bce2fadd04a45b421d374cd0f8b72bef83f7e8fda958962aaccbe799a","src/lib.rs":"982fadba415c4c5b93f4d7d4a73a23ec88e2d96daaa03b679d14490ea0f63197","src/literal/imp.rs":"b7f63a861c299bea4baaab17353a420ee339c2cf76d3858c95f39342bd4463e7","src/literal/mod.rs":"533f1d68af088e9485170145e27518368e541a0337fdb44f63249ebf97310300","src/pattern.rs":"993d8b6b4bcea5e02bee3c76e17c356a5a47f8fc53c5555edfd1ebb71c0878bf","src/pikevm.rs":"6c0eaa7e878c945ac4c3c545c98f5706ad04846fc432a5086c8ee78eb030dfa7","src/pool.rs":"942e991ae31ef349bd76efd78b2a712c01166dec965bf93742977ed0870d5a10","src/prog.rs":"bebb3e50745bbc05d6c8240d972ba55a1818c51b1161dc1c21f3fe13c11d4884","src/re_builder.rs":"943344bf6e2fc90902ee04b11b741c32418ac6814b21b7982cc0a3a817713f3e","src/re_bytes.rs":"63ee1db1637a3764addb10e27248129acffaf78bb0a69624add4d9d6f1e97040","src/re_set.rs":"7921ac4a919b7a5deffe82d099a9ccaf5487aebd890dfb7a661e602c6ad3f1a9","src/re_trait.rs":"d237121b6f6b606836c72305cbcb3bbdbc54d1f6827d19a19cd0fbb4372e0145","src/re_unicode.rs":"4ca66d6e835df7c0f570c8cde52667ef90ba1687d5285f12fedef2e38ae925b4","src/sparse.rs":"0da3ddb7972109869248a764dbb10254555f4bb51c375e89fb3fab9cafa47320","src/testdata/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","src/testdata/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","src/testdata/basic.dat":"b5b33aa89d48a61cd67cb1fbfd8f70e62c83e30b86256f9f915a5190dd38ff06","src/testdata/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","src/testdata/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","src/utf8.rs":"f85a356ff5d5b19e417b73ce1dd84581b21d283f6dddd195547c30af9c60bd1a","test":"0d62fdca7da12fc19ea5306b5de1d83e68d9365a029c043d524334da138b0304","tests/api.rs":"7b2a0ef75e99b9776094967bd66e9cdeaa8e11359f5f0a12bd08ef0e8d0c11fc","tests/api_str.rs":"2ae38c04e7e8fac008b609a820d0b1561ba75f39b0edc0987d6d3d06132da77f","tests/bytes.rs":"edc50f526c5fee43df89d639ef18b237e4eb91e9d533bfc43f3cbab7417d38ba","tests/consistent.rs":"d69435154c09478076497216e43081a835ac65147181a4fbddad7bff469605b2","tests/crates_regex.rs":"91a59d470e0700b4bcb3ff735d06799f3107b8ef4875a2e9904607b164be0326","tests/crazy.rs":"c0d56380dff19bdd5d7a3eb731d0e2dc564e169a1b73c81e1879b1e87f5f5f77","tests/flags.rs":"05caace2c81a99d2168037f3a38035d4dffe9f85ef3ebd7ef18b1bc6612f1ea8","tests/fowler.rs":"d78cf914de40b1e125cc92b65ccb444d462586bd07b5e05de4e4a1b5de16aa76","tests/macros.rs":"6db70c16fc90df13e6b30d2b606f8b6dd4dc976697967f6ee001b15aab6d0b19","tests/macros_bytes.rs":"a049f528a93173a1bb176cd46932dce1880679f4a1752e099be920f0e4546fd0","tests/macros_str.rs":"e585b1461374c45a2eca44ca045bc3c1fe984b2b4212e432b0c695b420e708b7","tests/misc.rs":"395f52793fa022e4cdda78675b6a6fba1a3106b4b99c834c39f7801574054bd1","tests/multiline.rs":"1b1a3326ed976437c1357f01d81833ece7ea244f38826246eab55cacd5d0862a","tests/noparse.rs":"12b6be0eff3d80779d33c6459396c74c0f6ebf4ddc9f1d33c3e747ea9e3bf268","tests/regression.rs":"1c965fefb8c7a2b1dfdab3e3fdeebaf47846555c50c8005e5537f96a52a3e252","tests/regression_fuzz.rs":"a504ec563e0d23bd2039493b7b1767fe1f831d7d668f6f4b2ecd124fc7899bcd","tests/replace.rs":"66f97532e40697934e2a77605b9002dfd22c46b6033ccb755e7660d855229f41","tests/searcher.rs":"ce35e47b0a276a7e8c9060c6a0b225ffba163aebc61fbc15555a6897fa0e552c","tests/set.rs":"f1e2af6baeeaed3cc99ed347ff516fe7b2eb0027ef64b891502e1486598eaf8a","tests/shortest_match.rs":"a2c94390c0d61bc24796b4c1288c924e90c8c9c6156fdebb858175177a194a42","tests/suffix_reverse.rs":"b95f89397404871227d9efe6df23b9ded147f183db81597e608f693955c668b5","tests/test_backtrack.rs":"b70c5e5f1241efd76dd9f9dd4a4df8a7b38113bd407d1f5f56867f1176177a59","tests/test_backtrack_bytes.rs":"b8a111d4b4109c8bba7e2afb650572c495a14d357fb1f743c1076fb001f704b5","tests/test_backtrack_utf8bytes.rs":"c0c279785d18beac2b4e178e7bf6c14ed235d65f00ca467cfd9c333d79487649","tests/test_crates_regex.rs":"fd9525c2eef0e2f8cb7f787bc2b721bcd0b5d84f3bca49adfe48d657a99c721a","tests/test_default.rs":"b32c11a43da4379a3717dd7a5f152c811257c7d6595c9d3c51f2de102e320c87","tests/test_default_bytes.rs":"831d3e6bfb882feb15f700e30304bd34328f888fb4c15c7169371e25024ce9a7","tests/test_nfa.rs":"f119fc43a018249c39c813d57096b0654ff69f337345f2bbd9b0e61cc9137285","tests/test_nfa_bytes.rs":"89eae3bef6a1d0bcea6b5de5be35ad72f613f2ceb8b58fe82a6c6ef2ccdc07d0","tests/test_nfa_utf8bytes.rs":"7d830b4aa401887d7cf098b62fed4cd8017ef8b61f625c7c9a2159a6b4cfeb71","tests/unicode.rs":"1af9db7f09a6b0113b8a64733e06c8415fef720b2fdef227ae398d94332287cd","tests/word_boundary.rs":"7081317ddcec1e82dd4a2090a571c6abf2ff4bbfa8cd10395e1eb3f386157fae","tests/word_boundary_ascii.rs":"cd0be5b5b485de0ba7994b42e2864585556c3d2d8bf5eab05b58931d9aaf4b87","tests/word_boundary_unicode.rs":"75dbcc35d3abc0f9795c2ea99e216dc227b0a5b58e9ca5eef767815ff0513921"},"package":"8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"} \ No newline at end of file diff --git a/vendor/regex/CHANGELOG.md b/vendor/regex/CHANGELOG.md new file mode 100644 index 0000000000..44274acac5 --- /dev/null +++ b/vendor/regex/CHANGELOG.md @@ -0,0 +1,1116 @@ +1.7.3 (2023-03-24) +================== +This is a small release that fixes a bug in `Regex::shortest_match_at` that +could cause it to panic, even when the offset given is valid. + +Bug fixes: + +* [BUG #969](https://github.com/rust-lang/regex/issues/969): + Fix a bug in how the reverse DFA was called for `Regex::shortest_match_at`. + + +1.7.2 (2023-03-21) +================== +This is a small release that fixes a failing test on FreeBSD. + +Bug fixes: + +* [BUG #967](https://github.com/rust-lang/regex/issues/967): + Fix "no stack overflow" test which can fail due to the small stack size. + + +1.7.1 (2023-01-09) +================== +This release was done principally to try and fix the doc.rs rendering for the +regex crate. + +Performance improvements: + +* [PERF #930](https://github.com/rust-lang/regex/pull/930): + Optimize `replacen`. This also applies to `replace`, but not `replace_all`. + +Bug fixes: + +* [BUG #945](https://github.com/rust-lang/regex/issues/945): + Maybe fix rustdoc rendering by just bumping a new release? + + +1.7.0 (2022-11-05) +================== +This release principally includes an upgrade to Unicode 15. + +New features: + +* [FEATURE #832](https://github.com/rust-lang/regex/issues/916): + Upgrade to Unicode 15. + + +1.6.0 (2022-07-05) +================== +This release principally includes an upgrade to Unicode 14. + +New features: + +* [FEATURE #832](https://github.com/rust-lang/regex/pull/832): + Clarify that `Captures::len` includes all groups, not just matching groups. +* [FEATURE #857](https://github.com/rust-lang/regex/pull/857): + Add an `ExactSizeIterator` impl for `SubCaptureMatches`. +* [FEATURE #861](https://github.com/rust-lang/regex/pull/861): + Improve `RegexSet` documentation examples. +* [FEATURE #877](https://github.com/rust-lang/regex/issues/877): + Upgrade to Unicode 14. + +Bug fixes: + +* [BUG #792](https://github.com/rust-lang/regex/issues/792): + Fix error message rendering bug. + + +1.5.6 (2022-05-20) +================== +This release includes a few bug fixes, including a bug that produced incorrect +matches when a non-greedy `?` operator was used. + +* [BUG #680](https://github.com/rust-lang/regex/issues/680): + Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. +* [BUG #859](https://github.com/rust-lang/regex/issues/859): + Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. +* [BUG #862](https://github.com/rust-lang/regex/issues/862): + Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'. + + +1.5.5 (2022-03-08) +================== +This releases fixes a security bug in the regex compiler. This bug permits a +vector for a denial-of-service attack in cases where the regex being compiled +is untrusted. There are no known problems where the regex is itself trusted, +including in cases of untrusted haystacks. + +* [SECURITY #GHSA-m5pq-gvj9-9vr8](https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8): + Fixes a bug in the regex compiler where empty sub-expressions subverted the + existing mitigations in place to enforce a size limit on compiled regexes. + The Rust Security Response WG published an advisory about this: + https://groups.google.com/g/rustlang-security-announcements/c/NcNNL1Jq7Yw + + +1.5.4 (2021-05-06) +================== +This release fixes another compilation failure when building regex. This time, +the fix is for when the `pattern` feature is enabled, which only works on +nightly Rust. CI has been updated to test this case. + +* [BUG #772](https://github.com/rust-lang/regex/pull/772): + Fix build when `pattern` feature is enabled. + + +1.5.3 (2021-05-01) +================== +This releases fixes a bug when building regex with only the `unicode-perl` +feature. It turns out that while CI was building this configuration, it wasn't +actually failing the overall build on a failed compilation. + +* [BUG #769](https://github.com/rust-lang/regex/issues/769): + Fix build in `regex-syntax` when only the `unicode-perl` feature is enabled. + + +1.5.2 (2021-05-01) +================== +This release fixes a performance bug when Unicode word boundaries are used. +Namely, for certain regexes on certain inputs, it's possible for the lazy DFA +to stop searching (causing a fallback to a slower engine) when it doesn't +actually need to. + +[PR #768](https://github.com/rust-lang/regex/pull/768) fixes the bug, which was +originally reported in +[ripgrep#1860](https://github.com/BurntSushi/ripgrep/issues/1860). + + +1.5.1 (2021-04-30) +================== +This is a patch release that fixes a compilation error when the `perf-literal` +feature is not enabled. + + +1.5.0 (2021-04-30) +================== +This release primarily updates to Rust 2018 (finally) and bumps the MSRV to +Rust 1.41 (from Rust 1.28). Rust 1.41 was chosen because it's still reasonably +old, and is what's in Debian stable at the time of writing. + +This release also drops this crate's own bespoke substring search algorithms +in favor of a new +[`memmem` implementation provided by the `memchr` crate](https://docs.rs/memchr/2.4.0/memchr/memmem/index.html). +This will change the performance profile of some regexes, sometimes getting a +little worse, and hopefully more frequently, getting a lot better. Please +report any serious performance regressions if you find them. + + +1.4.6 (2021-04-22) +================== +This is a small patch release that fixes the compiler's size check on how much +heap memory a regex uses. Previously, the compiler did not account for the +heap usage of Unicode character classes. Now it does. It's possible that this +may make some regexes fail to compile that previously did compile. If that +happens, please file an issue. + +* [BUG OSS-fuzz#33579](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579): + Some regexes can use more heap memory than one would expect. + + +1.4.5 (2021-03-14) +================== +This is a small patch release that fixes a regression in the size of a `Regex` +in the 1.4.4 release. Prior to 1.4.4, a `Regex` was 552 bytes. In the 1.4.4 +release, it was 856 bytes due to internal changes. In this release, a `Regex` +is now 16 bytes. In general, the size of a `Regex` was never something that was +on my radar, but this increased size in the 1.4.4 release seems to have crossed +a threshold and resulted in stack overflows in some programs. + +* [BUG #750](https://github.com/rust-lang/regex/pull/750): + Fixes stack overflows seemingly caused by a large `Regex` size by decreasing + its size. + + +1.4.4 (2021-03-11) +================== +This is a small patch release that contains some bug fixes. Notably, it also +drops the `thread_local` (and `lazy_static`, via transitivity) dependencies. + +Bug fixes: + +* [BUG #362](https://github.com/rust-lang/regex/pull/362): + Memory leaks caused by an internal caching strategy should now be fixed. +* [BUG #576](https://github.com/rust-lang/regex/pull/576): + All regex types now implement `UnwindSafe` and `RefUnwindSafe`. +* [BUG #728](https://github.com/rust-lang/regex/pull/749): + Add missing `Replacer` impls for `Vec`, `String`, `Cow`, etc. + + +1.4.3 (2021-01-08) +================== +This is a small patch release that adds some missing standard trait +implementations for some types in the public API. + +Bug fixes: + +* [BUG #734](https://github.com/rust-lang/regex/pull/734): + Add `FusedIterator` and `ExactSizeIterator` impls to iterator types. +* [BUG #735](https://github.com/rust-lang/regex/pull/735): + Add missing `Debug` impls to public API types. + + +1.4.2 (2020-11-01) +================== +This is a small bug fix release that bans `\P{any}`. We previously banned empty +classes like `[^\w\W]`, but missed the `\P{any}` case. In the future, we hope +to permit empty classes. + +* [BUG #722](https://github.com/rust-lang/regex/issues/722): + Ban `\P{any}` to avoid a panic in the regex compiler. Found by OSS-Fuzz. + + +1.4.1 (2020-10-13) +================== +This is a small bug fix release that makes `\p{cf}` work. Previously, it would +report "property not found" even though `cf` is a valid abbreviation for the +`Format` general category. + +* [BUG #719](https://github.com/rust-lang/regex/issues/719): + Fixes bug that prevented `\p{cf}` from working. + + +1.4.0 (2020-10-11) +================== +This releases has a few minor documentation fixes as well as some very minor +API additions. The MSRV remains at Rust 1.28 for now, but this is intended to +increase to at least Rust 1.41.1 soon. + +This release also adds support for OSS-Fuzz. Kudos to +[@DavidKorczynski](https://github.com/DavidKorczynski) +for doing the heavy lifting for that! + +New features: + +* [FEATURE #649](https://github.com/rust-lang/regex/issues/649): + Support `[`, `]` and `.` in capture group names. +* [FEATURE #687](https://github.com/rust-lang/regex/issues/687): + Add `is_empty` predicate to `RegexSet`. +* [FEATURE #689](https://github.com/rust-lang/regex/issues/689): + Implement `Clone` for `SubCaptureMatches`. +* [FEATURE #715](https://github.com/rust-lang/regex/issues/715): + Add `empty` constructor to `RegexSet` for convenience. + +Bug fixes: + +* [BUG #694](https://github.com/rust-lang/regex/issues/694): + Fix doc example for `Replacer::replace_append`. +* [BUG #698](https://github.com/rust-lang/regex/issues/698): + Clarify docs for `s` flag when using a `bytes::Regex`. +* [BUG #711](https://github.com/rust-lang/regex/issues/711): + Clarify `is_match` docs to indicate that it can match anywhere in string. + + +1.3.9 (2020-05-28) +================== +This release fixes a MSRV (Minimum Support Rust Version) regression in the +1.3.8 release. Namely, while 1.3.8 compiles on Rust 1.28, it actually does not +compile on other Rust versions, such as Rust 1.39. + +Bug fixes: + +* [BUG #685](https://github.com/rust-lang/regex/issues/685): + Remove use of `doc_comment` crate, which cannot be used before Rust 1.43. + + +1.3.8 (2020-05-28) +================== +This release contains a couple of important bug fixes driven +by better support for empty-subexpressions in regexes. For +example, regexes like `b|` are now allowed. Major thanks to +[@sliquister](https://github.com/sliquister) for implementing support for this +in [#677](https://github.com/rust-lang/regex/pull/677). + +Bug fixes: + +* [BUG #523](https://github.com/rust-lang/regex/pull/523): + Add note to documentation that spaces can be escaped in `x` mode. +* [BUG #524](https://github.com/rust-lang/regex/issues/524): + Add support for empty sub-expressions, including empty alternations. +* [BUG #659](https://github.com/rust-lang/regex/issues/659): + Fix match bug caused by an empty sub-expression miscompilation. + + +1.3.7 (2020-04-17) +================== +This release contains a small bug fix that fixes how `regex` forwards crate +features to `regex-syntax`. In particular, this will reduce recompilations in +some cases. + +Bug fixes: + +* [BUG #665](https://github.com/rust-lang/regex/pull/665): + Fix feature forwarding to `regex-syntax`. + + +1.3.6 (2020-03-24) +================== +This release contains a sizable (~30%) performance improvement when compiling +some kinds of large regular expressions. + +Performance improvements: + +* [PERF #657](https://github.com/rust-lang/regex/pull/657): + Improvement performance of compiling large regular expressions. + + +1.3.5 (2020-03-12) +================== +This release updates this crate to Unicode 13. + +New features: + +* [FEATURE #653](https://github.com/rust-lang/regex/pull/653): + Update `regex-syntax` to Unicode 13. + + +1.3.4 (2020-01-30) +================== +This is a small bug fix release that fixes a bug related to the scoping of +flags in a regex. Namely, before this fix, a regex like `((?i)a)b)` would +match `aB` despite the fact that `b` should not be matched case insensitively. + +Bug fixes: + +* [BUG #640](https://github.com/rust-lang/regex/issues/640): + Fix bug related to the scoping of flags in a regex. + + +1.3.3 (2020-01-09) +================== +This is a small maintenance release that upgrades the dependency on +`thread_local` from `0.3` to `1.0`. The minimum supported Rust version remains +at Rust 1.28. + + +1.3.2 (2020-01-09) +================== +This is a small maintenance release with some house cleaning and bug fixes. + +New features: + +* [FEATURE #631](https://github.com/rust-lang/regex/issues/631): + Add a `Match::range` method an a `From for Range` impl. + +Bug fixes: + +* [BUG #521](https://github.com/rust-lang/regex/issues/521): + Corrects `/-/.splitn("a", 2)` to return `["a"]` instead of `["a", ""]`. +* [BUG #594](https://github.com/rust-lang/regex/pull/594): + Improve error reporting when writing `\p\`. +* [BUG #627](https://github.com/rust-lang/regex/issues/627): + Corrects `/-/.split("a-")` to return `["a", ""]` instead of `["a"]`. +* [BUG #633](https://github.com/rust-lang/regex/pull/633): + Squash deprecation warnings for the `std::error::Error::description` method. + + +1.3.1 (2019-09-04) +================== +This is a maintenance release with no changes in order to try to work-around +a [docs.rs/Cargo issue](https://github.com/rust-lang/docs.rs/issues/400). + + +1.3.0 (2019-09-03) +================== +This release adds a plethora of new crate features that permit users of regex +to shrink its size considerably, in exchange for giving up either functionality +(such as Unicode support) or runtime performance. When all such features are +disabled, the dependency tree for `regex` shrinks to exactly 1 crate +(`regex-syntax`). More information about the new crate features can be +[found in the docs](https://docs.rs/regex/*/#crate-features). + +Note that while this is a new minor version release, the minimum supported +Rust version for this crate remains at `1.28.0`. + +New features: + +* [FEATURE #474](https://github.com/rust-lang/regex/issues/474): + The `use_std` feature has been deprecated in favor of the `std` feature. + The `use_std` feature will be removed in regex 2. Until then, `use_std` will + remain as an alias for the `std` feature. +* [FEATURE #583](https://github.com/rust-lang/regex/issues/583): + Add a substantial number of crate features shrinking `regex`. + + +1.2.1 (2019-08-03) +================== +This release does a bit of house cleaning. Namely: + +* This repository is now using rustfmt. +* License headers have been removed from all files, in following suit with the + Rust project. +* Teddy has been removed from the `regex` crate, and is now part of the + `aho-corasick` crate. + [See `aho-corasick`'s new `packed` sub-module for details](https://docs.rs/aho-corasick/0.7.6/aho_corasick/packed/index.html). +* The `utf8-ranges` crate has been deprecated, with its functionality moving + into the + [`utf8` sub-module of `regex-syntax`](https://docs.rs/regex-syntax/0.6.11/regex_syntax/utf8/index.html). +* The `ucd-util` dependency has been dropped, in favor of implementing what + little we need inside of `regex-syntax` itself. + +In general, this is part of an ongoing (long term) effort to make optimizations +in the regex engine easier to reason about. The current code is too convoluted +and thus it is very easy to introduce new bugs. This simplification effort is +the primary motivation behind re-working the `aho-corasick` crate to not only +bundle algorithms like Teddy, but to also provide regex-like match semantics +automatically. + +Moving forward, the plan is to join up with the `bstr` and `regex-automata` +crates, with the former providing more sophisticated substring search +algorithms (thereby deleting existing code in `regex`) and the latter providing +ahead-of-time compiled DFAs for cases where they are inexpensive to compute. + + +1.2.0 (2019-07-20) +================== +This release updates regex's minimum supported Rust version to 1.28, which was +release almost 1 year ago. This release also updates regex's Unicode data +tables to 12.1.0. + + +1.1.9 (2019-07-06) +================== +This release contains a bug fix that caused regex's tests to fail, due to a +dependency on an unreleased behavior in regex-syntax. + +* [BUG #593](https://github.com/rust-lang/regex/issues/593): + Move an integration-style test on error messages into regex-syntax. + + +1.1.8 (2019-07-04) +================== +This release contains a few small internal refactorings. One of which fixes +an instance of undefined behavior in a part of the SIMD code. + +Bug fixes: + +* [BUG #545](https://github.com/rust-lang/regex/issues/545): + Improves error messages when a repetition operator is used without a number. +* [BUG #588](https://github.com/rust-lang/regex/issues/588): + Removes use of a repr(Rust) union used for type punning in the Teddy matcher. +* [BUG #591](https://github.com/rust-lang/regex/issues/591): + Update docs for running benchmarks and improve failure modes. + + +1.1.7 (2019-06-09) +================== +This release fixes up a few warnings as a result of recent deprecations. + + +1.1.6 (2019-04-16) +================== +This release fixes a regression introduced by a bug fix (for +[BUG #557](https://github.com/rust-lang/regex/issues/557)) which could cause +the regex engine to enter an infinite loop. This bug was originally +[reported against ripgrep](https://github.com/BurntSushi/ripgrep/issues/1247). + + +1.1.5 (2019-04-01) +================== +This release fixes a bug in regex's dependency specification where it requires +a newer version of regex-syntax, but this wasn't communicated correctly in the +Cargo.toml. This would have been caught by a minimal version check, but this +check was disabled because the `rand` crate itself advertises incorrect +dependency specifications. + +Bug fixes: + +* [BUG #570](https://github.com/rust-lang/regex/pull/570): + Fix regex-syntax minimal version. + + +1.1.4 (2019-03-31) +================== +This release fixes a backwards compatibility regression where Regex was no +longer UnwindSafe. This was caused by the upgrade to aho-corasick 0.7, whose +AhoCorasick type was itself not UnwindSafe. This has been fixed in aho-corasick +0.7.4, which we now require. + +Bug fixes: + +* [BUG #568](https://github.com/rust-lang/regex/pull/568): + Fix an API regression where Regex was no longer UnwindSafe. + + +1.1.3 (2019-03-30) +================== +This releases fixes a few bugs and adds a performance improvement when a regex +is a simple alternation of literals. + +Performance improvements: + +* [OPT #566](https://github.com/rust-lang/regex/pull/566): + Upgrades `aho-corasick` to 0.7 and uses it for `foo|bar|...|quux` regexes. + +Bug fixes: + +* [BUG #527](https://github.com/rust-lang/regex/issues/527): + Fix a bug where the parser would panic on patterns like `((?x))`. +* [BUG #555](https://github.com/rust-lang/regex/issues/555): + Fix a bug where the parser would panic on patterns like `(?m){1,1}`. +* [BUG #557](https://github.com/rust-lang/regex/issues/557): + Fix a bug where captures could lead to an incorrect match. + + +1.1.2 (2019-02-27) +================== +This release fixes a bug found in the fix introduced in 1.1.1. + +Bug fixes: + +* [BUG edf45e6f](https://github.com/rust-lang/regex/commit/edf45e6f): + Fix bug introduced in reverse suffix literal matcher in the 1.1.1 release. + + +1.1.1 (2019-02-27) +================== +This is a small release with one fix for a bug caused by literal optimizations. + +Bug fixes: + +* [BUG 661bf53d](https://github.com/rust-lang/regex/commit/661bf53d): + Fixes a bug in the reverse suffix literal optimization. This was originally + reported + [against ripgrep](https://github.com/BurntSushi/ripgrep/issues/1203). + + +1.1.0 (2018-11-30) +================== +This is a small release with a couple small enhancements. This release also +increases the minimal supported Rust version (MSRV) to 1.24.1 (from 1.20.0). In +accordance with this crate's MSRV policy, this release bumps the minor version +number. + +Performance improvements: + +* [OPT #511](https://github.com/rust-lang/regex/pull/511), + [OPT #540](https://github.com/rust-lang/regex/pull/540): + Improve lazy DFA construction for large regex sets. + +New features: + +* [FEATURE #538](https://github.com/rust-lang/regex/pull/538): + Add Emoji and "break" Unicode properties. See [UNICODE.md](UNICODE.md). + +Bug fixes: + +* [BUG #530](https://github.com/rust-lang/regex/pull/530): + Add Unicode license (for data tables). +* Various typo/doc fixups. + + +1.0.6 (2018-11-06) +================== +This is a small release. + +Performance improvements: + +* [OPT #513](https://github.com/rust-lang/regex/pull/513): + Improve performance of compiling large Unicode classes by 8-10%. + +Bug fixes: + +* [BUG #533](https://github.com/rust-lang/regex/issues/533): + Fix definition of `[[:blank:]]` class that regressed in `regex-syntax 0.5`. + + +1.0.5 (2018-09-06) +================== +This is a small release with an API enhancement. + +New features: + +* [FEATURE #509](https://github.com/rust-lang/regex/pull/509): + Generalize impls of the `Replacer` trait. + + +1.0.4 (2018-08-25) +================== +This is a small release that bumps the quickcheck dependency. + + +1.0.3 (2018-08-24) +================== +This is a small bug fix release. + +Bug fixes: + +* [BUG #504](https://github.com/rust-lang/regex/pull/504): + Fix for Cargo's "minimal version" support. +* [BUG 1e39165f](https://github.com/rust-lang/regex/commit/1e39165f): + Fix doc examples for byte regexes. + + +1.0.2 (2018-07-18) +================== +This release exposes some new lower level APIs on `Regex` that permit +amortizing allocation and controlling the location at which a search is +performed in a more granular way. Most users of the regex crate will not +need or want to use these APIs. + +New features: + +* [FEATURE #493](https://github.com/rust-lang/regex/pull/493): + Add a few lower level APIs for amortizing allocation and more fine grained + searching. + +Bug fixes: + +* [BUG 3981d2ad](https://github.com/rust-lang/regex/commit/3981d2ad): + Correct outdated documentation on `RegexBuilder::dot_matches_new_line`. +* [BUG 7ebe4ae0](https://github.com/rust-lang/regex/commit/7ebe4ae0): + Correct outdated documentation on `Parser::allow_invalid_utf8` in the + `regex-syntax` crate. +* [BUG 24c7770b](https://github.com/rust-lang/regex/commit/24c7770b): + Fix a bug in the HIR printer where it wouldn't correctly escape meta + characters in character classes. + + +1.0.1 (2018-06-19) +================== +This release upgrades regex's Unicode tables to Unicode 11, and enables SIMD +optimizations automatically on Rust stable (1.27 or newer). + +New features: + +* [FEATURE #486](https://github.com/rust-lang/regex/pull/486): + Implement `size_hint` on `RegexSet` match iterators. +* [FEATURE #488](https://github.com/rust-lang/regex/pull/488): + Update Unicode tables for Unicode 11. +* [FEATURE #490](https://github.com/rust-lang/regex/pull/490): + SIMD optimizations are now enabled automatically in Rust stable, for versions + 1.27 and up. No compilation flags or features need to be set. CPU support + SIMD is detected automatically at runtime. + +Bug fixes: + +* [BUG #482](https://github.com/rust-lang/regex/pull/482): + Present a better compilation error when the `use_std` feature isn't used. + + +1.0.0 (2018-05-01) +================== +This release marks the 1.0 release of regex. + +While this release includes some breaking changes, most users of older versions +of the regex library should be able to migrate to 1.0 by simply bumping the +version number. The important changes are as follows: + +* We adopt Rust 1.20 as the new minimum supported version of Rust for regex. + We also tentativley adopt a policy that permits bumping the minimum supported + version of Rust in minor version releases of regex, but no patch releases. + That is, with respect to semver, we do not strictly consider bumping the + minimum version of Rust to be a breaking change, but adopt a conservative + stance as a compromise. +* Octal syntax in regular expressions has been disabled by default. This + permits better error messages that inform users that backreferences aren't + available. Octal syntax can be re-enabled via the corresponding option on + `RegexBuilder`. +* `(?-u:\B)` is no longer allowed in Unicode regexes since it can match at + invalid UTF-8 code unit boundaries. `(?-u:\b)` is still allowed in Unicode + regexes. +* The `From` impl has been removed. This formally removes + the public dependency on `regex-syntax`. +* A new feature, `use_std`, has been added and enabled by default. Disabling + the feature will result in a compilation error. In the future, this may + permit us to support `no_std` environments (w/ `alloc`) in a backwards + compatible way. + +For more information and discussion, please see +[1.0 release tracking issue](https://github.com/rust-lang/regex/issues/457). + + +0.2.11 (2018-05-01) +=================== +This release primarily contains bug fixes. Some of them resolve bugs where +the parser could panic. + +New features: + +* [FEATURE #459](https://github.com/rust-lang/regex/pull/459): + Include C++'s standard regex library and Boost's regex library in the + benchmark harness. We now include D/libphobos, C++/std, C++/boost, Oniguruma, + PCRE1, PCRE2, RE2 and Tcl in the harness. + +Bug fixes: + +* [BUG #445](https://github.com/rust-lang/regex/issues/445): + Clarify order of indices returned by RegexSet match iterator. +* [BUG #461](https://github.com/rust-lang/regex/issues/461): + Improve error messages for invalid regexes like `[\d-a]`. +* [BUG #464](https://github.com/rust-lang/regex/issues/464): + Fix a bug in the error message pretty printer that could cause a panic when + a regex contained a literal `\n` character. +* [BUG #465](https://github.com/rust-lang/regex/issues/465): + Fix a panic in the parser that was caused by applying a repetition operator + to `(?flags)`. +* [BUG #466](https://github.com/rust-lang/regex/issues/466): + Fix a bug where `\pC` was not recognized as an alias for `\p{Other}`. +* [BUG #470](https://github.com/rust-lang/regex/pull/470): + Fix a bug where literal searches did more work than necessary for anchored + regexes. + + +0.2.10 (2018-03-16) +=================== +This release primarily updates the regex crate to changes made in `std::arch` +on nightly Rust. + +New features: + +* [FEATURE #458](https://github.com/rust-lang/regex/pull/458): + The `Hir` type in `regex-syntax` now has a printer. + + +0.2.9 (2018-03-12) +================== +This release introduces a new nightly only feature, `unstable`, which enables +SIMD optimizations for certain types of regexes. No additional compile time +options are necessary, and the regex crate will automatically choose the +best CPU features at run time. As a result, the `simd` (nightly only) crate +dependency has been dropped. + +New features: + +* [FEATURE #456](https://github.com/rust-lang/regex/pull/456): + The regex crate now includes AVX2 optimizations in addition to the extant + SSSE3 optimization. + +Bug fixes: + +* [BUG #455](https://github.com/rust-lang/regex/pull/455): + Fix a bug where `(?x)[ / - ]` failed to parse. + + +0.2.8 (2018-03-12) +================== +Bug gixes: + +* [BUG #454](https://github.com/rust-lang/regex/pull/454): + Fix a bug in the nest limit checker being too aggressive. + + +0.2.7 (2018-03-07) +================== +This release includes a ground-up rewrite of the regex-syntax crate, which has +been in development for over a year. + +New features: + +* Error messages for invalid regexes have been greatly improved. You get these + automatically; you don't need to do anything. In addition to better + formatting, error messages will now explicitly call out the use of look + around. When regex 1.0 is released, this will happen for backreferences as + well. +* Full support for intersection, difference and symmetric difference of + character classes. These can be used via the `&&`, `--` and `~~` binary + operators within classes. +* A Unicode Level 1 conformat implementation of `\p{..}` character classes. + Things like `\p{scx:Hira}`, `\p{age:3.2}` or `\p{Changes_When_Casefolded}` + now work. All property name and value aliases are supported, and properties + are selected via loose matching. e.g., `\p{Greek}` is the same as + `\p{G r E e K}`. +* A new `UNICODE.md` document has been added to this repository that + exhaustively documents support for UTS#18. +* Empty sub-expressions are now permitted in most places. That is, `()+` is + now a valid regex. +* Almost everything in regex-syntax now uses constant stack space, even when + performing analysis that requires structural induction. This reduces the risk + of a user provided regular expression causing a stack overflow. +* [FEATURE #174](https://github.com/rust-lang/regex/issues/174): + The `Ast` type in `regex-syntax` now contains span information. +* [FEATURE #424](https://github.com/rust-lang/regex/issues/424): + Support `\u`, `\u{...}`, `\U` and `\U{...}` syntax for specifying code points + in a regular expression. +* [FEATURE #449](https://github.com/rust-lang/regex/pull/449): + Add a `Replace::by_ref` adapter for use of a replacer without consuming it. + +Bug fixes: + +* [BUG #446](https://github.com/rust-lang/regex/issues/446): + We re-enable the Boyer-Moore literal matcher. + + +0.2.6 (2018-02-08) +================== +Bug fixes: + +* [BUG #446](https://github.com/rust-lang/regex/issues/446): + Fixes a bug in the new Boyer-Moore searcher that results in a match failure. + We fix this bug by temporarily disabling Boyer-Moore. + + +0.2.5 (2017-12-30) +================== +Bug fixes: + +* [BUG #437](https://github.com/rust-lang/regex/issues/437): + Fixes a bug in the new Boyer-Moore searcher that results in a panic. + + +0.2.4 (2017-12-30) +================== +New features: + +* [FEATURE #348](https://github.com/rust-lang/regex/pull/348): + Improve performance for capture searches on anchored regex. + (Contributed by @ethanpailes. Nice work!) +* [FEATURE #419](https://github.com/rust-lang/regex/pull/419): + Expand literal searching to include Tuned Boyer-Moore in some cases. + (Contributed by @ethanpailes. Nice work!) + +Bug fixes: + +* [BUG](https://github.com/rust-lang/regex/pull/436): + The regex compiler plugin has been removed. +* [BUG](https://github.com/rust-lang/regex/pull/436): + `simd` has been bumped to `0.2.1`, which fixes a Rust nightly build error. +* [BUG](https://github.com/rust-lang/regex/pull/436): + Bring the benchmark harness up to date. + + +0.2.3 (2017-11-30) +================== +New features: + +* [FEATURE #374](https://github.com/rust-lang/regex/pull/374): + Add `impl From for &str`. +* [FEATURE #380](https://github.com/rust-lang/regex/pull/380): + Derive `Clone` and `PartialEq` on `Error`. +* [FEATURE #400](https://github.com/rust-lang/regex/pull/400): + Update to Unicode 10. + +Bug fixes: + +* [BUG #375](https://github.com/rust-lang/regex/issues/375): + Fix a bug that prevented the bounded backtracker from terminating. +* [BUG #393](https://github.com/rust-lang/regex/issues/393), + [BUG #394](https://github.com/rust-lang/regex/issues/394): + Fix bug with `replace` methods for empty matches. + + +0.2.2 (2017-05-21) +================== +New features: + +* [FEATURE #341](https://github.com/rust-lang/regex/issues/341): + Support nested character classes and intersection operation. + For example, `[\p{Greek}&&\pL]` matches greek letters and + `[[0-9]&&[^4]]` matches every decimal digit except `4`. + (Much thanks to @robinst, who contributed this awesome feature.) + +Bug fixes: + +* [BUG #321](https://github.com/rust-lang/regex/issues/321): + Fix bug in literal extraction and UTF-8 decoding. +* [BUG #326](https://github.com/rust-lang/regex/issues/326): + Add documentation tip about the `(?x)` flag. +* [BUG #333](https://github.com/rust-lang/regex/issues/333): + Show additional replacement example using curly braces. +* [BUG #334](https://github.com/rust-lang/regex/issues/334): + Fix bug when resolving captures after a match. +* [BUG #338](https://github.com/rust-lang/regex/issues/338): + Add example that uses `Captures::get` to API documentation. +* [BUG #353](https://github.com/rust-lang/regex/issues/353): + Fix RegexSet bug that caused match failure in some cases. +* [BUG #354](https://github.com/rust-lang/regex/pull/354): + Fix panic in parser when `(?x)` is used. +* [BUG #358](https://github.com/rust-lang/regex/issues/358): + Fix literal optimization bug with RegexSet. +* [BUG #359](https://github.com/rust-lang/regex/issues/359): + Fix example code in README. +* [BUG #365](https://github.com/rust-lang/regex/pull/365): + Fix bug in `rure_captures_len` in the C binding. +* [BUG #367](https://github.com/rust-lang/regex/issues/367): + Fix byte class bug that caused a panic. + + +0.2.1 +===== +One major bug with `replace_all` has been fixed along with a couple of other +touchups. + +* [BUG #312](https://github.com/rust-lang/regex/issues/312): + Fix documentation for `NoExpand` to reference correct lifetime parameter. +* [BUG #314](https://github.com/rust-lang/regex/issues/314): + Fix a bug with `replace_all` when replacing a match with the empty string. +* [BUG #316](https://github.com/rust-lang/regex/issues/316): + Note a missing breaking change from the `0.2.0` CHANGELOG entry. + (`RegexBuilder::compile` was renamed to `RegexBuilder::build`.) +* [BUG #324](https://github.com/rust-lang/regex/issues/324): + Compiling `regex` should only require one version of `memchr` crate. + + +0.2.0 +===== +This is a new major release of the regex crate, and is an implementation of the +[regex 1.0 RFC](https://github.com/rust-lang/rfcs/blob/master/text/1620-regex-1.0.md). +We are releasing a `0.2` first, and if there are no major problems, we will +release a `1.0` shortly. For `0.2`, the minimum *supported* Rust version is +1.12. + +There are a number of **breaking changes** in `0.2`. They are split into two +types. The first type correspond to breaking changes in regular expression +syntax. The second type correspond to breaking changes in the API. + +Breaking changes for regex syntax: + +* POSIX character classes now require double bracketing. Previously, the regex + `[:upper:]` would parse as the `upper` POSIX character class. Now it parses + as the character class containing the characters `:upper:`. The fix to this + change is to use `[[:upper:]]` instead. Note that variants like + `[[:upper:][:blank:]]` continue to work. +* The character `[` must always be escaped inside a character class. +* The characters `&`, `-` and `~` must be escaped if any one of them are + repeated consecutively. For example, `[&]`, `[\&]`, `[\&\&]`, `[&-&]` are all + equivalent while `[&&]` is illegal. (The motivation for this and the prior + change is to provide a backwards compatible path for adding character class + set notation.) +* A `bytes::Regex` now has Unicode mode enabled by default (like the main + `Regex` type). This means regexes compiled with `bytes::Regex::new` that + don't have the Unicode flag set should add `(?-u)` to recover the original + behavior. + +Breaking changes for the regex API: + +* `find` and `find_iter` now **return `Match` values instead of + `(usize, usize)`.** `Match` values have `start` and `end` methods, which + return the match offsets. `Match` values also have an `as_str` method, + which returns the text of the match itself. +* The `Captures` type now only provides a single iterator over all capturing + matches, which should replace uses of `iter` and `iter_pos`. Uses of + `iter_named` should use the `capture_names` method on `Regex`. +* The `at` method on the `Captures` type has been renamed to `get`, and it + now returns a `Match`. Similarly, the `name` method on `Captures` now returns + a `Match`. +* The `replace` methods now return `Cow` values. The `Cow::Borrowed` variant + is returned when no replacements are made. +* The `Replacer` trait has been completely overhauled. This should only + impact clients that implement this trait explicitly. Standard uses of + the `replace` methods should continue to work unchanged. If you implement + the `Replacer` trait, please consult the new documentation. +* The `quote` free function has been renamed to `escape`. +* The `Regex::with_size_limit` method has been removed. It is replaced by + `RegexBuilder::size_limit`. +* The `RegexBuilder` type has switched from owned `self` method receivers to + `&mut self` method receivers. Most uses will continue to work unchanged, but + some code may require naming an intermediate variable to hold the builder. +* The `compile` method on `RegexBuilder` has been renamed to `build`. +* The free `is_match` function has been removed. It is replaced by compiling + a `Regex` and calling its `is_match` method. +* The `PartialEq` and `Eq` impls on `Regex` have been dropped. If you relied + on these impls, the fix is to define a wrapper type around `Regex`, impl + `Deref` on it and provide the necessary impls. +* The `is_empty` method on `Captures` has been removed. This always returns + `false`, so its use is superfluous. +* The `Syntax` variant of the `Error` type now contains a string instead of + a `regex_syntax::Error`. If you were examining syntax errors more closely, + you'll need to explicitly use the `regex_syntax` crate to re-parse the regex. +* The `InvalidSet` variant of the `Error` type has been removed since it is + no longer used. +* Most of the iterator types have been renamed to match conventions. If you + were using these iterator types explicitly, please consult the documentation + for its new name. For example, `RegexSplits` has been renamed to `Split`. + +A number of bugs have been fixed: + +* [BUG #151](https://github.com/rust-lang/regex/issues/151): + The `Replacer` trait has been changed to permit the caller to control + allocation. +* [BUG #165](https://github.com/rust-lang/regex/issues/165): + Remove the free `is_match` function. +* [BUG #166](https://github.com/rust-lang/regex/issues/166): + Expose more knobs (available in `0.1`) and remove `with_size_limit`. +* [BUG #168](https://github.com/rust-lang/regex/issues/168): + Iterators produced by `Captures` now have the correct lifetime parameters. +* [BUG #175](https://github.com/rust-lang/regex/issues/175): + Fix a corner case in the parsing of POSIX character classes. +* [BUG #178](https://github.com/rust-lang/regex/issues/178): + Drop the `PartialEq` and `Eq` impls on `Regex`. +* [BUG #179](https://github.com/rust-lang/regex/issues/179): + Remove `is_empty` from `Captures` since it always returns false. +* [BUG #276](https://github.com/rust-lang/regex/issues/276): + Position of named capture can now be retrieved from a `Captures`. +* [BUG #296](https://github.com/rust-lang/regex/issues/296): + Remove winapi/kernel32-sys dependency on UNIX. +* [BUG #307](https://github.com/rust-lang/regex/issues/307): + Fix error on emscripten. + + +0.1.80 +====== +* [PR #292](https://github.com/rust-lang/regex/pull/292): + Fixes bug #291, which was introduced by PR #290. + +0.1.79 +====== +* Require regex-syntax 0.3.8. + +0.1.78 +====== +* [PR #290](https://github.com/rust-lang/regex/pull/290): + Fixes bug #289, which caused some regexes with a certain combination + of literals to match incorrectly. + +0.1.77 +====== +* [PR #281](https://github.com/rust-lang/regex/pull/281): + Fixes bug #280 by disabling all literal optimizations when a pattern + is partially anchored. + +0.1.76 +====== +* Tweak criteria for using the Teddy literal matcher. + +0.1.75 +====== +* [PR #275](https://github.com/rust-lang/regex/pull/275): + Improves match verification performance in the Teddy SIMD searcher. +* [PR #278](https://github.com/rust-lang/regex/pull/278): + Replaces slow substring loop in the Teddy SIMD searcher with Aho-Corasick. +* Implemented DoubleEndedIterator on regex set match iterators. + +0.1.74 +====== +* Release regex-syntax 0.3.5 with a minor bug fix. +* Fix bug #272. +* Fix bug #277. +* [PR #270](https://github.com/rust-lang/regex/pull/270): + Fixes bugs #264, #268 and an unreported where the DFA cache size could be + drastically under estimated in some cases (leading to high unexpected memory + usage). + +0.1.73 +====== +* Release `regex-syntax 0.3.4`. +* Bump `regex-syntax` dependency version for `regex` to `0.3.4`. + +0.1.72 +====== +* [PR #262](https://github.com/rust-lang/regex/pull/262): + Fixes a number of small bugs caught by fuzz testing (AFL). + +0.1.71 +====== +* [PR #236](https://github.com/rust-lang/regex/pull/236): + Fix a bug in how suffix literals were extracted, which could lead + to invalid match behavior in some cases. + +0.1.70 +====== +* [PR #231](https://github.com/rust-lang/regex/pull/231): + Add SIMD accelerated multiple pattern search. +* [PR #228](https://github.com/rust-lang/regex/pull/228): + Reintroduce the reverse suffix literal optimization. +* [PR #226](https://github.com/rust-lang/regex/pull/226): + Implements NFA state compression in the lazy DFA. +* [PR #223](https://github.com/rust-lang/regex/pull/223): + A fully anchored RegexSet can now short-circuit. + +0.1.69 +====== +* [PR #216](https://github.com/rust-lang/regex/pull/216): + Tweak the threshold for running backtracking. +* [PR #217](https://github.com/rust-lang/regex/pull/217): + Add upper limit (from the DFA) to capture search (for the NFA). +* [PR #218](https://github.com/rust-lang/regex/pull/218): + Add rure, a C API. + +0.1.68 +====== +* [PR #210](https://github.com/rust-lang/regex/pull/210): + Fixed a performance bug in `bytes::Regex::replace` where `extend` was used + instead of `extend_from_slice`. +* [PR #211](https://github.com/rust-lang/regex/pull/211): + Fixed a bug in the handling of word boundaries in the DFA. +* [PR #213](https://github.com/rust-lang/pull/213): + Added RE2 and Tcl to the benchmark harness. Also added a CLI utility from + running regexes using any of the following regex engines: PCRE1, PCRE2, + Oniguruma, RE2, Tcl and of course Rust's own regexes. + +0.1.67 +====== +* [PR #201](https://github.com/rust-lang/regex/pull/201): + Fix undefined behavior in the `regex!` compiler plugin macro. +* [PR #205](https://github.com/rust-lang/regex/pull/205): + More improvements to DFA performance. Competitive with RE2. See PR for + benchmarks. +* [PR #209](https://github.com/rust-lang/regex/pull/209): + Release 0.1.66 was semver incompatible since it required a newer version + of Rust than previous releases. This PR fixes that. (And `0.1.66` was + yanked.) + +0.1.66 +====== +* Speculative support for Unicode word boundaries was added to the DFA. This + should remove the last common case that disqualified use of the DFA. +* An optimization that scanned for suffix literals and then matched the regular + expression in reverse was removed because it had worst case quadratic time + complexity. It was replaced with a more limited optimization where, given any + regex of the form `re$`, it will be matched in reverse from the end of the + haystack. +* [PR #202](https://github.com/rust-lang/regex/pull/202): + The inner loop of the DFA was heavily optimized to improve cache locality + and reduce the overall number of instructions run on each iteration. This + represents the first use of `unsafe` in `regex` (to elide bounds checks). +* [PR #200](https://github.com/rust-lang/regex/pull/200): + Use of the `mempool` crate (which used thread local storage) was replaced + with a faster version of a similar API in @Amanieu's `thread_local` crate. + It should reduce contention when using a regex from multiple threads + simultaneously. +* PCRE2 JIT benchmarks were added. A benchmark comparison can be found + [here](https://gist.github.com/anonymous/14683c01993e91689f7206a18675901b). + (Includes a comparison with PCRE1's JIT and Oniguruma.) +* A bug where word boundaries weren't being matched correctly in the DFA was + fixed. This only affected use of `bytes::Regex`. +* [#160](https://github.com/rust-lang/regex/issues/160): + `Captures` now has a `Debug` impl. diff --git a/vendor/regex/Cargo.lock b/vendor/regex/Cargo.lock new file mode 100644 index 0000000000..f91c8879b1 --- /dev/null +++ b/vendor/regex/Cargo.lock @@ -0,0 +1,98 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "getrandom" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.139" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "quickcheck" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" +dependencies = [ + "rand", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "regex" +version = "1.7.3" +dependencies = [ + "aho-corasick", + "lazy_static", + "memchr", + "quickcheck", + "rand", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" diff --git a/vendor/regex/Cargo.toml b/vendor/regex/Cargo.toml new file mode 100644 index 0000000000..37e44fb3b7 --- /dev/null +++ b/vendor/regex/Cargo.toml @@ -0,0 +1,149 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "regex" +version = "1.7.3" +authors = ["The Rust Project Developers"] +exclude = [ + "/scripts/*", + "/.github/*", +] +autotests = false +description = """ +An implementation of regular expressions for Rust. This implementation uses +finite automata and guarantees linear time matching on all inputs. +""" +homepage = "https://github.com/rust-lang/regex" +documentation = "https://docs.rs/regex" +readme = "README.md" +categories = ["text-processing"] +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/regex" + +[profile.bench] +debug = true + +[profile.release] +debug = true + +[profile.test] +debug = true + +[lib] +doctest = false +bench = false + +[[test]] +name = "default" +path = "tests/test_default.rs" + +[[test]] +name = "default-bytes" +path = "tests/test_default_bytes.rs" + +[[test]] +name = "nfa" +path = "tests/test_nfa.rs" + +[[test]] +name = "nfa-utf8bytes" +path = "tests/test_nfa_utf8bytes.rs" + +[[test]] +name = "nfa-bytes" +path = "tests/test_nfa_bytes.rs" + +[[test]] +name = "backtrack" +path = "tests/test_backtrack.rs" + +[[test]] +name = "backtrack-utf8bytes" +path = "tests/test_backtrack_utf8bytes.rs" + +[[test]] +name = "backtrack-bytes" +path = "tests/test_backtrack_bytes.rs" + +[[test]] +name = "crates-regex" +path = "tests/test_crates_regex.rs" + +[dependencies.aho-corasick] +version = "0.7.18" +optional = true + +[dependencies.memchr] +version = "2.4.0" +optional = true + +[dependencies.regex-syntax] +version = "0.6.29" +default-features = false + +[dev-dependencies.lazy_static] +version = "1" + +[dev-dependencies.quickcheck] +version = "1.0.3" +default-features = false + +[dev-dependencies.rand] +version = "0.8.3" +features = [ + "getrandom", + "small_rng", +] +default-features = false + +[features] +default = [ + "std", + "perf", + "unicode", + "regex-syntax/default", +] +pattern = [] +perf = [ + "perf-cache", + "perf-dfa", + "perf-inline", + "perf-literal", +] +perf-cache = [] +perf-dfa = [] +perf-inline = [] +perf-literal = [ + "aho-corasick", + "memchr", +] +std = [] +unicode = [ + "unicode-age", + "unicode-bool", + "unicode-case", + "unicode-gencat", + "unicode-perl", + "unicode-script", + "unicode-segment", + "regex-syntax/unicode", +] +unicode-age = ["regex-syntax/unicode-age"] +unicode-bool = ["regex-syntax/unicode-bool"] +unicode-case = ["regex-syntax/unicode-case"] +unicode-gencat = ["regex-syntax/unicode-gencat"] +unicode-perl = ["regex-syntax/unicode-perl"] +unicode-script = ["regex-syntax/unicode-script"] +unicode-segment = ["regex-syntax/unicode-segment"] +unstable = ["pattern"] +use_std = ["std"] diff --git a/vendor/regex/HACKING.md b/vendor/regex/HACKING.md new file mode 100644 index 0000000000..34af5b517c --- /dev/null +++ b/vendor/regex/HACKING.md @@ -0,0 +1,341 @@ +Your friendly guide to hacking and navigating the regex library. + +This guide assumes familiarity with Rust and Cargo, and at least a perusal of +the user facing documentation for this crate. + +If you're looking for background on the implementation in this library, then +you can do no better than Russ Cox's article series on implementing regular +expressions using finite automata: https://swtch.com/~rsc/regexp/ + + +## Architecture overview + +As you probably already know, this library executes regular expressions using +finite automata. In particular, a design goal is to make searching linear +with respect to both the regular expression and the text being searched. +Meeting that design goal on its own is not so hard and can be done with an +implementation of the Pike VM (similar to Thompson's construction, but supports +capturing groups), as described in: https://swtch.com/~rsc/regexp/regexp2.html +--- This library contains such an implementation in src/pikevm.rs. + +Making it fast is harder. One of the key problems with the Pike VM is that it +can be in more than one state at any point in time, and must shuffle capture +positions between them. The Pike VM also spends a lot of time following the +same epsilon transitions over and over again. We can employ one trick to +speed up the Pike VM: extract one or more literal prefixes from the regular +expression and execute specialized code to quickly find matches of those +prefixes in the search text. The Pike VM can then be avoided for most the +search, and instead only executed when a prefix is found. The code to find +prefixes is in the regex-syntax crate (in this repository). The code to search +for literals is in src/literals.rs. When more than one literal prefix is found, +we fall back to an Aho-Corasick DFA using the aho-corasick crate. For one +literal, we use a variant of the Boyer-Moore algorithm. Both Aho-Corasick and +Boyer-Moore use `memchr` when appropriate. The Boyer-Moore variant in this +library also uses elementary frequency analysis to choose the right byte to run +`memchr` with. + +Of course, detecting prefix literals can only take us so far. Not all regular +expressions have literal prefixes. To remedy this, we try another approach +to executing the Pike VM: backtracking, whose implementation can be found in +src/backtrack.rs. One reason why backtracking can be faster is that it avoids +excessive shuffling of capture groups. Of course, backtracking is susceptible +to exponential runtimes, so we keep track of every state we've visited to make +sure we never visit it again. This guarantees linear time execution, but we +pay for it with the memory required to track visited states. Because of the +memory requirement, we only use this engine on small search strings *and* small +regular expressions. + +Lastly, the real workhorse of this library is the "lazy" DFA in src/dfa.rs. +It is distinct from the Pike VM in that the DFA is explicitly represented in +memory and is only ever in one state at a time. It is said to be "lazy" because +the DFA is computed as text is searched, where each byte in the search text +results in at most one new DFA state. It is made fast by caching states. DFAs +are susceptible to exponential state blow up (where the worst case is computing +a new state for every input byte, regardless of what's in the state cache). To +avoid using a lot of memory, the lazy DFA uses a bounded cache. Once the cache +is full, it is wiped and state computation starts over again. If the cache is +wiped too frequently, then the DFA gives up and searching falls back to one of +the aforementioned algorithms. + +All of the above matching engines expose precisely the same matching semantics. +This is indeed tested. (See the section below about testing.) + +The following sub-sections describe the rest of the library and how each of the +matching engines are actually used. + +### Parsing + +Regular expressions are parsed using the regex-syntax crate, which is +maintained in this repository. The regex-syntax crate defines an abstract +syntax and provides very detailed error messages when a parse error is +encountered. Parsing is done in a separate crate so that others may benefit +from its existence, and because it is relatively divorced from the rest of the +regex library. + +The regex-syntax crate also provides sophisticated support for extracting +prefix and suffix literals from regular expressions. + +### Compilation + +The compiler is in src/compile.rs. The input to the compiler is some abstract +syntax for a regular expression and the output is a sequence of opcodes that +matching engines use to execute a search. (One can think of matching engines as +mini virtual machines.) The sequence of opcodes is a particular encoding of a +non-deterministic finite automaton. In particular, the opcodes explicitly rely +on epsilon transitions. + +Consider a simple regular expression like `a|b`. Its compiled form looks like +this: + + 000 Save(0) + 001 Split(2, 3) + 002 'a' (goto: 4) + 003 'b' + 004 Save(1) + 005 Match + +The first column is the instruction pointer and the second column is the +instruction. Save instructions indicate that the current position in the input +should be stored in a captured location. Split instructions represent a binary +branch in the program (i.e., epsilon transitions). The instructions `'a'` and +`'b'` indicate that the literal bytes `'a'` or `'b'` should match. + +In older versions of this library, the compilation looked like this: + + 000 Save(0) + 001 Split(2, 3) + 002 'a' + 003 Jump(5) + 004 'b' + 005 Save(1) + 006 Match + +In particular, empty instructions that merely served to move execution from one +point in the program to another were removed. Instead, every instruction has a +`goto` pointer embedded into it. This resulted in a small performance boost for +the Pike VM, because it was one fewer epsilon transition that it had to follow. + +There exist more instructions and they are defined and documented in +src/prog.rs. + +Compilation has several knobs and a few unfortunately complicated invariants. +Namely, the output of compilation can be one of two types of programs: a +program that executes on Unicode scalar values or a program that executes +on raw bytes. In the former case, the matching engine is responsible for +performing UTF-8 decoding and executing instructions using Unicode codepoints. +In the latter case, the program handles UTF-8 decoding implicitly, so that the +matching engine can execute on raw bytes. All matching engines can execute +either Unicode or byte based programs except for the lazy DFA, which requires +byte based programs. In general, both representations were kept because (1) the +lazy DFA requires byte based programs so that states can be encoded in a memory +efficient manner and (2) the Pike VM benefits greatly from inlining Unicode +character classes into fewer instructions as it results in fewer epsilon +transitions. + +N.B. UTF-8 decoding is built into the compiled program by making use of the +utf8-ranges crate. The compiler in this library factors out common suffixes to +reduce the size of huge character classes (e.g., `\pL`). + +A regrettable consequence of this split in instruction sets is we generally +need to compile two programs; one for NFA execution and one for the lazy DFA. + +In fact, it is worse than that: the lazy DFA is not capable of finding the +starting location of a match in a single scan, and must instead execute a +backwards search after finding the end location. To execute a backwards search, +we must have compiled the regular expression *in reverse*. + +This means that every compilation of a regular expression generally results in +three distinct programs. It would be possible to lazily compile the Unicode +program, since it is never needed if (1) the regular expression uses no word +boundary assertions and (2) the caller never asks for sub-capture locations. + +### Execution + +At the time of writing, there are four matching engines in this library: + +1. The Pike VM (supports captures). +2. Bounded backtracking (supports captures). +3. Literal substring or multi-substring search. +4. Lazy DFA (no support for Unicode word boundary assertions). + +Only the first two matching engines are capable of executing every regular +expression program. They also happen to be the slowest, which means we need +some logic that (1) knows various facts about the regular expression and (2) +knows what the caller wants. Using this information, we can determine which +engine (or engines) to use. + +The logic for choosing which engine to execute is in src/exec.rs and is +documented on the Exec type. Exec values contain regular expression Programs +(defined in src/prog.rs), which contain all the necessary tidbits for actually +executing a regular expression on search text. + +For the most part, the execution logic is straight-forward and follows the +limitations of each engine described above pretty faithfully. The hairiest +part of src/exec.rs by far is the execution of the lazy DFA, since it requires +a forwards and backwards search, and then falls back to either the Pike VM or +backtracking if the caller requested capture locations. + +The Exec type also contains mutable scratch space for each type of matching +engine. This scratch space is used during search (for example, for the lazy +DFA, it contains compiled states that are reused on subsequent searches). + +### Programs + +A regular expression program is essentially a sequence of opcodes produced by +the compiler plus various facts about the regular expression (such as whether +it is anchored, its capture names, etc.). + +### The regex! macro + +The `regex!` macro no longer exists. It was developed in a bygone era as a +compiler plugin during the infancy of the regex crate. Back then, then only +matching engine in the crate was the Pike VM. The `regex!` macro was, itself, +also a Pike VM. The only advantages it offered over the dynamic Pike VM that +was built at runtime were the following: + + 1. Syntax checking was done at compile time. Your Rust program wouldn't + compile if your regex didn't compile. + 2. Reduction of overhead that was proportional to the size of the regex. + For the most part, this overhead consisted of heap allocation, which + was nearly eliminated in the compiler plugin. + +The main takeaway here is that the compiler plugin was a marginally faster +version of a slow regex engine. As the regex crate evolved, it grew other regex +engines (DFA, bounded backtracker) and sophisticated literal optimizations. +The regex macro didn't keep pace, and it therefore became (dramatically) slower +than the dynamic engines. The only reason left to use it was for the compile +time guarantee that your regex is correct. Fortunately, Clippy (the Rust lint +tool) has a lint that checks your regular expression validity, which mostly +replaces that use case. + +Additionally, the regex compiler plugin stopped receiving maintenance. Nobody +complained. At that point, it seemed prudent to just remove it. + +Will a compiler plugin be brought back? The future is murky, but there is +definitely an opportunity there to build something that is faster than the +dynamic engines in some cases. But it will be challenging! As of now, there +are no plans to work on this. + + +## Testing + +A key aspect of any mature regex library is its test suite. A subset of the +tests in this library come from Glenn Fowler's AT&T test suite (its online +presence seems gone at the time of writing). The source of the test suite is +located in src/testdata. The scripts/regex-match-tests.py takes the test suite +in src/testdata and generates tests/matches.rs. + +There are also many other manually crafted tests and regression tests in +tests/tests.rs. Some of these tests were taken from RE2. + +The biggest source of complexity in the tests is related to answering this +question: how can we reuse the tests to check all of our matching engines? One +approach would have been to encode every test into some kind of format (like +the AT&T test suite) and code generate tests for each matching engine. The +approach we use in this library is to create a Cargo.toml entry point for each +matching engine we want to test. The entry points are: + +* `tests/test_default.rs` - tests `Regex::new` +* `tests/test_default_bytes.rs` - tests `bytes::Regex::new` +* `tests/test_nfa.rs` - tests `Regex::new`, forced to use the NFA + algorithm on every regex. +* `tests/test_nfa_bytes.rs` - tests `Regex::new`, forced to use the NFA + algorithm on every regex and use *arbitrary* byte based programs. +* `tests/test_nfa_utf8bytes.rs` - tests `Regex::new`, forced to use the NFA + algorithm on every regex and use *UTF-8* byte based programs. +* `tests/test_backtrack.rs` - tests `Regex::new`, forced to use + backtracking on every regex. +* `tests/test_backtrack_bytes.rs` - tests `Regex::new`, forced to use + backtracking on every regex and use *arbitrary* byte based programs. +* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use + backtracking on every regex and use *UTF-8* byte based programs. +* `tests/test_crates_regex.rs` - tests to make sure that all of the + backends behave in the same way against a number of quickcheck + generated random inputs. These tests need to be enabled through + the `RUST_REGEX_RANDOM_TEST` environment variable (see + below). + +The lazy DFA and pure literal engines are absent from this list because +they cannot be used on every regular expression. Instead, we rely on +`tests/test_dynamic.rs` to test the lazy DFA and literal engines when possible. + +Since the tests are repeated several times, and because `cargo test` runs all +entry points, it can take a while to compile everything. To reduce compile +times slightly, try using `cargo test --test default`, which will only use the +`tests/test_default.rs` entry point. + +The random testing takes quite a while, so it is not enabled by default. +In order to run the random testing you can set the +`RUST_REGEX_RANDOM_TEST` environment variable to anything before +invoking `cargo test`. Note that this variable is inspected at compile +time, so if the tests don't seem to be running, you may need to run +`cargo clean`. + +## Benchmarking + +The benchmarking in this crate is made up of many micro-benchmarks. Currently, +there are two primary sets of benchmarks: the benchmarks that were adopted +at this library's inception (in `bench/src/misc.rs`) and a newer set of +benchmarks meant to test various optimizations. Specifically, the latter set +contain some analysis and are in `bench/src/sherlock.rs`. Also, the latter +set are all executed on the same lengthy input whereas the former benchmarks +are executed on strings of varying length. + +There is also a smattering of benchmarks for parsing and compilation. + +Benchmarks are in a separate crate so that its dependencies can be managed +separately from the main regex crate. + +Benchmarking follows a similarly wonky setup as tests. There are multiple entry +points: + +* `bench_rust.rs` - benchmarks `Regex::new` +* `bench_rust_bytes.rs` benchmarks `bytes::Regex::new` +* `bench_pcre.rs` - benchmarks PCRE +* `bench_onig.rs` - benchmarks Oniguruma + +The PCRE and Oniguruma benchmarks exist as a comparison point to a mature +regular expression library. In general, this regex library compares favorably +(there are even a few benchmarks that PCRE simply runs too slowly on or +outright can't execute at all). I would love to add other regular expression +library benchmarks (especially RE2). + +If you're hacking on one of the matching engines and just want to see +benchmarks, then all you need to run is: + + $ (cd bench && ./run rust) + +If you want to compare your results with older benchmarks, then try: + + $ (cd bench && ./run rust | tee old) + $ ... make it faster + $ (cd bench && ./run rust | tee new) + $ cargo benchcmp old new --improvements + +The `cargo-benchcmp` utility is available here: +https://github.com/BurntSushi/cargo-benchcmp + +The `./bench/run` utility can run benchmarks for PCRE and Oniguruma too. See +`./bench/bench --help`. + +## Dev Docs + +When digging your teeth into the codebase for the first time, the +crate documentation can be a great resource. By default `rustdoc` +will strip out all documentation of private crate members in an +effort to help consumers of the crate focus on the *interface* +without having to concern themselves with the *implementation*. +Normally this is a great thing, but if you want to start hacking +on regex internals it is not what you want. Many of the private members +of this crate are well documented with rustdoc style comments, and +it would be a shame to miss out on the opportunity that presents. +You can generate the private docs with: + +``` +$ rustdoc --crate-name docs src/lib.rs -o target/doc -L target/debug/deps --no-defaults --passes collapse-docs --passes unindent-comments +``` + +Then just point your browser at `target/doc/regex/index.html`. + +See https://github.com/rust-lang/rust/issues/15347 for more info +about generating developer docs for internal use. diff --git a/vendor/regex/LICENSE-APACHE b/vendor/regex/LICENSE-APACHE new file mode 100644 index 0000000000..16fe87b06e --- /dev/null +++ b/vendor/regex/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/regex/LICENSE-MIT b/vendor/regex/LICENSE-MIT new file mode 100644 index 0000000000..39d4bdb5ac --- /dev/null +++ b/vendor/regex/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2014 The Rust Project Developers + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/vendor/regex/PERFORMANCE.md b/vendor/regex/PERFORMANCE.md new file mode 100644 index 0000000000..8cd0d9c719 --- /dev/null +++ b/vendor/regex/PERFORMANCE.md @@ -0,0 +1,277 @@ +Your friendly guide to understanding the performance characteristics of this +crate. + +This guide assumes some familiarity with the public API of this crate, which +can be found here: https://docs.rs/regex + +## Theory vs. Practice + +One of the design goals of this crate is to provide worst case linear time +behavior with respect to the text searched using finite state automata. This +means that, *in theory*, the performance of this crate is much better than most +regex implementations, which typically use backtracking which has worst case +exponential time. + +For example, try opening a Python interpreter and typing this: + + >>> import re + >>> re.search('(a*)*c', 'a' * 30).span() + +I'll wait. + +At some point, you'll figure out that it won't terminate any time soon. ^C it. + +The promise of this crate is that *this pathological behavior can't happen*. + +With that said, just because we have protected ourselves against worst case +exponential behavior doesn't mean we are immune from large constant factors +or places where the current regex engine isn't quite optimal. This guide will +detail those cases and provide guidance on how to avoid them, among other +bits of general advice. + +## Thou Shalt Not Compile Regular Expressions In A Loop + +**Advice**: Use `lazy_static` to amortize the cost of `Regex` compilation. + +Don't do it unless you really don't mind paying for it. Compiling a regular +expression in this crate is quite expensive. It is conceivable that it may get +faster some day, but I wouldn't hold out hope for, say, an order of magnitude +improvement. In particular, compilation can take any where from a few dozen +microseconds to a few dozen milliseconds. Yes, milliseconds. Unicode character +classes, in particular, have the largest impact on compilation performance. At +the time of writing, for example, `\pL{100}` takes around 44ms to compile. This +is because `\pL` corresponds to every letter in Unicode and compilation must +turn it into a proper automaton that decodes a subset of UTF-8 which +corresponds to those letters. Compilation also spends some cycles shrinking the +size of the automaton. + +This means that in order to realize efficient regex matching, one must +*amortize the cost of compilation*. Trivially, if a call to `is_match` is +inside a loop, then make sure your call to `Regex::new` is *outside* that loop. + +In many programming languages, regular expressions can be conveniently defined +and compiled in a global scope, and code can reach out and use them as if +they were global static variables. In Rust, there is really no concept of +life-before-main, and therefore, one cannot utter this: + + static MY_REGEX: Regex = Regex::new("...").unwrap(); + +Unfortunately, this would seem to imply that one must pass `Regex` objects +around to everywhere they are used, which can be especially painful depending +on how your program is structured. Thankfully, the +[`lazy_static`](https://crates.io/crates/lazy_static) +crate provides an answer that works well: + + use lazy_static::lazy_static; + use regex::Regex; + + fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref MY_REGEX: Regex = Regex::new("...").unwrap(); + } + MY_REGEX.is_match(text) + } + +In other words, the `lazy_static!` macro enables us to define a `Regex` *as if* +it were a global static value. What is actually happening under the covers is +that the code inside the macro (i.e., `Regex::new(...)`) is run on *first use* +of `MY_REGEX` via a `Deref` impl. The implementation is admittedly magical, but +it's self contained and everything works exactly as you expect. In particular, +`MY_REGEX` can be used from multiple threads without wrapping it in an `Arc` or +a `Mutex`. On that note... + +## Using a regex from multiple threads + +**Advice**: The performance impact from using a `Regex` from multiple threads +is likely negligible. If necessary, clone the `Regex` so that each thread gets +its own copy. Cloning a regex does not incur any additional memory overhead +than what would be used by using a `Regex` from multiple threads +simultaneously. *Its only cost is ergonomics.* + +It is supported and encouraged to define your regexes using `lazy_static!` as +if they were global static values, and then use them to search text from +multiple threads simultaneously. + +One might imagine that this is possible because a `Regex` represents a +*compiled* program, so that any allocation or mutation is already done, and is +therefore read-only. Unfortunately, this is not true. Each type of search +strategy in this crate requires some kind of mutable scratch space to use +*during search*. For example, when executing a DFA, its states are computed +lazily and reused on subsequent searches. Those states go into that mutable +scratch space. + +The mutable scratch space is an implementation detail, and in general, its +mutation should not be observable from users of this crate. Therefore, it uses +interior mutability. This implies that `Regex` can either only be used from one +thread, or it must do some sort of synchronization. Either choice is +reasonable, but this crate chooses the latter, in particular because it is +ergonomic and makes use with `lazy_static!` straight forward. + +Synchronization implies *some* amount of overhead. When a `Regex` is used from +a single thread, this overhead is negligible. When a `Regex` is used from +multiple threads simultaneously, it is possible for the overhead of +synchronization from contention to impact performance. The specific cases where +contention may happen is if you are calling any of these methods repeatedly +from multiple threads simultaneously: + +* shortest_match +* is_match +* find +* captures + +In particular, every invocation of one of these methods must synchronize with +other threads to retrieve its mutable scratch space before searching can start. +If, however, you are using one of these methods: + +* find_iter +* captures_iter + +Then you may not suffer from contention since the cost of synchronization is +amortized on *construction of the iterator*. That is, the mutable scratch space +is obtained when the iterator is created and retained throughout its lifetime. + +## Only ask for what you need + +**Advice**: Prefer in this order: `is_match`, `find`, `captures`. + +There are three primary search methods on a `Regex`: + +* is_match +* find +* captures + +In general, these are ordered from fastest to slowest. + +`is_match` is fastest because it doesn't actually need to find the start or the +end of the leftmost-first match. It can quit immediately after it knows there +is a match. For example, given the regex `a+` and the haystack, `aaaaa`, the +search will quit after examining the first byte. + +In contrast, `find` must return both the start and end location of the +leftmost-first match. It can use the DFA matcher for this, but must run it +forwards once to find the end of the match *and then run it backwards* to find +the start of the match. The two scans and the cost of finding the real end of +the leftmost-first match make this more expensive than `is_match`. + +`captures` is the most expensive of them all because it must do what `find` +does, and then run either the bounded backtracker or the Pike VM to fill in the +capture group locations. Both of these are simulations of an NFA, which must +spend a lot of time shuffling states around. The DFA limits the performance hit +somewhat by restricting the amount of text that must be searched via an NFA +simulation. + +One other method not mentioned is `shortest_match`. This method has precisely +the same performance characteristics as `is_match`, except it will return the +end location of when it discovered a match. For example, given the regex `a+` +and the haystack `aaaaa`, `shortest_match` may return `1` as opposed to `5`, +the latter of which being the correct end location of the leftmost-first match. + +## Literals in your regex may make it faster + +**Advice**: Literals can reduce the work that the regex engine needs to do. Use +them if you can, especially as prefixes. + +In particular, if your regex starts with a prefix literal, the prefix is +quickly searched before entering the (much slower) regex engine. For example, +given the regex `foo\w+`, the literal `foo` will be searched for using +Boyer-Moore. If there's no match, then no regex engine is ever used. Only when +there's a match is the regex engine invoked at the location of the match, which +effectively permits the regex engine to skip large portions of a haystack. +If a regex is comprised entirely of literals (possibly more than one), then +it's possible that the regex engine can be avoided entirely even when there's a +match. + +When one literal is found, Boyer-Moore is used. When multiple literals are +found, then an optimized version of Aho-Corasick is used. + +This optimization is in particular extended quite a bit in this crate. Here are +a few examples of regexes that get literal prefixes detected: + +* `(foo|bar)` detects `foo` and `bar` +* `(a|b)c` detects `ac` and `bc` +* `[ab]foo[yz]` detects `afooy`, `afooz`, `bfooy` and `bfooz` +* `a?b` detects `a` and `b` +* `a*b` detects `a` and `b` +* `(ab){3,6}` detects `ababab` + +Literals in anchored regexes can also be used for detecting non-matches very +quickly. For example, `^foo\w+` and `\w+foo$` may be able to detect a non-match +just by examining the first (or last) three bytes of the haystack. + +## Unicode word boundaries may prevent the DFA from being used + +**Advice**: In most cases, `\b` should work well. If not, use `(?-u:\b)` +instead of `\b` if you care about consistent performance more than correctness. + +It's a sad state of the current implementation. At the moment, the DFA will try +to interpret Unicode word boundaries as if they were ASCII word boundaries. +If the DFA comes across any non-ASCII byte, it will quit and fall back to an +alternative matching engine that can handle Unicode word boundaries correctly. +The alternate matching engine is generally quite a bit slower (perhaps by an +order of magnitude). If necessary, this can be ameliorated in two ways. + +The first way is to add some number of literal prefixes to your regular +expression. Even though the DFA may not be used, specialized routines will +still kick in to find prefix literals quickly, which limits how much work the +NFA simulation will need to do. + +The second way is to give up on Unicode and use an ASCII word boundary instead. +One can use an ASCII word boundary by disabling Unicode support. That is, +instead of using `\b`, use `(?-u:\b)`. Namely, given the regex `\b.+\b`, it +can be transformed into a regex that uses the DFA with `(?-u:\b).+(?-u:\b)`. It +is important to limit the scope of disabling the `u` flag, since it might lead +to a syntax error if the regex could match arbitrary bytes. For example, if one +wrote `(?-u)\b.+\b`, then a syntax error would be returned because `.` matches +any *byte* when the Unicode flag is disabled. + +The second way isn't appreciably different than just using a Unicode word +boundary in the first place, since the DFA will speculatively interpret it as +an ASCII word boundary anyway. The key difference is that if an ASCII word +boundary is used explicitly, then the DFA won't quit in the presence of +non-ASCII UTF-8 bytes. This results in giving up correctness in exchange for +more consistent performance. + +N.B. When using `bytes::Regex`, Unicode support is disabled by default, so one +can simply write `\b` to get an ASCII word boundary. + +## Excessive counting can lead to exponential state blow up in the DFA + +**Advice**: Don't write regexes that cause DFA state blow up if you care about +match performance. + +Wait, didn't I say that this crate guards against exponential worst cases? +Well, it turns out that the process of converting an NFA to a DFA can lead to +an exponential blow up in the number of states. This crate specifically guards +against exponential blow up by doing two things: + +1. The DFA is computed lazily. That is, a state in the DFA only exists in + memory if it is visited. In particular, the lazy DFA guarantees that *at + most* one state is created for every byte of input. This, on its own, + guarantees linear time complexity. +2. Of course, creating a new state for *every* byte of input means that search + will go incredibly slow because of very large constant factors. On top of + that, creating a state for every byte in a large haystack could result in + exorbitant memory usage. To ameliorate this, the DFA bounds the number of + states it can store. Once it reaches its limit, it flushes its cache. This + prevents reuse of states that it already computed. If the cache is flushed + too frequently, then the DFA will give up and execution will fall back to + one of the NFA simulations. + +In effect, this crate will detect exponential state blow up and fall back to +a search routine with fixed memory requirements. This does, however, mean that +searching will be much slower than one might expect. Regexes that rely on +counting in particular are strong aggravators of this behavior. For example, +matching `[01]*1[01]{20}$` against a random sequence of `0`s and `1`s. + +In the future, it may be possible to increase the bound that the DFA uses, +which would allow the caller to choose how much memory they're willing to +spend. + +## Resist the temptation to "optimize" regexes + +**Advice**: This ain't a backtracking engine. + +An entire book was written on how to optimize Perl-style regular expressions. +Most of those techniques are not applicable for this library. For example, +there is no problem with using non-greedy matching or having lots of +alternations in your regex. diff --git a/vendor/regex/README.md b/vendor/regex/README.md new file mode 100644 index 0000000000..861417da65 --- /dev/null +++ b/vendor/regex/README.md @@ -0,0 +1,246 @@ +regex +===== +A Rust library for parsing, compiling, and executing regular expressions. Its +syntax is similar to Perl-style regular expressions, but lacks a few features +like look around and backreferences. In exchange, all searches execute in +linear time with respect to the size of the regular expression and search text. +Much of the syntax and implementation is inspired +by [RE2](https://github.com/google/re2). + +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) +[![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) +[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) + +### Documentation + +[Module documentation with examples](https://docs.rs/regex). +The module documentation also includes a comprehensive description of the +syntax supported. + +Documentation with examples for the various matching functions and iterators +can be found on the +[`Regex` type](https://docs.rs/regex/*/regex/struct.Regex.html). + +### Usage + +To bring this crate into your repository, either add `regex` to your +`Cargo.toml`, or run `cargo add regex`. + +Here's a simple example that matches a date in YYYY-MM-DD format and prints the +year, month and day: + +```rust +use regex::Regex; + +fn main() { + let re = Regex::new(r"(?x) +(?P\d{4}) # the year +- +(?P\d{2}) # the month +- +(?P\d{2}) # the day +").unwrap(); + let caps = re.captures("2010-03-14").unwrap(); + + assert_eq!("2010", &caps["year"]); + assert_eq!("03", &caps["month"]); + assert_eq!("14", &caps["day"]); +} +``` + +If you have lots of dates in text that you'd like to iterate over, then it's +easy to adapt the above example with an iterator: + +```rust +use regex::Regex; + +const TO_SEARCH: &'static str = " +On 2010-03-14, foo happened. On 2014-10-14, bar happened. +"; + +fn main() { + let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); + + for caps in re.captures_iter(TO_SEARCH) { + // Note that all of the unwraps are actually OK for this regex + // because the only way for the regex to match is if all of the + // capture groups match. This is not true in general though! + println!("year: {}, month: {}, day: {}", + caps.get(1).unwrap().as_str(), + caps.get(2).unwrap().as_str(), + caps.get(3).unwrap().as_str()); + } +} +``` + +This example outputs: + +```text +year: 2010, month: 03, day: 14 +year: 2014, month: 10, day: 14 +``` + +### Usage: Avoid compiling the same regex in a loop + +It is an anti-pattern to compile the same regular expression in a loop since +compilation is typically expensive. (It takes anywhere from a few microseconds +to a few **milliseconds** depending on the size of the regex.) Not only is +compilation itself expensive, but this also prevents optimizations that reuse +allocations internally to the matching engines. + +In Rust, it can sometimes be a pain to pass regular expressions around if +they're used from inside a helper function. Instead, we recommend using the +[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that +regular expressions are compiled exactly once. + +For example: + +```rust,ignore +use regex::Regex; + +fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref RE: Regex = Regex::new("...").unwrap(); + } + RE.is_match(text) +} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previous compilation. + +### Usage: match regular expressions on `&[u8]` + +The main API of this crate (`regex::Regex`) requires the caller to pass a +`&str` for searching. In Rust, an `&str` is required to be valid UTF-8, which +means the main API can't be used for searching arbitrary bytes. + +To match on arbitrary bytes, use the `regex::bytes::Regex` API. The API +is identical to the main API, except that it takes an `&[u8]` to search +on instead of an `&str`. By default, `.` will match any *byte* using +`regex::bytes::Regex`, while `.` will match any *UTF-8 encoded Unicode scalar +value* using the main API. + +This example shows how to find all null-terminated strings in a slice of bytes: + +```rust +use regex::bytes::Regex; + +let re = Regex::new(r"(?P[^\x00]+)\x00").unwrap(); +let text = b"foo\x00bar\x00baz\x00"; + +// Extract all of the strings without the null terminator from each match. +// The unwrap is OK here since a match requires the `cstr` capture to match. +let cstrs: Vec<&[u8]> = + re.captures_iter(text) + .map(|c| c.name("cstr").unwrap().as_bytes()) + .collect(); +assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +``` + +Notice here that the `[^\x00]+` will match any *byte* except for `NUL`. When +using the main API, `[^\x00]+` would instead match any valid UTF-8 sequence +except for `NUL`. + +### Usage: match multiple regular expressions simultaneously + +This demonstrates how to use a `RegexSet` to match multiple (possibly +overlapping) regular expressions in a single scan of the search text: + +```rust +use regex::RegexSet; + +let set = RegexSet::new(&[ + r"\w+", + r"\d+", + r"\pL+", + r"foo", + r"bar", + r"barfoo", + r"foobar", +]).unwrap(); + +// Iterate over and collect all of the matches. +let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +assert_eq!(matches, vec![0, 2, 3, 4, 6]); + +// You can also test whether a particular regex matched: +let matches = set.matches("foobar"); +assert!(!matches.matched(5)); +assert!(matches.matched(6)); +``` + +### Usage: enable SIMD optimizations + +SIMD optimizations are enabled automatically on Rust stable 1.27 and newer. +For nightly versions of Rust, this requires a recent version with the SIMD +features stabilized. + + +### Usage: a regular expression parser + +This repository contains a crate that provides a well tested regular expression +parser, abstract syntax and a high-level intermediate representation for +convenient analysis. It provides no facilities for compilation or execution. +This may be useful if you're implementing your own regex engine or otherwise +need to do analysis on the syntax of a regular expression. It is otherwise not +recommended for general use. + +[Documentation `regex-syntax`.](https://docs.rs/regex-syntax) + + +### Crate features + +This crate comes with several features that permit tweaking the trade off +between binary size, compilation time and runtime performance. Users of this +crate can selectively disable Unicode tables, or choose from a variety of +optimizations performed by this crate to disable. + +When all of these features are disabled, runtime match performance may be much +worse, but if you're matching on short strings, or if high performance isn't +necessary, then such a configuration is perfectly serviceable. To disable +all such features, use the following `Cargo.toml` dependency configuration: + +```toml +[dependencies.regex] +version = "1.3" +default-features = false +# regex currently requires the standard library, you must re-enable it. +features = ["std"] +``` + +This will reduce the dependency tree of `regex` down to a single crate +(`regex-syntax`). + +The full set of features one can disable are +[in the "Crate features" section of the documentation](https://docs.rs/regex/*/#crate-features). + + +### Minimum Rust version policy + +This crate's minimum supported `rustc` version is `1.41.1`. + +The current **tentative** policy is that the minimum Rust version required +to use this crate can be increased in minor version updates. For example, if +regex 1.0 requires Rust 1.20.0, then regex 1.0.z for all values of `z` will +also require Rust 1.20.0 or newer. However, regex 1.y for `y > 0` may require a +newer minimum version of Rust. + +In general, this crate will be conservative with respect to the minimum +supported version of Rust. + + +### License + +This project is licensed under either of + + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + https://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or + https://opensource.org/licenses/MIT) + +at your option. + +The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode +License Agreement +([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)). diff --git a/vendor/regex/UNICODE.md b/vendor/regex/UNICODE.md new file mode 100644 index 0000000000..df7d21ed97 --- /dev/null +++ b/vendor/regex/UNICODE.md @@ -0,0 +1,259 @@ +# Unicode conformance + +This document describes the regex crate's conformance to Unicode's +[UTS#18](https://unicode.org/reports/tr18/) +report, which lays out 3 levels of support: Basic, Extended and Tailored. + +Full support for Level 1 ("Basic Unicode Support") is provided with two +exceptions: + +1. Line boundaries are not Unicode aware. Namely, only the `\n` + (`END OF LINE`) character is recognized as a line boundary. +2. The compatibility properties specified by + [RL1.2a](https://unicode.org/reports/tr18/#RL1.2a) + are ASCII-only definitions. + +Little to no support is provided for either Level 2 or Level 3. For the most +part, this is because the features are either complex/hard to implement, or at +the very least, very difficult to implement without sacrificing performance. +For example, tackling canonical equivalence such that matching worked as one +would expect regardless of normalization form would be a significant +undertaking. This is at least partially a result of the fact that this regex +engine is based on finite automata, which admits less flexibility normally +associated with backtracking implementations. + + +## RL1.1 Hex Notation + +[UTS#18 RL1.1](https://unicode.org/reports/tr18/#Hex_notation) + +Hex Notation refers to the ability to specify a Unicode code point in a regular +expression via its hexadecimal code point representation. This is useful in +environments that have poor Unicode font rendering or if you need to express a +code point that is not normally displayable. All forms of hexadecimal notation +are supported + + \x7F hex character code (exactly two digits) + \x{10FFFF} any hex character code corresponding to a Unicode code point + \u007F hex character code (exactly four digits) + \u{7F} any hex character code corresponding to a Unicode code point + \U0000007F hex character code (exactly eight digits) + \U{7F} any hex character code corresponding to a Unicode code point + +Briefly, the `\x{...}`, `\u{...}` and `\U{...}` are all exactly equivalent ways +of expressing hexadecimal code points. Any number of digits can be written +within the brackets. In contrast, `\xNN`, `\uNNNN`, `\UNNNNNNNN` are all +fixed-width variants of the same idea. + +Note that when Unicode mode is disabled, any non-ASCII Unicode codepoint is +banned. Additionally, the `\xNN` syntax represents arbitrary bytes when Unicode +mode is disabled. That is, the regex `\xFF` matches the Unicode codepoint +U+00FF (encoded as `\xC3\xBF` in UTF-8) while the regex `(?-u)\xFF` matches +the literal byte `\xFF`. + + +## RL1.2 Properties + +[UTS#18 RL1.2](https://unicode.org/reports/tr18/#Categories) + +Full support for Unicode property syntax is provided. Unicode properties +provide a convenient way to construct character classes of groups of code +points specified by Unicode. The regex crate does not provide exhaustive +support, but covers a useful subset. In particular: + +* [General categories](https://unicode.org/reports/tr18/#General_Category_Property) +* [Scripts and Script Extensions](https://unicode.org/reports/tr18/#Script_Property) +* [Age](https://unicode.org/reports/tr18/#Age) +* A smattering of boolean properties, including all of those specified by + [RL1.2](https://unicode.org/reports/tr18/#RL1.2) explicitly. + +In all cases, property name and value abbreviations are supported, and all +names/values are matched loosely without regard for case, whitespace or +underscores. Property name aliases can be found in Unicode's +[`PropertyAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt) +file, while property value aliases can be found in Unicode's +[`PropertyValueAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt) +file. + +The syntax supported is also consistent with the UTS#18 recommendation: + +* `\p{Greek}` selects the `Greek` script. Equivalent expressions follow: + `\p{sc:Greek}`, `\p{Script:Greek}`, `\p{Sc=Greek}`, `\p{script=Greek}`, + `\P{sc!=Greek}`. Similarly for `General_Category` (or `gc` for short) and + `Script_Extensions` (or `scx` for short). +* `\p{age:3.2}` selects all code points in Unicode 3.2. +* `\p{Alphabetic}` selects the "alphabetic" property and can be abbreviated + via `\p{alpha}` (for example). +* Single letter variants for properties with single letter abbreviations. + For example, `\p{Letter}` can be equivalently written as `\pL`. + +The following is a list of all properties supported by the regex crate (starred +properties correspond to properties required by RL1.2): + +* `General_Category` \* (including `Any`, `ASCII` and `Assigned`) +* `Script` \* +* `Script_Extensions` \* +* `Age` +* `ASCII_Hex_Digit` +* `Alphabetic` \* +* `Bidi_Control` +* `Case_Ignorable` +* `Cased` +* `Changes_When_Casefolded` +* `Changes_When_Casemapped` +* `Changes_When_Lowercased` +* `Changes_When_Titlecased` +* `Changes_When_Uppercased` +* `Dash` +* `Default_Ignorable_Code_Point` \* +* `Deprecated` +* `Diacritic` +* `Emoji` +* `Emoji_Presentation` +* `Emoji_Modifier` +* `Emoji_Modifier_Base` +* `Emoji_Component` +* `Extended_Pictographic` +* `Extender` +* `Grapheme_Base` +* `Grapheme_Cluster_Break` +* `Grapheme_Extend` +* `Hex_Digit` +* `IDS_Binary_Operator` +* `IDS_Trinary_Operator` +* `ID_Continue` +* `ID_Start` +* `Join_Control` +* `Logical_Order_Exception` +* `Lowercase` \* +* `Math` +* `Noncharacter_Code_Point` \* +* `Pattern_Syntax` +* `Pattern_White_Space` +* `Prepended_Concatenation_Mark` +* `Quotation_Mark` +* `Radical` +* `Regional_Indicator` +* `Sentence_Break` +* `Sentence_Terminal` +* `Soft_Dotted` +* `Terminal_Punctuation` +* `Unified_Ideograph` +* `Uppercase` \* +* `Variation_Selector` +* `White_Space` \* +* `Word_Break` +* `XID_Continue` +* `XID_Start` + + +## RL1.2a Compatibility Properties + +[UTS#18 RL1.2a](https://unicode.org/reports/tr18/#RL1.2a) + +The regex crate only provides ASCII definitions of the +[compatibility properties documented in UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties) +(sans the `\X` class, for matching grapheme clusters, which isn't provided +at all). This is because it seems to be consistent with most other regular +expression engines, and in particular, because these are often referred to as +"ASCII" or "POSIX" character classes. + +Note that the `\w`, `\s` and `\d` character classes **are** Unicode aware. +Their traditional ASCII definition can be used by disabling Unicode. That is, +`[[:word:]]` and `(?-u)\w` are equivalent. + + +## RL1.3 Subtraction and Intersection + +[UTS#18 RL1.3](https://unicode.org/reports/tr18/#Subtraction_and_Intersection) + +The regex crate provides full support for nested character classes, along with +union, intersection (`&&`), difference (`--`) and symmetric difference (`~~`) +operations on arbitrary character classes. + +For example, to match all non-ASCII letters, you could use either +`[\p{Letter}--\p{Ascii}]` (difference) or `[\p{Letter}&&[^\p{Ascii}]]` +(intersecting the negation). + + +## RL1.4 Simple Word Boundaries + +[UTS#18 RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries) + +The regex crate provides basic Unicode aware word boundary assertions. A word +boundary assertion can be written as `\b`, or `\B` as its negation. A word +boundary negation corresponds to a zero-width match, where its adjacent +characters correspond to word and non-word, or non-word and word characters. + +Conformance in this case chooses to define word character in the same way that +the `\w` character class is defined: a code point that is a member of one of +the following classes: + +* `\p{Alphabetic}` +* `\p{Join_Control}` +* `\p{gc:Mark}` +* `\p{gc:Decimal_Number}` +* `\p{gc:Connector_Punctuation}` + +In particular, this differs slightly from the +[prescription given in RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries) +but is permissible according to +[UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). +Namely, it is convenient and simpler to have `\w` and `\b` be in sync with +one another. + +Finally, Unicode word boundaries can be disabled, which will cause ASCII word +boundaries to be used instead. That is, `\b` is a Unicode word boundary while +`(?-u)\b` is an ASCII-only word boundary. This can occasionally be beneficial +if performance is important, since the implementation of Unicode word +boundaries is currently sub-optimal on non-ASCII text. + + +## RL1.5 Simple Loose Matches + +[UTS#18 RL1.5](https://unicode.org/reports/tr18/#Simple_Loose_Matches) + +The regex crate provides full support for case insensitive matching in +accordance with RL1.5. That is, it uses the "simple" case folding mapping. The +"simple" mapping was chosen because of a key convenient property: every +"simple" mapping is a mapping from exactly one code point to exactly one other +code point. This makes case insensitive matching of character classes, for +example, straight-forward to implement. + +When case insensitive mode is enabled (e.g., `(?i)[a]` is equivalent to `a|A`), +then all characters classes are case folded as well. + + +## RL1.6 Line Boundaries + +[UTS#18 RL1.6](https://unicode.org/reports/tr18/#Line_Boundaries) + +The regex crate only provides support for recognizing the `\n` (`END OF LINE`) +character as a line boundary. This choice was made mostly for implementation +convenience, and to avoid performance cliffs that Unicode word boundaries are +subject to. + +Ideally, it would be nice to at least support `\r\n` as a line boundary as +well, and in theory, this could be done efficiently. + + +## RL1.7 Code Points + +[UTS#18 RL1.7](https://unicode.org/reports/tr18/#Supplementary_Characters) + +The regex crate provides full support for Unicode code point matching. Namely, +the fundamental atom of any match is always a single code point. + +Given Rust's strong ties to UTF-8, the following guarantees are also provided: + +* All matches are reported on valid UTF-8 code unit boundaries. That is, any + match range returned by the public regex API is guaranteed to successfully + slice the string that was searched. +* By consequence of the above, it is impossible to match surrogode code points. + No support for UTF-16 is provided, so this is never necessary. + +Note that when Unicode mode is disabled, the fundamental atom of matching is +no longer a code point but a single byte. When Unicode mode is disabled, many +Unicode features are disabled as well. For example, `(?-u)\pL` is not a valid +regex but `\pL(?-u)\xFF` (matches any Unicode `Letter` followed by the literal +byte `\xFF`) is, for example. diff --git a/vendor/regex/examples/regexdna-input.txt b/vendor/regex/examples/regexdna-input.txt new file mode 100644 index 0000000000..fb23263397 --- /dev/null +++ b/vendor/regex/examples/regexdna-input.txt @@ -0,0 +1,1671 @@ +>ONE Homo sapiens alu +GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA +TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT +AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG +GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG +CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA +GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA +TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG +AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA +GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT +AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC +AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG +GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC +CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG +AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT +TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA +TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT +GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG +TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT +CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG +CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG +TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA +CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG +AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG +GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC +TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA +TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA +GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT +GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC +ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT +TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC +CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG +CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG +GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC +CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT +GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC +GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA +GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA +GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA +GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG +AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT +CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA +GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA +AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC +GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT +ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG +GAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATC +GCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGC +GGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGG +TCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAA +AAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAG +GAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACT +CCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCC +TGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAG +ACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGC +GTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGA +ACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGA +CAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCA +CTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCA +ACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCG +CCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGG +AGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTC +CGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCG +AGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACC +CCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAG +CTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAG +CCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGG +CCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATC +ACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAA +AAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGC +TGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCC +ACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGG +CTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGG +AGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATT +AGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAA +TCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGC +CTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAA +TCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAG +CCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGT +GGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCG +GGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAG +CGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTG +GGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATG +GTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGT +AATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTT +GCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCT +CAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCG +GGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTC +TCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACT +CGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAG +ATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGG +CGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTG +AGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATA +CAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGG +CAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGC +ACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCAC +GCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTC +GAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCG +GGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCT +TGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGG +CGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCA +GCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGG +CCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGC +GCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGG +CGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGA +CTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGG +CCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAA +ACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCC +CAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGT +GAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAA +AGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGG +ATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTAC +TAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGA +GGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGC +GCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGG +TGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTC +AGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAA +ATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGA +GAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC +AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTG +TAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGAC +CAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGT +GGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAAC +CCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACA +GAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACT +TTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAAC +ATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCC +TGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAG +GTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCG +TCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAG +GCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCC +GTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCT +ACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCC +GAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCC +GGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCAC +CTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAA +ATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTG +AGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCAC +TGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCT +CACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAG +TTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAG +CCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATC +GCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCT +GGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATC +CCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCC +TGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGG +CGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG +AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCG +AGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGG +AGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGT +GAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAA +TCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGC +AGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCA +AAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGG +CGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTC +TACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCG +GGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGAT +CGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCG +CGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAG +GTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACA +AAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCA +GGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCAC +TCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGC +CTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGA +GACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGG +CGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTG +AACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCG +ACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGC +ACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCC +AACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGC +GCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCG +GAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACT +CCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCC +GAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAAC +CCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA +GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGA +GCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAG +GCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGAT +CACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTA +AAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGG +CTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGC +CACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTG +GCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAG +GAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAT +TAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGA +ATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAG +CCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTA +ATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCA +GCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGG +TGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCC +GGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGA +GCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTT +GGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACAT +GGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTG +TAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGT +TGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTC +TCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGC +GGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGT +CTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTAC +TCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGA +GATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGG +GCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCT +GAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT +ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAG +GCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTG +CACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCA +CGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTT +CGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCC +GGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGC +TTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGG +GCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCC +AGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTG +GCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCG +CGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAG +GCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAG +ACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAG +GCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGA +AACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATC +CCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAG +TGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAA +AAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCG +GATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTA +CTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGG +AGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCG +CGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCG +GTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGT +CAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAA +AATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGG +AGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTC +CAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCT +GTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA +CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCG +TGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAA +CCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGAC +AGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCAC +TTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAA +CATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGC +CTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGA +GGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCC +GTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGA +GGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCC +CGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGC +TACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGC +CGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGC +CGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCA +CCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAA +AATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCT +GAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCA +CTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGC +TCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGA +GTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTA +GCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAAT +CGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCC +TGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAAT +CCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGC +CTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTG +GCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGG +GAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGC +GAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG +GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGG +TGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTA +ATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTG +CAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTC +AAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGG +GCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCT +CTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTC +GGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGA +TCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGC +GCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGA +GGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATAC +AAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGC +AGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCA +CTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACG +CCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCG +AGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGG +GCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTT +GAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGC +GACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAG +CACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGC +CAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCG +CGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGC +GGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGAC +TCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGC +CGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAA +CCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCC +AGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTG +AGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA +GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA +TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT +AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG +GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG +CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA +GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA +TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG +AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA +GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT +AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC +AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG +GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC +CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG +AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT +TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA +TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT +GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG +TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT +CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG +CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG +TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA +CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG +AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG +GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC +TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA +TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA +GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT +GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC +ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT +TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC +CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG +CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG +GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC +CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT +GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC +GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA +GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA +GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA +GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG +AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT +CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA +GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA +AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC +GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT +ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG +GAGGCTGAGGCAGGAGAATC +>TWO IUB ambiguity codes +cttBtatcatatgctaKggNcataaaSatgtaaaDcDRtBggDtctttataattcBgtcg +tactDtDagcctatttSVHtHttKtgtHMaSattgWaHKHttttagacatWatgtRgaaa +NtactMcSMtYtcMgRtacttctWBacgaaatatagScDtttgaagacacatagtVgYgt +cattHWtMMWcStgttaggKtSgaYaaccWStcgBttgcgaMttBYatcWtgacaYcaga +gtaBDtRacttttcWatMttDBcatWtatcttactaBgaYtcttgttttttttYaaScYa +HgtgttNtSatcMtcVaaaStccRcctDaataataStcYtRDSaMtDttgttSagtRRca +tttHatSttMtWgtcgtatSSagactYaaattcaMtWatttaSgYttaRgKaRtccactt +tattRggaMcDaWaWagttttgacatgttctacaaaRaatataataaMttcgDacgaSSt +acaStYRctVaNMtMgtaggcKatcttttattaaaaagVWaHKYagtttttatttaacct +tacgtVtcVaattVMBcttaMtttaStgacttagattWWacVtgWYagWVRctDattBYt +gtttaagaagattattgacVatMaacattVctgtBSgaVtgWWggaKHaatKWcBScSWa +accRVacacaaactaccScattRatatKVtactatatttHttaagtttSKtRtacaaagt +RDttcaaaaWgcacatWaDgtDKacgaacaattacaRNWaatHtttStgttattaaMtgt +tgDcgtMgcatBtgcttcgcgaDWgagctgcgaggggVtaaScNatttacttaatgacag +cccccacatYScaMgtaggtYaNgttctgaMaacNaMRaacaaacaKctacatagYWctg +ttWaaataaaataRattagHacacaagcgKatacBttRttaagtatttccgatctHSaat +actcNttMaagtattMtgRtgaMgcataatHcMtaBSaRattagttgatHtMttaaKagg +YtaaBataSaVatactWtataVWgKgttaaaacagtgcgRatatacatVtHRtVYataSa +KtWaStVcNKHKttactatccctcatgWHatWaRcttactaggatctataDtDHBttata +aaaHgtacVtagaYttYaKcctattcttcttaataNDaaggaaaDYgcggctaaWSctBa +aNtgctggMBaKctaMVKagBaactaWaDaMaccYVtNtaHtVWtKgRtcaaNtYaNacg +gtttNattgVtttctgtBaWgtaattcaagtcaVWtactNggattctttaYtaaagccgc +tcttagHVggaYtgtNcDaVagctctctKgacgtatagYcctRYHDtgBattDaaDgccK +tcHaaStttMcctagtattgcRgWBaVatHaaaataYtgtttagMDMRtaataaggatMt +ttctWgtNtgtgaaaaMaatatRtttMtDgHHtgtcattttcWattRSHcVagaagtacg +ggtaKVattKYagactNaatgtttgKMMgYNtcccgSKttctaStatatNVataYHgtNa +BKRgNacaactgatttcctttaNcgatttctctataScaHtataRagtcRVttacDSDtt +aRtSatacHgtSKacYagttMHtWataggatgactNtatSaNctataVtttRNKtgRacc +tttYtatgttactttttcctttaaacatacaHactMacacggtWataMtBVacRaSaatc +cgtaBVttccagccBcttaRKtgtgcctttttRtgtcagcRttKtaaacKtaaatctcac +aattgcaNtSBaaccgggttattaaBcKatDagttactcttcattVtttHaaggctKKga +tacatcBggScagtVcacattttgaHaDSgHatRMaHWggtatatRgccDttcgtatcga +aacaHtaagttaRatgaVacttagattVKtaaYttaaatcaNatccRttRRaMScNaaaD +gttVHWgtcHaaHgacVaWtgttScactaagSgttatcttagggDtaccagWattWtRtg +ttHWHacgattBtgVcaYatcggttgagKcWtKKcaVtgaYgWctgYggVctgtHgaNcV +taBtWaaYatcDRaaRtSctgaHaYRttagatMatgcatttNattaDttaattgttctaa +ccctcccctagaWBtttHtBccttagaVaatMcBHagaVcWcagBVttcBtaYMccagat +gaaaaHctctaacgttagNWRtcggattNatcRaNHttcagtKttttgWatWttcSaNgg +gaWtactKKMaacatKatacNattgctWtatctaVgagctatgtRaHtYcWcttagccaa +tYttWttaWSSttaHcaaaaagVacVgtaVaRMgattaVcDactttcHHggHRtgNcctt +tYatcatKgctcctctatVcaaaaKaaaagtatatctgMtWtaaaacaStttMtcgactt +taSatcgDataaactaaacaagtaaVctaggaSccaatMVtaaSKNVattttgHccatca +cBVctgcaVatVttRtactgtVcaattHgtaaattaaattttYtatattaaRSgYtgBag +aHSBDgtagcacRHtYcBgtcacttacactaYcgctWtattgSHtSatcataaatataHt +cgtYaaMNgBaatttaRgaMaatatttBtttaaaHHKaatctgatWatYaacttMctctt +ttVctagctDaaagtaVaKaKRtaacBgtatccaaccactHHaagaagaaggaNaaatBW +attccgStaMSaMatBttgcatgRSacgttVVtaaDMtcSgVatWcaSatcttttVatag +ttactttacgatcaccNtaDVgSRcgVcgtgaacgaNtaNatatagtHtMgtHcMtagaa +attBgtataRaaaacaYKgtRccYtatgaagtaataKgtaaMttgaaRVatgcagaKStc +tHNaaatctBBtcttaYaBWHgtVtgacagcaRcataWctcaBcYacYgatDgtDHccta +aagacYRcaggattHaYgtKtaatgcVcaataMYacccatatcacgWDBtgaatcBaata +cKcttRaRtgatgaBDacggtaattaaYtataStgVHDtDctgactcaaatKtacaatgc +gYatBtRaDatHaactgtttatatDttttaaaKVccYcaaccNcBcgHaaVcattHctcg +attaaatBtatgcaaaaatYMctSactHatacgaWacattacMBgHttcgaatVaaaaca +BatatVtctgaaaaWtctRacgBMaatSgRgtgtcgactatcRtattaScctaStagKga +DcWgtYtDDWKRgRtHatRtggtcgaHgggcgtattaMgtcagccaBggWVcWctVaaat +tcgNaatcKWagcNaHtgaaaSaaagctcYctttRVtaaaatNtataaccKtaRgtttaM +tgtKaBtRtNaggaSattHatatWactcagtgtactaKctatttgRYYatKatgtccgtR +tttttatttaatatVgKtttgtatgtNtataRatWYNgtRtHggtaaKaYtKSDcatcKg +taaYatcSRctaVtSMWtVtRWHatttagataDtVggacagVcgKWagBgatBtaaagNc +aRtagcataBggactaacacRctKgttaatcctHgDgttKHHagttgttaatgHBtatHc +DaagtVaBaRccctVgtgDtacRHSctaagagcggWYaBtSaKtHBtaaactYacgNKBa +VYgtaacttagtVttcttaatgtBtatMtMtttaattaatBWccatRtttcatagVgMMt +agctStKctaMactacDNYgKYHgaWcgaHgagattacVgtttgtRaSttaWaVgataat +gtgtYtaStattattMtNgWtgttKaccaatagNYttattcgtatHcWtctaaaNVYKKt +tWtggcDtcgaagtNcagatacgcattaagaccWctgcagcttggNSgaNcHggatgtVt +catNtRaaBNcHVagagaaBtaaSggDaatWaatRccaVgggStctDaacataKttKatt +tggacYtattcSatcttagcaatgaVBMcttDattctYaaRgatgcattttNgVHtKcYR +aatRKctgtaaacRatVSagctgtWacBtKVatctgttttKcgtctaaDcaagtatcSat +aWVgcKKataWaYttcccSaatgaaaacccWgcRctWatNcWtBRttYaattataaNgac +acaatagtttVNtataNaYtaatRaVWKtBatKagtaatataDaNaaaaataMtaagaaS +tccBcaatNgaataWtHaNactgtcDtRcYaaVaaaaaDgtttRatctatgHtgttKtga +aNSgatactttcgagWaaatctKaaDaRttgtggKKagcDgataaattgSaacWaVtaNM +acKtcaDaaatttctRaaVcagNacaScRBatatctRatcctaNatWgRtcDcSaWSgtt +RtKaRtMtKaatgttBHcYaaBtgatSgaSWaScMgatNtctcctatttctYtatMatMt +RRtSaattaMtagaaaaStcgVgRttSVaScagtgDtttatcatcatacRcatatDctta +tcatVRtttataaHtattcYtcaaaatactttgVctagtaaYttagatagtSYacKaaac +gaaKtaaatagataatSatatgaaatSgKtaatVtttatcctgKHaatHattagaaccgt +YaaHactRcggSBNgtgctaaBagBttgtRttaaattYtVRaaaattgtaatVatttctc +ttcatgBcVgtgKgaHaaatattYatagWacNctgaaMcgaattStagWaSgtaaKagtt +ttaagaDgatKcctgtaHtcatggKttVDatcaaggtYcgccagNgtgcVttttagagat +gctaccacggggtNttttaSHaNtatNcctcatSaaVgtactgBHtagcaYggYVKNgta +KBcRttgaWatgaatVtagtcgattYgatgtaatttacDacSctgctaaaStttaWMagD +aaatcaVYctccgggcgaVtaaWtStaKMgDtttcaaMtVgBaatccagNaaatcYRMBg +gttWtaaScKttMWtYataRaDBMaDataatHBcacDaaKDactaMgagttDattaHatH +taYatDtattDcRNStgaatattSDttggtattaaNSYacttcDMgYgBatWtaMagact +VWttctttgYMaYaacRgHWaattgRtaagcattctMKVStatactacHVtatgatcBtV +NataaBttYtSttacKgggWgYDtgaVtYgatDaacattYgatggtRDaVDttNactaSa +MtgNttaacaaSaBStcDctaccacagacgcaHatMataWKYtaYattMcaMtgSttDag +cHacgatcaHttYaKHggagttccgatYcaatgatRaVRcaagatcagtatggScctata +ttaNtagcgacgtgKaaWaactSgagtMYtcttccaKtStaacggMtaagNttattatcg +tctaRcactctctDtaacWYtgaYaSaagaWtNtatttRacatgNaatgttattgWDDcN +aHcctgaaHacSgaataaRaataMHttatMtgaSDSKatatHHaNtacagtccaYatWtc +actaactatKDacSaStcggataHgYatagKtaatKagStaNgtatactatggRHacttg +tattatgtDVagDVaRctacMYattDgtttYgtctatggtKaRSttRccRtaaccttaga +gRatagSaaMaacgcaNtatgaaatcaRaagataatagatactcHaaYKBctccaagaRa +BaStNagataggcgaatgaMtagaatgtcaKttaaatgtaWcaBttaatRcggtgNcaca +aKtttScRtWtgcatagtttWYaagBttDKgcctttatMggNttattBtctagVtacata +aaYttacacaaRttcYtWttgHcaYYtaMgBaBatctNgcDtNttacgacDcgataaSat +YaSttWtcctatKaatgcagHaVaacgctgcatDtgttaSataaaaYSNttatagtaNYt +aDaaaNtggggacttaBggcHgcgtNtaaMcctggtVtaKcgNacNtatVaSWctWtgaW +cggNaBagctctgaYataMgaagatBSttctatacttgtgtKtaattttRagtDtacata +tatatgatNHVgBMtKtaKaNttDHaagatactHaccHtcatttaaagttVaMcNgHata +tKtaNtgYMccttatcaaNagctggacStttcNtggcaVtattactHaSttatgNMVatt +MMDtMactattattgWMSgtHBttStStgatatRaDaagattttctatMtaaaaaggtac +taaVttaSacNaatactgMttgacHaHRttgMacaaaatagttaatatWKRgacDgaRta +tatttattatcYttaWtgtBRtWatgHaaattHataagtVaDtWaVaWtgStcgtMSgaS +RgMKtaaataVacataatgtaSaatttagtcgaaHtaKaatgcacatcggRaggSKctDc +agtcSttcccStYtccRtctctYtcaaKcgagtaMttttcRaYDttgttatctaatcata +NctctgctatcaMatactataggDaHaaSttMtaDtcNatataattctMcStaaBYtaNa +gatgtaatHagagSttgWHVcttatKaYgDctcttggtgttMcRaVgSgggtagacaata +aDtaattSaDaNaHaBctattgNtaccaaRgaVtKNtaaYggHtaKKgHcatctWtctDt +ttctttggSDtNtaStagttataaacaattgcaBaBWggHgcaaaBtYgctaatgaaatW +cDcttHtcMtWWattBHatcatcaaatctKMagtDNatttWaBtHaaaNgMttaaStagt +tctctaatDtcRVaYttgttMtRtgtcaSaaYVgSWDRtaatagctcagDgcWWaaaBaa +RaBctgVgggNgDWStNaNBKcBctaaKtttDcttBaaggBttgaccatgaaaNgttttt +tttatctatgttataccaaDRaaSagtaVtDtcaWatBtacattaWacttaSgtattggD +gKaaatScaattacgWcagKHaaccaYcRcaRttaDttRtttHgaHVggcttBaRgtccc +tDatKaVtKtcRgYtaKttacgtatBtStaagcaattaagaRgBagSaattccSWYttta +ttVaataNctgHgttaaNBgcVYgtRtcccagWNaaaacaDNaBcaaaaRVtcWMgBagM +tttattacgDacttBtactatcattggaaatVccggttRttcatagttVYcatYaSHaHc +ttaaagcNWaHataaaRWtctVtRYtagHtaaaYMataHYtNBctNtKaatattStgaMc +BtRgctaKtgcScSttDgYatcVtggaaKtaagatWccHccgKYctaNNctacaWctttt +gcRtgtVcgaKttcMRHgctaHtVaataaDtatgKDcttatBtDttggNtacttttMtga +acRattaaNagaactcaaaBBVtcDtcgaStaDctgaaaSgttMaDtcgttcaccaaaag +gWtcKcgSMtcDtatgtttStaaBtatagDcatYatWtaaaBacaKgcaDatgRggaaYc +taRtccagattDaWtttggacBaVcHtHtaacDacYgtaatataMagaatgHMatcttat +acgtatttttatattacHactgttataMgStYaattYaccaattgagtcaaattaYtgta +tcatgMcaDcgggtcttDtKgcatgWRtataatatRacacNRBttcHtBgcRttgtgcgt +catacMtttBctatctBaatcattMttMYgattaaVYatgDaatVagtattDacaacDMa +tcMtHcccataagatgBggaccattVWtRtSacatgctcaaggggYtttDtaaNgNtaaB +atggaatgtctRtaBgBtcNYatatNRtagaacMgagSaSDDSaDcctRagtVWSHtVSR +ggaacaBVaccgtttaStagaacaMtactccagtttVctaaRaaHttNcttagcaattta +ttaatRtaaaatctaacDaBttggSagagctacHtaaRWgattcaaBtctRtSHaNtgta +cattVcaHaNaagtataccacaWtaRtaaVKgMYaWgttaKggKMtKcgWatcaDatYtK +SttgtacgaccNctSaattcDcatcttcaaaDKttacHtggttHggRRaRcaWacaMtBW +VHSHgaaMcKattgtaRWttScNattBBatYtaNRgcggaagacHSaattRtttcYgacc +BRccMacccKgatgaacttcgDgHcaaaaaRtatatDtatYVtttttHgSHaSaatagct +NYtaHYaVYttattNtttgaaaYtaKttWtctaNtgagaaaNctNDctaaHgttagDcRt +tatagccBaacgcaRBtRctRtggtaMYYttWtgataatcgaataattattataVaaaaa +ttacNRVYcaaMacNatRttcKatMctgaagactaattataaYgcKcaSYaatMNctcaa +cgtgatttttBacNtgatDccaattattKWWcattttatatatgatBcDtaaaagttgaa +VtaHtaHHtBtataRBgtgDtaataMttRtDgDcttattNtggtctatctaaBcatctaR +atgNacWtaatgaagtcMNaacNgHttatactaWgcNtaStaRgttaaHacccgaYStac +aaaatWggaYaWgaattattcMaactcBKaaaRVNcaNRDcYcgaBctKaacaaaaaSgc +tccYBBHYaVagaatagaaaacagYtctVccaMtcgtttVatcaatttDRtgWctagtac +RttMctgtDctttcKtWttttataaatgVttgBKtgtKWDaWagMtaaagaaattDVtag +gttacatcatttatgtcgMHaVcttaBtVRtcgtaYgBRHatttHgaBcKaYWaatcNSc +tagtaaaaatttacaatcactSWacgtaatgKttWattagttttNaggtctcaagtcact +attcttctaagKggaataMgtttcataagataaaaatagattatDgcBVHWgaBKttDgc +atRHaagcaYcRaattattatgtMatatattgHDtcaDtcaaaHctStattaatHaccga +cNattgatatattttgtgtDtRatagSacaMtcRtcattcccgacacSattgttKaWatt +NHcaacttccgtttSRtgtctgDcgctcaaMagVtBctBMcMcWtgtaacgactctcttR +ggRKSttgYtYatDccagttDgaKccacgVatWcataVaaagaataMgtgataaKYaaat +cHDaacgataYctRtcYatcgcaMgtNttaBttttgatttaRtStgcaacaaaataccVg +aaDgtVgDcStctatatttattaaaaRKDatagaaagaKaaYYcaYSgKStctccSttac +agtcNactttDVttagaaagMHttRaNcSaRaMgBttattggtttaRMggatggcKDgWR +tNaataataWKKacttcKWaaagNaBttaBatMHtccattaacttccccYtcBcYRtaga +ttaagctaaYBDttaNtgaaaccHcaRMtKtaaHMcNBttaNaNcVcgVttWNtDaBatg +ataaVtcWKcttRggWatcattgaRagHgaattNtatttctctattaattaatgaDaaMa +tacgttgggcHaYVaaNaDDttHtcaaHtcVVDgBVagcMacgtgttaaBRNtatRtcag +taagaggtttaagacaVaaggttaWatctccgtVtaDtcDatttccVatgtacNtttccg +tHttatKgScBatgtVgHtYcWagcaKtaMYaaHgtaattaSaHcgcagtWNaatNccNN +YcacgVaagaRacttctcattcccRtgtgtaattagcSttaaStWaMtctNNcSMacatt +ataaactaDgtatWgtagtttaagaaaattgtagtNagtcaataaatttgatMMYactaa +tatcggBWDtVcYttcDHtVttatacYaRgaMaacaStaatcRttttVtagaDtcacWat +ttWtgaaaagaaagNRacDtttStVatBaDNtaactatatcBSMcccaSttccggaMatg +attaaWatKMaBaBatttgataNctgttKtVaagtcagScgaaaDggaWgtgttttKtWt +atttHaatgtagttcactaaKMagttSYBtKtaYgaactcagagRtatagtVtatcaaaW +YagcgNtaDagtacNSaaYDgatBgtcgataacYDtaaactacagWDcYKaagtttatta +gcatcgagttKcatDaattgattatDtcagRtWSKtcgNtMaaaaacaMttKcaWcaaSV +MaaaccagMVtaMaDtMaHaBgaacataBBVtaatVYaNSWcSgNtDNaaKacacBttta +tKtgtttcaaHaMctcagtaacgtcgYtactDcgcctaNgagagcYgatattttaaattt +ccattttacatttDaaRctattttWctttacgtDatYtttcagacgcaaVttagtaaKaa +aRtgVtccataBggacttatttgtttaWNtgttVWtaWNVDaattgtatttBaagcBtaa +BttaaVatcHcaVgacattccNggtcgacKttaaaRtagRtctWagaYggtgMtataatM +tgaaRttattttgWcttNtDRRgMDKacagaaaaggaaaRStcccagtYccVattaNaaK +StNWtgacaVtagaagcttSaaDtcacaacgDYacWDYtgtttKatcVtgcMaDaSKStV +cgtagaaWaKaagtttcHaHgMgMtctataagBtKaaaKKcactggagRRttaagaBaaN +atVVcgRcKSttDaactagtSttSattgttgaaRYatggttVttaataaHttccaagDtg +atNWtaagHtgcYtaactRgcaatgMgtgtRaatRaNaacHKtagactactggaatttcg +ccataacgMctRgatgttaccctaHgtgWaYcactcacYaattcttaBtgacttaaacct +gYgaWatgBttcttVttcgttWttMcNYgtaaaatctYgMgaaattacNgaHgaacDVVM +tttggtHtctaaRgtacagacgHtVtaBMNBgattagcttaRcttacaHcRctgttcaaD +BggttKaacatgKtttYataVaNattccgMcgcgtagtRaVVaattaKaatggttRgaMc +agtatcWBttNtHagctaatctagaaNaaacaYBctatcgcVctBtgcaaagDgttVtga +HtactSNYtaaNccatgtgDacgaVtDcgKaRtacDcttgctaagggcagMDagggtBWR +tttSgccttttttaacgtcHctaVtVDtagatcaNMaVtcVacatHctDWNaataRgcgt +aVHaggtaaaaSgtttMtattDgBtctgatSgtRagagYtctSaKWaataMgattRKtaa +catttYcgtaacacattRWtBtcggtaaatMtaaacBatttctKagtcDtttgcBtKYYB +aKttctVttgttaDtgattttcttccacttgSaaacggaaaNDaattcYNNaWcgaaYat +tttMgcBtcatRtgtaaagatgaWtgaccaYBHgaatagataVVtHtttVgYBtMctaMt +cctgaDcYttgtccaaaRNtacagcMctKaaaggatttacatgtttaaWSaYaKttBtag +DacactagctMtttNaKtctttcNcSattNacttggaacaatDagtattRtgSHaataat +gccVgacccgatactatccctgtRctttgagaSgatcatatcgDcagWaaHSgctYYWta +tHttggttctttatVattatcgactaagtgtagcatVgtgHMtttgtttcgttaKattcM +atttgtttWcaaStNatgtHcaaaDtaagBaKBtRgaBgDtSagtatMtaacYaatYtVc +KatgtgcaacVaaaatactKcRgtaYtgtNgBBNcKtcttaccttKgaRaYcaNKtactt +tgagSBtgtRagaNgcaaaNcacagtVtttHWatgttaNatBgtttaatNgVtctgaata +tcaRtattcttttttttRaaKcRStctcggDgKagattaMaaaKtcaHacttaataataK +taRgDtKVBttttcgtKaggHHcatgttagHggttNctcgtatKKagVagRaaaggaaBt +NatttVKcRttaHctaHtcaaatgtaggHccaBataNaNaggttgcWaatctgatYcaaa +HaatWtaVgaaBttagtaagaKKtaaaKtRHatMaDBtBctagcatWtatttgWttVaaa +ScMNattRactttgtYtttaaaagtaagtMtaMaSttMBtatgaBtttaKtgaatgagYg +tNNacMtcNRacMMHcttWtgtRtctttaacaacattattcYaMagBaacYttMatcttK +cRMtgMNccattaRttNatHaHNaSaaHMacacaVaatacaKaSttHatattMtVatWga +ttttttaYctttKttHgScWaacgHtttcaVaaMgaacagNatcgttaacaaaaagtaca +HBNaattgttKtcttVttaaBtctgctacgBgcWtttcaggacacatMgacatcccagcg +gMgaVKaBattgacttaatgacacacaaaaaatRKaaBctacgtRaDcgtagcVBaacDS +BHaaaaSacatatacagacRNatcttNaaVtaaaataHattagtaaaaSWccgtatWatg +gDttaactattgcccatcttHaSgYataBttBaactattBtcHtgatcaataSttaBtat +KSHYttWggtcYtttBttaataccRgVatStaHaKagaatNtagRMNgtcttYaaSaact +cagDSgagaaYtMttDtMRVgWKWtgMaKtKaDttttgactatacataatcNtatNaHat +tVagacgYgatatatttttgtStWaaatctWaMgagaRttRatacgStgattcttaagaD +taWccaaatRcagcagaaNKagtaaDggcgccBtYtagSBMtactaaataMataBSacRM +gDgattMMgtcHtcaYDtRaDaacggttDaggcMtttatgttaNctaattaVacgaaMMt +aatDccSgtattgaRtWWaccaccgagtactMcgVNgctDctaMScatagcgtcaactat +acRacgHRttgctatttaatgaattataYKttgtaagWgtYttgcHgMtaMattWaWVta +RgcttgYgttBHtYataSccStBtgtagMgtDtggcVaaSBaatagDttgBgtctttctc +attttaNagtHKtaMWcYactVcgcgtatMVtttRacVagDaatcttgctBBcRDgcaac +KttgatSKtYtagBMagaRtcgBattHcBWcaactgatttaatttWDccatttatcgagS +KaWttataHactaHMttaatHtggaHtHagaatgtKtaaRactgtttMatacgatcaagD +gatKaDctataMggtHDtggHacctttRtatcttYattttgacttgaaSaataaatYcgB +aaaaccgNatVBttMacHaKaataagtatKgtcaagactcttaHttcggaattgttDtct +aaccHttttWaaatgaaatataaaWattccYDtKtaaaacggtgaggWVtctattagtga +ctattaagtMgtttaagcatttgSgaaatatccHaaggMaaaattttcWtatKctagDtY +tMcctagagHcactttactatacaaacattaacttaHatcVMYattYgVgtMttaaRtga +aataaDatcaHgtHHatKcDYaatcttMtNcgatYatgSaMaNtcttKcWataScKggta +tcttacgcttWaaagNatgMgHtctttNtaacVtgttcMaaRatccggggactcMtttaY +MtcWRgNctgNccKatcttgYDcMgattNYaRagatHaaHgKctcataRDttacatBatc +cattgDWttatttaWgtcggagaaaaatacaatacSNtgggtttccttacSMaagBatta +caMaNcactMttatgaRBacYcYtcaaaWtagctSaacttWgDMHgaggatgBVgcHaDt +ggaactttggtcNatNgtaKaBcccaNtaagttBaacagtatacDYttcctNgWgcgSMc +acatStctHatgRcNcgtacacaatRttMggaNKKggataaaSaYcMVcMgtaMaHtgat +tYMatYcggtcttcctHtcDccgtgRatcattgcgccgatatMaaYaataaYSggatagc +gcBtNtaaaScaKgttBgagVagttaKagagtatVaactaSacWactSaKatWccaKaaa +atBKgaaKtDMattttgtaaatcRctMatcaaMagMttDgVatggMaaWgttcgaWatga +aatttgRtYtattaWHKcRgctacatKttctaccaaHttRatctaYattaaWatVNccat +NgagtcKttKataStRaatatattcctRWatDctVagttYDgSBaatYgttttgtVaatt +taatagcagMatRaacttBctattgtMagagattaaactaMatVtHtaaatctRgaaaaa +aaatttWacaacaYccYDSaattMatgaccKtaBKWBattgtcaagcHKaagttMMtaat +ttcKcMagNaaKagattggMagaggtaatttYacatcWaaDgatMgKHacMacgcVaaca +DtaDatatYggttBcgtatgWgaSatttgtagaHYRVacaRtctHaaRtatgaactaata +tctSSBgggaaHMWtcaagatKgagtDaSatagttgattVRatNtctMtcSaagaSHaat +aNataataRaaRgattctttaataaagWaRHcYgcatgtWRcttgaaggaMcaataBRaa +ccagStaaacNtttcaatataYtaatatgHaDgcStcWttaacctaRgtYaRtataKtgM +ttttatgactaaaatttacYatcccRWtttHRtattaaatgtttatatttgttYaatMca +RcSVaaDatcgtaYMcatgtagacatgaaattgRtcaaYaaYtRBatKacttataccaNa +aattVaBtctggacaagKaaYaaatatWtMtatcYaaVNtcgHaactBaagKcHgtctac +aatWtaDtSgtaHcataHtactgataNctRgttMtDcDttatHtcgtacatcccaggStt +aBgtcacacWtccNMcNatMVaVgtccDYStatMaccDatggYaRKaaagataRatttHK +tSaaatDgataaacttaHgttgVBtcttVttHgDacgaKatgtatatNYataactctSat +atatattgcHRRYttStggaactHgttttYtttaWtatMcttttctatctDtagVHYgMR +BgtHttcctaatYRttKtaagatggaVRataKDctaMtKBNtMtHNtWtttYcVtattMc +gRaacMcctNSctcatttaaagDcaHtYccSgatgcaatYaaaaDcttcgtaWtaattct +cgttttScttggtaatctttYgtctaactKataHacctMctcttacHtKataacacagcN +RatgKatttttSaaatRYcgDttaMRcgaaattactMtgcgtaagcgttatBtttttaat +taagtNacatHgttcRgacKcBBtVgatKttcgaBaatactDRgtRtgaNacWtcacYtt +aaKcgttctHaKttaNaMgWgWaggtctRgaKgWttSttBtDcNtgtttacaaatYcDRt +gVtgcctattcNtctaaaDMNttttNtggctgagaVctDaacVtWccaagtaacacaNct +gaScattccDHcVBatcgatgtMtaatBgHaatDctMYgagaatgYWKcctaatNaStHa +aaKccgHgcgtYaaYtattgtStgtgcaaRtattaKatattagaWVtcaMtBagttatta +gNaWHcVgcaattttDcMtgtaRHVYtHtctgtaaaaHVtMKacatcgNaatttMatatg +ttgttactagWYtaRacgataKagYNKcattataNaRtgaacKaYgcaaYYacaNccHat +MatDcNgtHttRaWttagaaDcaaaaaatagggtKDtStaDaRtaVtHWKNtgtattVct +SVgRgataDaRaWataBgaagaaKtaataaYgDcaStaNgtaDaaggtattHaRaWMYaY +aWtggttHYgagVtgtgcttttcaaDKcagVcgttagacNaaWtagtaataDttctggtt +VcatcataaagtgKaaaNaMtaBBaattaatWaattgctHaVKaSgDaaVKaHtatatat +HatcatSBagNgHtatcHYMHgttDgtaHtBttWatcgtttaRaattgStKgSKNWKatc +agDtctcagatttctRtYtBatBgHHtKaWtgYBgacVVWaKtacKcDttKMaKaVcggt +gttataagaataaHaatattagtataatMHgttYgaRttagtaRtcaaVatacggtcMcg +agtaaRttacWgactKRYataaaagSattYaWgagatYagKagatgSaagKgttaatMgg +tataatgttWYttatgagaaacctNVataatHcccKtDctcctaatactggctHggaSag +gRtKHaWaattcgSatMatttagaggcYtctaMcgctcataSatatgRagacNaaDagga +VBagaYttKtacNaKgtSYtagttggaWcatcWttaatctatgaVtcgtgtMtatcaYcg +tRccaaYgDctgcMgtgtWgacWtgataacacgcgctBtgttaKtYDtatDcatcagKaV +MctaatcttgVcaaRgcRMtDcgattaHttcaNatgaatMtactacVgtRgatggaWttt +actaaKatgagSaaKggtaNtactVaYtaaKRagaacccacaMtaaMtKtatBcttgtaa +WBtMctaataaVcDaaYtcRHBtcgttNtaaHatttBNgRStVDattBatVtaagttaYa +tVattaagaBcacggtSgtVtatttaRattgatgtaHDKgcaatattKtggcctatgaWD +KRYcggattgRctatNgatacaatMNttctgtcRBYRaaaHctNYattcHtaWcaattct +BtMKtVgYataatMgYtcagcttMDataVtggRtKtgaatgccNcRttcaMtRgattaac +attRcagcctHtWMtgtDRagaKaBtgDttYaaaaKatKgatctVaaYaacWcgcatagB +VtaNtRtYRaggBaaBtgKgttacataagagcatgtRattccacttaccatRaaatgWgD +aMHaYVgVtaSctatcgKaatatattaDgacccYagtgtaYNaaatKcagtBRgagtcca +tgKgaaaccBgaagBtgSttWtacgatWHaYatcgatttRaaNRgcaNaKVacaNtDgat +tgHVaatcDaagcgtatgcNttaDataatcSataaKcaataaHWataBtttatBtcaKtK +tatagttaDgSaYctacaRatNtaWctSaatatttYaKaKtaccWtatcRagacttaYtt +VcKgSDcgagaagatccHtaattctSttatggtKYgtMaHagVaBRatttctgtRgtcta +tgggtaHKgtHacHtSYacgtacacHatacKaaBaVaccaDtatcSaataaHaagagaat +ScagactataaRttagcaaVcaHataKgDacatWccccaagcaBgagWatctaYttgaaa +tctVNcYtttWagHcgcgcDcVaaatgttKcHtNtcaatagtgtNRaactttttcaatgg +WgBcgDtgVgtttctacMtaaataaaRggaaacWaHttaRtNtgctaaRRtVBctYtVta +tDcattDtgaccYatagatYRKatNYKttNgcctagtaWtgaactaMVaacctgaStttc +tgaKVtaaVaRKDttVtVctaDNtataaaDtccccaagtWtcgatcactDgYaBcatcct +MtVtacDaaBtYtMaKNatNtcaNacgDatYcatcgcaRatWBgaacWttKttagYtaat +tcggttgSWttttDWctttacYtatatWtcatDtMgtBttgRtVDggttaacYtacgtac +atgaattgaaWcttMStaDgtatattgaDtcRBcattSgaaVBRgagccaaKtttcDgcg +aSMtatgWattaKttWtgDBMaggBBttBaatWttRtgcNtHcgttttHtKtcWtagHSt +aacagttgatatBtaWSaWggtaataaMttaKacDaatactcBttcaatatHttcBaaSa +aatYggtaRtatNtHcaatcaHtagVtgtattataNggaMtcttHtNagctaaaggtaga +YctMattNaMVNtcKtactBKcaHHcBttaSagaKacataYgctaKaYgttYcgacWVtt +WtSagcaacatcccHaccKtcttaacgaKttcacKtNtacHtatatRtaaatacactaBt +ttgaHaRttggttWtatYagcatYDatcggagagcWBataagRtacctataRKgtBgatg +aDatataSttagBaHtaatNtaDWcWtgtaattacagKttcNtMagtattaNgtctcgtc +ctcttBaHaKcKccgtRcaaYagSattaagtKataDatatatagtcDtaacaWHcaKttD +gaaRcgtgYttgtcatatNtatttttatggccHtgDtYHtWgttatYaacaattcaWtat +NgctcaaaSttRgctaatcaaatNatcgtttaBtNNVtgttataagcaaagattBacgtD +atttNatttaaaDcBgtaSKgacgtagataatttcHMVNttgttBtDtgtaWKaaRMcKM +tHtaVtagataWctccNNaSWtVaHatctcMgggDgtNHtDaDttatatVWttgttattt +aacctttcacaaggaSaDcggttttttatatVtctgVtaacaStDVaKactaMtttaSNa +gtgaaattaNacttSKctattcctctaSagKcaVttaagNaVcttaVaaRNaHaaHttat +gtHttgtgatMccaggtaDcgaccgtWgtWMtttaHcRtattgScctatttKtaaccaag +tYagaHgtWcHaatgccKNRtttagtMYSgaDatctgtgaWDtccMNcgHgcaaacNDaa +aRaStDWtcaaaaHKtaNBctagBtgtattaactaattttVctagaatggcWSatMaccc +ttHttaSgSgtgMRcatRVKtatctgaaaccDNatYgaaVHNgatMgHRtacttaaaRta +tStRtDtatDttYatattHggaBcttHgcgattgaKcKtttcRataMtcgaVttWacatN +catacctRataDDatVaWNcggttgaHtgtMacVtttaBHtgagVttMaataattatgtt +cttagtttgtgcDtSatttgBtcaacHattaaBagVWcgcaSYttMgcttacYKtVtatc +aYaKctgBatgcgggcYcaaaaacgNtctagKBtattatctttKtaVttatagtaYtRag +NtaYataaVtgaatatcHgcaaRataHtacacatgtaNtgtcgYatWMatttgaactacR +ctaWtWtatacaatctBatatgYtaagtatgtgtatSttactVatcttYtaBcKgRaSgg +RaaaaatgcagtaaaWgtaRgcgataatcBaataccgtatttttccatcNHtatWYgatH +SaaaDHttgctgtccHtggggcctaataatttttctatattYWtcattBtgBRcVttaVM +RSgctaatMagtYtttaaaaatBRtcBttcaaVtaacagctccSaaSttKNtHtKYcagc +agaaaccccRtttttaaDcDtaStatccaagcgctHtatcttaDRYgatDHtWcaaaBcW +gKWHttHataagHacgMNKttMKHccaYcatMVaacgttaKgYcaVaaBtacgcaacttt +MctaaHaatgtBatgagaSatgtatgSRgHgWaVWgataaatatttccKagVgataattW +aHNcYggaaatgctHtKtaDtctaaagtMaatVDVactWtSaaWaaMtaHtaSKtcBRaN +cttStggtBttacNagcatagRgtKtgcgaacaacBcgKaatgataagatgaaaattgta +ctgcgggtccHHWHaaNacaBttNKtKtcaaBatatgctaHNgtKcDWgtttatNgVDHg +accaacWctKaaggHttgaRgYaatHcaBacaatgagcaaattactgtaVaaYaDtagat +tgagNKggtggtgKtWKaatacagDRtatRaMRtgattDggtcaaYRtatttNtagaDtc +acaaSDctDtataatcgtactaHttatacaatYaacaaHttHatHtgcgatRRttNgcat +SVtacWWgaaggagtatVMaVaaattScDDKNcaYBYaDatHgtctatBagcaacaagaa +tgagaaRcataaKNaRtBDatcaaacgcattttttaaBtcSgtacaRggatgtMNaattg +gatatWtgagtattaaaVctgcaYMtatgatttttYgaHtgtcttaagWBttHttgtctt +attDtcgtatWtataataSgctaHagcDVcNtaatcaagtaBDaWaDgtttagYctaNcc +DtaKtaHcttaataacccaRKtacaVaatNgcWRaMgaattatgaBaaagattVYaHMDc +aDHtcRcgYtcttaaaWaaaVKgatacRtttRRKYgaatacaWVacVcRtatMacaBtac +tggMataaattttHggNagSctacHgtBagcgtcgtgattNtttgatSaaggMttctttc +ttNtYNagBtaaacaaatttMgaccttacataattgYtcgacBtVMctgStgMDtagtaR +ctHtatgttcatatVRNWataDKatWcgaaaaagttaaaagcacgHNacgtaatctttMR +tgacttttDacctataaacgaaatatgattagaactccSYtaBctttaataacWgaaaYa +tagatgWttcatKtNgatttttcaagHtaYgaaRaDaagtaggagcttatVtagtctttc +attaaaatcgKtattaRttacagVaDatgcatVgattgggtctttHVtagKaaRBtaHta +aggccccaaaaKatggtttaMWgtBtaaacttcactttKHtcgatctccctaYaBacMgt +cttBaBaNgcgaaacaatctagtHccHtKttcRtRVttccVctttcatacYagMVtMcag +aMaaacaataBctgYtaatRaaagattaaccatVRatHtaRagcgcaBcgDttStttttc +VtttaDtKgcaaWaaaaatSccMcVatgtKgtaKgcgatatgtagtSaaaDttatacaaa +catYaRRcVRHctKtcgacKttaaVctaDaatgttMggRcWaacttttHaDaKaDaBctg +taggcgtttaHBccatccattcNHtDaYtaataMttacggctNVaacDattgatatttta +cVttSaattacaaRtataNDgacVtgaacataVRttttaDtcaaacataYDBtttaatBa +DtttYDaDaMccMttNBttatatgagaaMgaNtattHccNataattcaHagtgaaggDga +tgtatatatgYatgaStcataaBStWacgtcccataRMaaDattggttaaattcMKtctM +acaBSactcggaatDDgatDgcWctaacaccgggaVcacWKVacggtaNatatacctMta +tgatagtgcaKagggVaDtgtaacttggagtcKatatcgMcttRaMagcattaBRaStct +YSggaHYtacaactMBaagDcaBDRaaacMYacaHaattagcattaaaHgcgctaaggSc +cKtgaaKtNaBtatDDcKBSaVtgatVYaagVtctSgMctacgttaacWaaattctSgtD +actaaStaaattgcagBBRVctaatatacctNttMcRggctttMttagacRaHcaBaacV +KgaataHttttMgYgattcYaNRgttMgcVaaacaVVcDHaatttgKtMYgtatBtVVct +WgVtatHtacaaHttcacgatagcagtaaNattBatatatttcVgaDagcggttMaagtc +ScHagaaatgcYNggcgtttttMtStggtRatctacttaaatVVtBacttHNttttaRca +aatcacagHgagagtMgatcSWaNRacagDtatactaaDKaSRtgattctccatSaaRtt +aaYctacacNtaRtaactggatgaccYtacactttaattaattgattYgttcagDtNKtt +agDttaaaaaaaBtttaaNaYWKMBaaaacVcBMtatWtgBatatgaacVtattMtYatM +NYDKNcKgDttDaVtaaaatgggatttctgtaaatWtctcWgtVVagtcgRgacttcccc +taDcacagcRcagagtgtWSatgtacatgttaaSttgtaaHcgatgggMagtgaacttat +RtttaVcaccaWaMgtactaatSSaHtcMgaaYtatcgaaggYgggcgtgaNDtgttMNg +aNDMtaattcgVttttaacatgVatgtWVMatatcaKgaaattcaBcctccWcttgaaWH +tWgHtcgNWgaRgctcBgSgaattgcaaHtgattgtgNagtDttHHgBttaaWcaaWagc +aSaHHtaaaVctRaaMagtaDaatHtDMtcVaWMtagSagcttHSattaacaaagtRacM +tRtctgttagcMtcaBatVKtKtKacgagaSNatSactgtatatcBctgagVtYactgta +aattaaaggcYgDHgtaacatSRDatMMccHatKgttaacgactKtgKagtcttcaaHRV +tccttKgtSataatttacaactggatDNgaacttcaRtVaagDcaWatcBctctHYatHa +DaaatttagYatSatccaWtttagaaatVaacBatHcatcgtacaatatcgcNYRcaata +YaRaYtgattVttgaatgaVaactcRcaNStgtgtattMtgaggtNttBaDRcgaaaagc +tNgBcWaWgtSaDcVtgVaatMKBtttcgtttctaaHctaaagYactgMtatBDtcStga +ccgtSDattYaataHctgggaYYttcggttaWaatctggtRagWMaDagtaacBccacta +cgHWMKaatgatWatcctgHcaBaSctVtcMtgtDttacctaVgatYcWaDRaaaaRtag +atcgaMagtggaRaWctctgMgcWttaagKBRtaaDaaWtctgtaagYMttactaHtaat +cttcataacggcacBtSgcgttNHtgtHccatgttttaaagtatcgaKtMttVcataYBB +aKtaMVaVgtattNDSataHcagtWMtaggtaSaaKgttgBtVtttgttatcatKcgHac +acRtctHatNVagSBgatgHtgaRaSgttRcctaacaaattDNttgacctaaYtBgaaaa +tagttattactcttttgatgtNNtVtgtatMgtcttRttcatttgatgacacttcHSaaa +ccaWWDtWagtaRDDVNacVaRatgttBccttaatHtgtaaacStcVNtcacaSRttcYa +gacagaMMttttgMcNttBcgWBtactgVtaRttctccaaYHBtaaagaBattaYacgat +ttacatctgtaaMKaRYtttttactaaVatWgctBtttDVttctggcDaHaggDaagtcg +aWcaagtagtWttHtgKtVataStccaMcWcaagataagatcactctHatgtcYgaKcat +cagatactaagNSStHcctRRNtattgtccttagttagMVgtatagactaactctVcaat +MctgtttgtgttgccttatWgtaBVtttctggMcaaKgDWtcgtaaYStgSactatttHg +atctgKagtagBtVacRaagRtMctatgggcaaaKaaaatacttcHctaRtgtDcttDat +taggaaatttcYHaRaaBttaatggcacKtgctHVcaDcaaaVDaaaVcgMttgtNagcg +taDWgtcgttaatDgKgagcSatatcSHtagtagttggtgtHaWtaHKtatagctgtVga +ttaBVaatgaataagtaatVatSttaHctttKtttgtagttaccttaatcgtagtcctgB +cgactatttVcMacHaaaggaatgDatggKtaHtgStatattaaSagctWcctccRtata +BaDYcgttgcNaagaggatRaaaYtaWgNtSMcaatttactaacatttaaWttHtatBat +tgtcgacaatNgattgcNgtMaaaKaBDattHacttggtRtttaYaacgVactBtaBaKt +gBttatgVttgtVttcaatcWcNctDBaaBgaDHacBttattNtgtDtatttVSaaacag +gatgcRatSgtaSaNtgBatagttcHBgcBBaaattaHgtDattatDaKaatBaaYaaMa +ataaataKtttYtagtBgMatNcatgtttgaNagtgttgtgKaNaSagtttgaSMaYBca +aaacDStagttVacaaaaactaaWttBaagtctgtgcgtMgtaattctcctacctcaNtt +taaccaaaaVtBcacataacaccccBcWMtatVtggaatgaWtcaaWaaaaaaaaWtDta +atatRcctDWtcctaccMtVVatKttaWaaKaaatataaagScHBagaggBaSMtaWaVt +atattactSaaaKNaactatNatccttgaYctattcaaaVgatttYHcRagattttaSat +aggttattcVtaaagaKgtattattKtRttNcggcRgtgtgtWYtaacHgKatKgatYta +cYagDtWcHBDctctgRaYKaYagcactKcacSaRtBttttBHKcMtNtcBatttatttt +tgSatVgaaagaWtcDtagDatatgMacaacRgatatatgtttgtKtNRaatatNatgYc +aHtgHataacKtgagtagtaacYttaNccaaatHcacaacaVDtagtaYtccagcattNt +acKtBtactaaagaBatVtKaaHBctgStgtBgtatgaSNtgDataaccctgtagcaBgt +gatcttaDataStgaMaccaSBBgWagtacKcgattgaDgNNaaaacacagtSatBacKD +gcgtataBKcatacactaSaatYtYcDaactHttcatRtttaatcaattataRtttgtaa +gMcgNttcatcBtYBagtNWNMtSHcattcRctttttRWgaKacKttgggagBcgttcgc +MaWHtaatactgtctctatttataVgtttaBScttttaBMaNaatMacactYtBMggtHa +cMagtaRtctgcatttaHtcaaaatttgagKtgNtactBacaHtcgtatttctMaSRagc +agttaatgtNtaaattgagagWcKtaNttagVtacgatttgaatttcgRtgtWcVatcgt +taaDVctgtttBWgaccagaaagtcSgtVtatagaBccttttcctaaattgHtatcggRa +ttttcaaggcYSKaagWaWtRactaaaacccBatMtttBaatYtaagaactSttcgaaSc +aatagtattgaccaagtgttttctaacatgtttNVaatcaaagagaaaNattaaRtttta +VaaaccgcaggNMtatattVctcaagaggaacgBgtttaacaagttcKcYaatatactaa +ccBaaaSggttcNtattctagttRtBacgScVctcaatttaatYtaaaaaaatgSaatga +tagaMBRatgRcMcgttgaWHtcaVYgaatYtaatctttYttatRaWtctgBtDcgatNa +tcKaBaDgatgtaNatWKctccgatattaacattNaaacDatgBgttctgtDtaaaMggt +gaBaSHataacgccSctaBtttaRBtcNHcDatcDcctagagtcRtaBgWttDRVHagat +tYatgtatcWtaHtttYcattWtaaagtctNgtStggRNcgcggagSSaaagaaaatYcH +DtcgctttaatgYcKBVSgtattRaYBaDaaatBgtatgaHtaaRaRgcaSWNtagatHa +acttNctBtcaccatctMcatattccaSatttgcgaDagDgtatYtaaaVDtaagtttWV +aagtagYatRttaagDcNgacKBcScagHtattatcDaDactaaaaaYgHttBcgaDttg +gataaaKSRcBMaBcgaBSttcWtgNBatRaccgattcatttataacggHVtaattcaca +agagVttaaRaatVVRKcgWtVgacctgDgYaaHaWtctttcacMagggatVgactagMa +aataKaaNWagKatagNaaWtaaaatttgaattttatttgctaaVgaHatBatcaaBWcB +gttcMatcgBaaNgttcgSNaggSaRtttgHtRtattaNttcDcatSaVttttcgaaaaa +ttgHatctaRaggSaNatMDaaatDcacgattttagaHgHaWtYgattaatHNSttatMS +gggNtcKtYatRggtttgtMWVtttaYtagcagBagHaYagttatatggtBacYcattaR +SataBatMtttaaatctHcaaaSaaaagttNSaaWcWRccRtKaagtBWtcaaattSttM +tattggaaaccttaacgttBtWatttatatWcDaatagattcctScacctaagggRaaYt +aNaatgVtBcttaaBaacaMVaaattatStYgRcctgtactatcMcVKatttcgSgatRH +MaaaHtagtaaHtVgcaaataatatcgKKtgccaatBNgaaWcVttgagttaKatagttc +aggKDatDtattgaKaVcaKtaataDataataHSaHcattagttaatRVYcNaHtaRcaa +ggtNHcgtcaaccaBaaagYtHWaaaRcKgaYaaDttgcWYtataRgaatatgtYtgcKt +aNttWacatYHctRaDtYtattcBttttatcSataYaYgttWaRagcacHMgtttHtYtt +YaatcggtatStttcgtRSattaaDaKMaatatactaNBaWgctacacYtgaYVgtgHta +aaRaaRgHtagtWattataaaSDaaWtgMattatcgaaaagtaYRSaWtSgNtBgagcRY +aMDtactaacttaWgtatctagacaagNtattHggataatYttYatcataDcgHgttBtt +ctttVttgccgaaWtaaaacgKgtatctaaaaaNtccDtaDatBMaMggaatNKtatBaa +atVtccRaHtaSacataHattgtttKVYattcataVaattWtcgtgMttcttKtgtctaa +cVtatctatatBRataactcgKatStatattcatHHRttKtccaacgtgggtgRgtgaMt +attattggctatcgtgacMtRcBDtcttgtactaatRHttttaagatcgVMDStattatY +BtttDttgtBtNttgRcMtYtgBacHaWaBaatDKctaagtgaaactaatgRaaKgatcc +aagNaaaatattaggWNtaagtatacttttKcgtcggSYtcttgRctataYcttatataa +agtatattaatttataVaacacaDHatctatttttKYVatHRactttaBHccaWagtact +BtcacgaVgcgttRtttttttSVgtSagtBaaattctgaHgactcttgMcattttagVta +agaattHctHtcaDaaNtaacRggWatagttcgtSttgaDatcNgNagctagDgatcNtt +KgttgtaDtctttRaaYStRatDtgMggactSttaDtagSaVtBDttgtDgccatcacaM +attaaaMtNacaVcgSWcVaaDatcaHaatgaattaMtatccVtctBtaattgtWattat +BRcWcaatgNNtactWYtDaKttaaatcactcagtRaaRgatggtKgcgccaaHgaggat +StattYcaNMtcaBttacttatgagDaNtaMgaaWtgtttcttctaHtMNgttatctaWW +atMtBtaaatagDVatgtBYtatcggcttaagacMRtaHScgatatYgRDtcattatSDa +HggaaataNgaWSRRaaaBaatagBattaDctttgHWNttacaataaaaaaatacggttt +gHgVtaHtWMttNtBtctagtMcgKMgHgYtataHaNagWtcaacYattaataYRgtaWK +gaBctataaccgatttaHaNBRaRaMtccggtNgacMtctcatttgcaattcWgMactta +caaDaaNtactWatVtttagccttMaatcagVaagtctVaaDaBtattaattaYtNaYtg +gattaKtaKctYaMtattYgatattataatKtVgDcttatatNBtcgttgtStttttMag +aggttaHYSttcKgtcKtDNtataagttataagSgttatDtRttattgttttSNggRtca +aKMNatgaatattgtBWtaMacctgggYgaSgaagYataagattacgagaatBtggtRcV +HtgYggaDgaYaKagWagctatagacgaaHgtWaNgacttHRatVaWacKYtgRVNgVcS +gRWctacatcKSactctgWYtBggtataagcttNRttVtgRcaWaaatDMatYattaact +ttcgaagRatSctgccttgcRKaccHtttSNVagtagHagBagttagaccaRtataBcca +taatSHatRtcHagacBWatagcaMtacaRtgtgaaBatctKRtScttccaNaatcNgta +atatWtcaMgactctBtWtaaNactHaaaaRctcgcatggctMcaaNtcagaaaaacaca +gtggggWttRttagtaagaVctVMtcgaatcttcMaaaHcaHBttcgattatgtcaDagc +YRtBtYcgacMgtDcagcgaNgttaataatagcagKYYtcgtaBtYctMaRtaRtDagaa +aacacatgYaBttgattattcgaaNttBctSataaMataWRgaHtttccgtDgaYtatgg +tDgHKgMtatttVtMtVagttaRatMattRagataaccctKctMtSttgaHagtcStcta +tttccSagatgttccacgaggYNttHRacgattcDatatDcataaaatBBttatcgaHtN +HaaatatDNaggctgaNcaaggagttBttMgRagVatBcRtaWgatgBtSgaKtcgHttt +gaatcaaDaHttcSBgHcagtVaaSttDcagccgttNBtgttHagYtattctttRWaaVt +SttcatatKaaRaaaNacaVtVctMtSDtDtRHRcgtaatgctcttaaatSacacaatcg +HattcaWcttaaaatHaaatcNctWttaNMcMtaKctVtcctaagYgatgatcYaaaRac +tctaRDaYagtaacgtDgaggaaatctcaaacatcaScttcKttNtaccatNtaNataca +tttHaaDHgcaDatMWaaBttcRggctMaagctVYcacgatcaDttatYtaatcKatWat +caatVYtNagatttgattgaYttttYgacttVtcKaRagaaaHVgDtaMatKYagagttN +atWttaccNtYtcDWgSatgaRgtMatgKtcgacaagWtacttaagtcgKtgatccttNc +ttatagMatHVggtagcgHctatagccctYttggtaattKNaacgaaYatatVctaataM +aaaYtgVtcKaYtaataacagaatHcacVagatYWHttagaaSMaatWtYtgtaaagNaa +acaVgaWtcacNWgataNttcaSagctMDaRttgNactaccgataMaaatgtttattDtc +aagacgctDHYYatggttcaagccNctccttcMctttagacBtaaWtaWVHggaaaaNat +ttaDtDtgctaaHHtMtatNtMtagtcatttgcaaaRatacagRHtatDNtgtDgaatVg +tVNtcaaatYBMaaaagcaKgtgatgatMgWWMaHttttMgMagatDtataaattaacca +actMtacataaattgRataatacgBtKtaataattRgtatDagDtcRDacctatRcagag +cSHatNtcaScNtttggacNtaaggaccgtgKNttgttNcttgaaRgYgRtNtcagttBc +ttttcHtKtgcttYaaNgYagtaaatgaatggWaMattBHtatctatSgtcYtgcHtaat +tHgaaMtHcagaaSatggtatgccaHBtYtcNattWtgtNgctttaggtttgtWatNtgH +tgcDttactttttttgcNtactKtWRaVcttcatagtgSNKaNccgaataaBttataata +YtSagctttaaatSttggctaaKSaatRccgWHgagDttaaatcatgagMtcgagtVtaD +ggaBtatttgDacataaacgtagYRagBWtgDStKDgatgaagttcattatttaKWcata +aatWRgatataRgttRacaaNKttNtKagaaYaStaactScattattaacgatttaaatg +DtaattagatHgaYataaactatggggatVHtgccgtNgatNYcaStRtagaccacWcaM +tatRagHgVactYtWHtcttcatgatWgagaKggagtatgaWtDtVtNaNtcgYYgtaaa +ctttaDtBactagtaDctatagtaatatttatatataacgHaaaRagKattSagttYtSt +>THREE Homo sapiens frequency +agagagacgatgaaaattaatcgtcaatacgctggcgaacactgagggggacccaatgct +cttctcggtctaaaaaggaatgtgtcagaaattggtcagttcaaaagtagaccggatctt +tgcggagaacaattcacggaacgtagcgttgggaaatatcctttctaccacacatcggat +tttcgccctctcccattatttattgtgttctcacatagaattattgtttagacatccctc +gttgtatggagagttgcccgagcgtaaaggcataatccatataccgccgggtgagtgacc +tgaaattgtttttagttgggatttcgctatggattagcttacacgaagagattctaatgg +tactataggataattataatgctgcgtggcgcagtacaccgttacaaacgtcgttcgcat +atgtggctaacacggtgaaaatacctacatcgtatttgcaatttcggtcgtttcatagag +cgcattgaattactcaaaaattatatatgttgattatttgattagactgcgtggaaagaa +ggggtactcaagccatttgtaaaagctgcatctcgcttaagtttgagagcttacattagt +ctatttcagtcttctaggaaatgtctgtgtgagtggttgtcgtccataggtcactggcat +atgcgattcatgacatgctaaactaagaaagtagattactattaccggcatgcctaatgc +gattgcactgctatgaaggtgcggacgtcgcgcccatgtagccctgataataccaatact +tacatttggtcagcaattctgacattatacctagcacccataaatttactcagacttgag +gacaggctcttggagtcgatcttctgtttgtatgcatgtgatcatatagatgaataagcg +atgcgactagttagggcatagtatagatctgtgtatacagttcagctgaacgtccgcgag +tggaagtacagctgagatctatcctaaaatgcaaccatatcgttcacacatgatatgaac +ccagggggaaacattgagttcagttaaattggcagcgaatcccccaagaagaaggcggag +tgacgttgaacgggcttatggtttttcagtacttcctccgtataagttgagcgaaatgta +aacagaataatcgttgtgttaacaacattaaaatcgcggaatatgatgagaatacacagt +gtgagcatttcacttgtaaaatatctttggtagaacttactttgctttaaatatgttaaa +ccgatctaataatctacaaaacggtagattttgcctagcacattgcgtccttctctattc +agatagaggcaatactcagaaggttttatccaaagcactgtgttgactaacctaagtttt +agtctaataatcatgattgattataggtgccgtggactacatgactcgtccacaaataat +acttagcagatcagcaattggccaagcacccgacttttatttaatggttgtgcaatagtc +cagattcgtattcgggactctttcaaataatagtttcctggcatctaagtaagaaaagct +cataaggaagcgatattatgacacgctcttccgccgctgttttgaaacttgagtattgct +cgtccgaaattgagggtcacttcaaaatttactgagaagacgaagatcgactaaagttaa +aatgctagtccacagttggtcaagttgaattcatccacgagttatatagctattttaatt +tatagtcgagtgtacaaaaaacatccacaataagatttatcttagaataacaacccccgt +atcatcgaaatcctccgttatggcctgactcctcgagcttatagcatttgtgctggcgct +cttgccaggaacttgctcgcgaggtggtgacgagtgagatgatcagtttcattatgatga +tacgattttatcgcgactagttaatcatcatagcaagtaaaatttgaattatgtcattat +catgctccattaacaggttatttaattgatactgacgaaattttttcacaatgggttttc +tagaatttaatatcagtaattgaagccttcataggggtcctactagtatcctacacgacg +caggtccgcagtatcctggagggacgtgttactgattaaaagggtcaaaggaatgaaggc +tcacaatgttacctgcttcaccatagtgagccgatgagttttacattagtactaaatccc +aaatcatactttacgatgaggcttgctagcgctaaagagaatacatacaccaccacatag +aattgttagcgatgatatcaaatagactcctggaagtgtcagggggaaactgttcaatat +ttcgtccacaggactgaccaggcatggaaaagactgacgttggaaactataccatctcac +gcccgacgcttcactaattgatgatccaaaaaatatagcccggattcctgattagcaaag +ggttcacagagaaagatattatcgacgtatatcccaaaaaacagacgtaatgtgcatctt +cgaatcgggatgaatacttgtatcataaaaatgtgacctctagtatacaggttaatgtta +gtgatacacaatactcgtgggccatgggttctcaaataaaatgtaatattgcgtcgatca +ctcacccacgtatttggtctaattatgttttatttagtgacaatccaatagataaccggt +cctattaagggctatatttttagcgaccacgcgtttaaacaaaggattgtatgtagatgg +taccagtttaattgccagtgggcaatcctaagcaaaatgagattctatcctaaagtttgg +gcttgatataagatttcggatgtatgggttttataatcgttggagagctcaatcatgagc +taatacatggatttcgctacctcaccgagagaccttgcatgaagaattctaaccaaaagt +ttaataggccggattggattgagttaattaagaccttgttcagtcatagtaaaaaccctt +aaattttaccgattgacaaagtgagcagtcgcaataccctatgcgaaacgcctcgatagt +gactaggtatacaaggtttttgagttcctttgaaatagttaactaatttaaaattaatta +acgacatggaaatcacagaacctaatgctttgtaggagttatttatgctgtttactgcct +ctacaaccctaataaagcagtcctaagaatgaaacgcatcttttagttcagaaagtggta +tccagggtggtcaatttaataaattcaacatcgggtctcaggatattcggtcatataatt +tattaagggctcttcgagtcttactctgagtgaaattggaaacagtcatccttttcgttg +tgaggcatcttacaccgctatcgatatacaatgcattccaccgcggtgtcccgtacacaa +ggaaacttgttaccttggggatataagaaaactcacacgtctcattattaaactgagtac +aatttttgcacgagaaagtaatgcaatacaatatgatgaaagccagctaatgaaaaggga +tggaacgcacctcggatctgttgcactggattaaaatccgattatttttaaaaatattca +gtgctagagcatatcaggtctacttttttatctggtatgtaaagcccacggagcgatagt +gagatccttacgactcaacgaaaagttataacataactcccgttagccaaagcccaatcc +cgattactgccctaccctaacgtctgccatctaaatatcgaacttgttatgatcaatgtg +actacctcccaccctttccccttcatttgttccactggggataagctagcgttttcagaa +tcaatgcaataagaatagccaattgtctcacttcatcagagctcttggcaattccaggcg +ctacgtggttctggaatatattcatttttcaaatagtaatacgtttagtgttgctattgt +ctacacgtttggatattacgttatgtgagcggacatcaatagttgtctaactctttagta +agccagagatagcactcttagcgaatggataccatcttccataagtttagttaatagtcc +gaaacaactgcttcgagcatatttgaacctccttgtaggcaaatagcctcttcaaagcaa +tcttactaatagatagagtttgttttaagggactactagaaatgggacaatcttaatagt +atgacctaaactgacatttaaagatatatccaggtggcaagcataaagatcattgcgcca +cctccaccgtgggattacttatcagtcgatatcctatatgctaagtttgcgacggcagaa +tacaaactaagctgagttgatgctaaccttacctatgataccccattggaccggttaaca +gccctacttattccaaataaaagaacttttatgctgtagaagctattatagtgatgcctg +gtaacttcagtatattaaaatgacacacatacgccatatagagctcctggaactttgaat +aatgagcgaacttcgaagttgaagagcaagaaaccatatgtcacggttgcctaaagcccg +gtaaccagacatgtgctatcattgatcattatcgaggttttcataaccttgacccattat +cggctgtgcgcggacaagtacttaaatcactagtttcttcacctgcttatcggtaagaaa +taaggttggcaaagaatcgcataagacggacgtagagccgcagcgttgtgcgagtccagg +tgcatgcgcagcaataggattttaaattttgttccatttttaatttagccgtaaggatgt +ccgtaaatgattgaaaattggattcaatctttgggcctatgctactggaacctgatcgac +aaaatttcaaacatacgttaactccgaaagaccgtatttttgcggctagaatagtcagtc +gcttggagccatataccttaccacttaaacgacgtgctcctgtagttgaaatataaacag +aacacaaagactaccgatcatatcaactgaagatctttgtaactttgaggcgaagcaccc +tcttcgagacaactaagagtaaagtaccgggcgccgcaaggagtcgattgggaccctaaa +tcttgacgaattgctaagaggctcagagctaccactgtaatttctctagagcccataata +aatgaacgatacatccgtaggtagcacctaagggattataatggaagccaaatgcagtta +ataatattatatactggcgtacacgattcgacggatctctcacatagtgattcacgaccc +ccccctttgattgacacagcgtcagcattttgcaagaacgatcttctgcatagggtgcgc +caccgtaaggatgacgtcgaagctacaactgggtataatttaccatgcttccctgatgct +gagtgcaatacactaagaatgagtttttaccccatatcaccagtatttgttctgttattg +cgaagaaatggctatgctgagttggcgactaaagtcacccatcctttttattaggtaacc +ccctcccttaaactaactgatttgctggagctgccctgcatacatatactttatcattta +tggacgtccgtgacgcttattatccaccatagtcgatatgctacacggattcattaatgg +atcgtaggagtttaagttatatttactaagatcggtctcggctactatcccgccttaccc +ggcgctatttacggccatttttaatatattgacggtaattattcctatggtttcgaccgc +acgtccttggacaagaaagaatggcaaaaaaaatgtaaaagaaaaaaaatattgagtccc +taccatcatataaaaaatatgtgatgagtaacttgacgaaatgttagtggttattaaaga +ctatctattacaccttttgttttctgtcgtagtatattaaagtctagaagccttacagga +aaatcagggttatacagccgatactccgcagcatgaatcatcgaggaggtgtcctaccat +cgcgccttgtaatcttgtctgtgtatactgtatttagaccttttatacaaagtaaatatc +tcggctttatgtgattgggaggggcctactcaaacatgatgacttgacctaataatcact +gtgcgggcgtcttatgactagctattccttgaaatccaccaccaaatggttaatatgtaa +aaactttgacgatgaaacaaggtgaatgtgtagttactttgtgtaattagctgcgtcgag +cattgcttgtaaaaccgtcaatcgcacacgttacttccataaaatttctacgaatacacc +cttcttaaaaaaaacgtaggaattcacgagtttaacaaacgataactgtataaagtggaa +gtccgaagaaagcagatgcccgaactactcgaagatgtttcgttttcttaaccatagggg +cttcttaatggcccactacgcacattttgttcaagcccgagagggacatccccattacgg +gagtattactaaaactgttccgtaatacgttcagcaagggatgaaaaaggccactgctca +agttattgacgtgggagtattacatcggaagcctgaatcccacactatgatggtctgtac +aggcctagggactgcgtctagacggtattaccggcttctaatcatacgatcgtgagtctt +aacgggaagtaaggctcacacctaccccaaaccatttatctatgtaagtataaaattgtg +cgtaagtgttcaaagtggacaataaagacgtggcaaaaacccccgcacataagccgcttt +agatttcacaaataccaatgcggttaaaaacatccttgagtcgtacatacaccatactcg +cgttaaacggatataacagaagataataaatccggatgtggagtcggtgtaactatagaa +agccaagtgaaataatgcttaccagtcatttagctatacggctttcatttcatgtcaaga +gggtggagtttgacctgtacagttgatatatcaccgatacttagaactcacctaaagcta +aaattgctcgcagcgtgtaatccgcatattacaaacaatagatgggattcattatacata +agacacgatgatctgctttttcaggttgcgagatgttgcctatcgtcaatcgagtcctgc +cttacaccacttaaacaaaagtattgacagggaacctattttcgaggtattatatagtcc +agcttgaatatcaatttgacagttaacctagtgaaaatcagtaagaggaaatacgccaca +ttctccagtgaaattctacgggttatcgtctagtccaactatcaattataactcacgaga +tataagtaaattctcgtacttggcctgatttttattatactttggatccttagtaaacag +gaagggagaaaccttcaacgaaaaacactggattttgttttactctcaaagctcttatat +gacggaaataccctgtcaagtcttaactttattactagactaatgaaatgggcttggggt +ggccagaatcatagtacaatttagcggatacactattcggactttcctatcggctgtctg +gttggataagtatggggactaataggctagacatacctatacttaaactatacaggcgtc +atctatctctgcaactttggagttccctgatgttctcccgccctttgggttcacatcttc +tataccgacacccctaataacgattagtttgtgggttagagtaaattaatacggttaata +ttaatgtatcgttgaaaagctggtgtcgccaataaggtaaccggctaggcagagtatatg +tcacgaagtataactaccctaatgataagctgtaggaataaaattaatgctgtctctaag +cgaagagatatttccgactctgttttaatgacgaatctcattacttctgacttgcaaatg +ttcaatatggcacggtttcacggcacctttgtgacgcatataatgaacttagaagattat +aacgacggaactttatatgataatccgttacgattaaagaatctgttaaatatcataatg +gcattcagttctagaccgtgcatcatggtaaacttactttctctgcatggcgacatacat +ttcgctattcaaattcgcgtgtggttacacccactcgcacctttggaatattaagagaag +atgatcagaaaatccattcgctcaatttttctgacgtacgtctaatttatcctaggagac +aaatcgttttatgtctctcacatttttgaagaaaggttcgagagacaatactcaggtcct +gaactgctagaagatactcggtggagcgtggcaacaatgaaaaactcgtgacataaatga +atgatacttttccaagttcagttaagtgaatatgtttaacatacccggcttttcgatctt +aagctgacgctggacgtgcgagtaatgtcagtctcttacatacactagtgactccaagtt +tcgtcaaaaacgccccctcccttctcgagcccactcacgctatgtattgacgcgaacttg +ttcgggatcagacttttcaggagttcggtcgcgtgtccctatgtgctaatatataagtta +gatcgcattagatgctaatctgaatacttatagacgaccttcaacgagaacgggtaccac +cttgaggctagagttaggtgtgaaacgacaggtagggacatataaaatttgagtgcggct +ttagttaagggtttaattacctactcaaacatcacgctcgcgcccttcgtacgtaatcga +ccatctagaggctaaggggactgtactaggtagtgattaatgatatcctagacgcacgtg +ccttagatcttcagactctgatggtccgcgatcaccgtaattgtagtcctccaactcgat +cactttgttggcgtcaaagaaattacgatatctaaatacttataatacaataaccaagga +tgagaatgactcatcgcgttggagttatattgcttgaagttctatggaatgaaagcacgt +tatctgccgtcccaatatctccagtgagctaattcattggacggtccactttgatcaatc +cccgaggagatgttcggacactttagtctgtaacacttagcgttgagaccacgaacaatt +gattactcagtcttgaaggtgttttccaaagttcattttaaataagactacgataggcct +ttcctattgatataaactacccggctctgttgttcgtgtgagtcgtacttctctgtgttt +ttctgattatagcaagattcgattcttagtgtaaacagcgatttttatttgacccgtcaa +tgagaagcgcataggatctaagcaaaattatcaagttgtgccacaaggtaagatctttcc +agttattgcaggtaggatgtatcccacgttgatagtatgaggtctgacgtcaactgtcta +ggagagttgaccgcgtgcgggtacaccggatttgcatcgatgttgagaacgcagaactcc +cactgtcgtggcggcgttcctgatatttagcaagaggcgttgataaagccctcatcatct +agatctcgacctcatctgccctcttgctccatcattttctacacagactactttcctatc +tacgttagtataattgctttctatcttagtatcatttagagcttctccgtcaacaggttc +gtgctattaaagttagtacgaaagggacaacttgtagcaacgcatttaatcggttttcga +ctacttcgcacaaaatcagataaagaagtttgtcattctattagacattgaattgcgcaa +ttgacttgtaccacttatgatcgaacactgaatcaagactgtgattaactaaaatagaca +agccactatatcaactaataaaaacgcccctggtggtcgaacatagttgactacaggata +attaattggactggagccattacattctctacaatcgtatcacttcccaagtagacaact +ttgaccttgtagtttcatgtacaaaaaaatgctttcgcaggagcacattggtagttcaat +agtttcatgggaacctcttgagccgtcttctgtgggtgtgttcggatagtaggtactgat +aaagtcgtgtcgctttcgatgagagggaattcaccggaaaacaccttggttaacaggata +gtctatgtaaacttcgagacatgtttaagagttaccagcttaatccacggtgctctacta +gtatcatcagctgtcttgcctcgcctagaaatatgcattctatcgttatcctatcaacgg +ttgccgtactgagcagccttattgtggaagagtaatatataaatgtagtcttgtctttac +gaagcagacgtaagtaataatgacttggaataccaaaactaaacatagtggattatcata +ctcaagaactctccagataaataacagtttttacgatacgtcaccaatgagcttaaagat +taggatcctcaaaactgatacaaacgctaattcatttgttattggatccagtatcagtta +aactgaatggagtgaagattgtagaatgttgttctggcctcgcatggggtctaggtgata +tacaatttctcatacttacacggtagtggaaatctgattctagcttcgtagctgactata +ctcaaggaaccactgctcaaggtaggagactagttccgaccctacagtcaaagtggccga +agcttaaactatagactagttgttaaatgctgatttcaagatatcatctatatacagttt +ggacaattatgtgtgcgaaactaaaattcatgctattcagatggatttcacttatgcctt +agaaacagatattgcccgagctcaatcaacagttttagccggaaacaatcgaagcatagg +gacaatgtatcttttcctaaattgccatgtgcagatttctgagtgtcacgaagcgcataa +tagaatcttgtgttgcctcaactcgttgaaaagtttaaaacaatcgcagcagtctttttg +gggtctactgtgtgtttgcaaaataactgaaagaaacgcttgaacaactctgaagtagct +cgagtactcattaaagtgtaacacattagtgaatatcggccaatgaaccaaacgcttccc +ggtacgctatctctctcatcgggaggcgatgtgcaggttatctacgaaagcatcccttta +cgttgagagtgtcgatgcatgaacctcattgtaacaatagcccagcaaattctcatacgt +gcctcagggtccgggcgtactcctccatggaagggcgcgcatctagtgttataccaactc +gctttttaactactatgctgtagttctacaggcatagtggccagtattttctaacttctc +tggatagatgctctcactcctcatccatcacggcttcagtttacgtcttacttgcttgtt +cagcaacggatggaggcattaagtatcttcactgttccctaaaattgctgttcaatatca +aagtaaggacgatacagggaaagctcaagcacactcattgaatactgccccagttgcaac +ctcacttaatctgacaaaaataatgactactctaagtgttgcggaagcagtctcttccac +gagcttgtctgtatcacttcgtataggcatgtaactcgatagacacgaacaccgagtgag +aaactatattcttgcttccgtgtgtgtgacaccaggtaattgatgcggatataagctgga +gatcactcacgcccacacaaggcgctgctacctctttattccaatgtgtaagaatttgct +aacttcatttctagaccgcagctttgcggtcataatttcacggtacggacccttgggtta +gagacttgataacacacttcgcagtttccaccgcgcacatgttttagtggcttctaacat +agaatttttgttgtgacataaagagtgcgtgggagacttgcccgaccgttaagccataat +caattgaaagccccgtgagtcacatctaattggttgtactgcgcatttagctatccttta +gctgactcgaagagattcgattcctaatataggttaattagatggctgccgcgcgaagta +aaacgtgaaaaacgtagtgcgcagatctgcataactcgcgcttaattacttatgagtagt +tccaagttcgctacgttatgagagagattggaattaagcaaatatgttttatggtgattt +tgggatgagaaggactgctaagtacggctactaaacaaatttctaaaaccgccatctacc +ttatcttggagacatttaagttgtatatgtcactagtctagcttttgtctgtgggacgcg +ttctcggaatgagggaaatgcaagagccgattcatcaaatgcttatctaagaaagtagtg +gactattacaccaagcacgaatgccagggaactgctttcttgctcaggacctcgcgacaa +ggtaccccgcataagtcctagaattacatttggtcagcaatgctgacatttgaccgtgaa +aacataattttaatcagaaggcagctcacccgcttgctctagatcttatctttgtatgaa +tgtcagaatttactgcaatatccgttccgaatagtgagggcttagtatagttctctgtat +acaggtcacatcaaactccccctgtcctagtacagctctgagctttaattaattgcatac +atttccttcaatcatcagatgaaaacaccgcgaatcatgctcttctcgtatagggcaaga +gaagcaacaaacaactagcccgactcacgttcatccgccgtatccttgttcagttcttac +tccgtattaggtcagcgaaatctaatcagaataatcggtcgcgtatcaaaattaaaatcc +cgcttgaggttgacaattaaaacgctgagcagttatcggctattagatagtggggtgaaa +gtaattggctggaattatgttaaaacgtgatattaagctaaaatacgctacttgttgccg +acctaattcagtcattcgatattcagttagagccaagaataacaagcttgtataaattga +acggggtgcactaaacgatgtgttactctaatattcagcttggagtatacctgaaggcga +attcatgtatcggccaataataagacgttgaagatcacaatttggactagcaaaagaagg +tgatttatgcgtggggattgagtccactgtacgagtacggtctctggaaaattataggtt +cagggaatataaggaagtaaagataattaccaagagatttttggtatcgctatgacccag +aggtgttctaacgtctgttttgatccgcagaatttctgcctcaatgcatatttgacggac +ttgaactagagcctctaaagttaaatggcgacgcaactgttcctaaacttcaattattac +tactctttttttcctagggtattgtagaggccagtggacaaaataaatcaaatttaagat +gtttcggacattaacatcccccgtagcatagaaatcatcagttatccaatctctcatcga +gcttttacaatttctgctggcgctatggacagcatatgccgcgagacctccgcaagactc +acttgatcactgtaagtatcttcattagaggttagagcctatagttaagctgctgaccta +gtaaaattggtattttctaattttattgctcaagttaaaggttagtgaagggataatgac +gttatttttgaacaatgggttgtattcaattttatatcacgaatggaacccttcattccc +ggcataatactagacgacacgaacaagctccgatctatcagccaggcacgtgttaaggtt +taattccggcaaaccaatgaagcatcaaaaggtgacctgatgcaacttagggtcacgatg +agtttttcaggactacttattacctattaataagttaacatgagccttcataccccgtaa +gacaatacatactccaccaattagaattctgagccatcttatctttttgtatcatcgaag +ggtatggccgaataggttaattagttactcctaacgtctctacaggcatgcatttgacgc +accttcgaaaatagtcaatctctcgccacacgcgtctagtatgcagcatcaaaaatatag +tccacggtttccggattaccaaacgcggcaaagagaaacattgtatcgacggagataact +taatacagaaggaaggggcatcttcgaatacggatgaataattctatctgtttattctga +catcttgttttcaggttaatcttacgcattcaaatgacgcctgccccatgcgtgcgcaat +tattttctaatattgacgagagcaatctcactccttttgggtctatttatgttttattga +ggcacaagcctatacagaacaggtactattaaggccgtgagtgtgagactcaaaccgtgg +aaacaaaggatgggttgttcttggtacaagttttagtgcatgtgggcaatccttaccaaa +atcagatgctatccttaactttgggctgcatttaagatggcggttggaggcctgtgagaa +tcctgcgtgtcatctttaatgaccgaattcatccatgtagattcagatcacacactcatt +ccttgatgttgtctaaacaaaagttgttgtggacgcattggagggagttaagtaacaact +tgggatcgcatacttataaaaattatatgttaaactttcacaaacgctgaagtccaaagt +aactagcccaaacgcctcgagagtcactaggtattaatggtgtttgagttcctgtgaaat +agtgttcgaaggtaaaatttatgtaccaaatcgaaagaacacttaataaggcttgcttgc +acggaggtatgatgtttactgactctacaaccctaattttccagtacgtacattcattcc +aataggttagttctcaaagtgctatacaggctcctcaattgatgatatgcttcagccgct +ctatggatattagctcattttatttaggaagcccgcttagaggcttactatgagggaaat +gccaaaatgtcatacttttcggtgtgtcccatatgacaccgctttacatagaatttgaat +taaaacgcgctctcccgttcactaccatacttggtaccgtgcgcatattacatatagata +taggatcattttttaaagctgtactaggtttgatcgacaatcttatgctatactatatga +tgtaaccctcataatcaataccgatcgtacgatcctagcataggtggcaagcgattttat +gccgattattgtgttaaatagtctgtgagtgtgattatcagggctacgttggtagagggg +ttgtatagacctcgcacacattgtgacatacttaacaatatacgaaaactgatataataa +atccccttacccaaacaccaatcccgttgaatcaactaccataacgtctcccatataaat +tgcctacttgtttgcataaatctgaatacataacaccattgcaccttcttgtgttccaat +cccgttaagattgccttgtcagatgatatgcaagaacaatagcatttgctagcaattatt +aacagctcttcgaattgcctccacataacgcgggagggtatattttaatttggcaaatac +taagtactgttggcgtcatatgctattaacggttggatattaagttatgtcagccgtaag +caagagtgggcgaaatattttgttacccagtgagagcactcttagagtttggatacaata +ggccatatgttgacttaagaggacgtaactacgccgtacaccattgttcaaccgacttct +tggcaaatagaatcgtattagcaatcttaagaatagagacacgttcgtgttagggtatac +tacaaatccgaaaatcttaagaggatcacctaaactgaaatttatacatatttcaacgtg +gatagatttaacataattcagccacctccaacctgggagtaattttcagtagatttacta +gatgattagtggcccaacgcacttgactatataagatctggggatcctaacctgacctat +gagacaaaattggaaacgttaacagcccttatgtgtacaaagaaaagtaagttgttgctg +ttcaacagatgatagtcatgacgcgtaacttcactatagtaaattgaaacaaatacgcaa +tttagacagaatggtacggtcatgaatgacagtaattcgaagtgctagaccaacttaaaa +taggtaaacgtgcccgaaaccccccttaacagaaagctgctatcatggtgcagtatcgac +gtgttcagaaacttgtaacttttgagcaggtccgagcacatggaagtatatcacgtgttt +ctgaaccggcttatccctaagatatatccgtcgcaaactttcgatttagtcccacgtaga +gcccaagcgttgtgcgactccacgtgcatgcccagaaatacgagtttaaatttggttaca +tggttaattttgaccgaagcatcgcactttatgattgataattggattcaatatgtcgcc +ctatgcgaatgcaacatgatccacaatttggctataagacgtttaatccgtatcacactt +tgtttgcggctagtatagtaacgcccgtgcaccaagagtcagtaacaattataagtactc +cgcaggtacttcaaatataaaaactaatcaaacacgacccatatgatcatctgaagatat +ttggaactttctcgacaaccaccctcgtactcaatacttacactaatcgacaggcacacg +caacgtgtacagtcgcaccatattgagtcaagatttgcttagtggcgatgagcgtacacg +cttatttctctagtcacaattagttatctacgagacatcacgagggagcaaataagcgat +gttatggctacacataggcacgtatgaatatgatataagccagttaaacagtcgaaccat +cgagcaaattctcatgcaccaacccacacgttgaggcacaaagagtaagctgtttgaatg +taacttcttctgctgagcgggccccaacgtaaggatcaactagaagagaaaactcggtat +tagtttaaatgcgtcacggagcatgagtgcatttcactaagaatgtctgtgtaaccaata +taacatctatttgttatctgattgcctacttatggctttgcggtcgtggcgactaatgtc +tccaatccttttgaggtcggtaccaactccctttaaattacgctgtgcaggctcatgcac +tgcatacatatacggtagcaggtagggacctcacgcacccttattataatcaatagtagt +tatcagtcaacgaggcaggaatgctgaggtcgaggtgttggtatattttctatgtgccgt +ctaggcgactatcacgcattaccaggcgagatttaagccaattttgaatatagtcaacgt +aatttttactatgggttccaccgaaacgccttgcacaactaagaatcccataaaatatcg +atatcaaataaaagattgtgtcaataccttcatatatattttttcggttgactaacgtga +actaaggttaggggttttgtatgtctatataggaaacagtttcttttctgtcctacttta +gtaaagtcttcaagccttactccaaaatcacggtgattaagccgttactcagcagcatga +ttctgcctgctcgggtcctaaaatccagccttgtaagagtcgctgtgtattagctaggga +gacctttgttaaaaaggatatatcgcggcgggatgtgagtgcgtggcgcatactcaatct +tcagctcgtgtcattataatatctctcccccacgcttttcactagatatgccgtgtaagc +aaacaccttatgcttaatttcgaaaatattggtacttgaaaaaagctgtaggggtactta +atgtctggtaggagatcaggagagaattgagtgtaaaaccgtaaagccctcacctgactt +catgtaaatggcttagaagactccatgatttaataaatactacgaaggaaagactggatc +taaagataactctagtaaggccaactcccttcaatgctgttgccagttataatccaagag +ctgtccttttctgaaccatagcggcttctgaagcgaactagaagcaaagttggttctagc +cagacagccacataccctgtacgggtgtattactaaaactggtccggtattagttcacca +agggaggaattaggcaaaggatctaggtatgcaagtcggagtattacatccctaccctga +atccatcaataggttcctctgtactggccttcgcaatgagtattcaaggttgtacagccg +tataataataagatagtgactatgaacgggaagtaacccgctcaccttccccaaaacatt +gttatatctaagtattaaagtctgccgtagtgttaatactcgaaaataaacaactggcaa +attacaccgcacttaagccgcttttgatttatatttttccaatgcgcttttaaaaataat +tcagtcctacatactaattaagacccttaaacggagatatcacaagttaagttttaacca +tctcgactaggtggaactatagatacccaactcaatttatcattacctgtaatgttccta +gaaggattgcatttcatgtcaagacggtggagtttcacagcgaaacttcagtgtgaacag +attctgagaaatcacctaaacctattagtcagagcacccggttagaaccagttgtcaaaa +aatagagcggttgcatgagacagaagtaacgatgagatccgttgtaacgttgagacatct +ggcctatcgtcaatacagtcctcccttaaaaatatttttaaatactaggcaaacccaaca +taggttagtcctatgtgatacgccacatggtatatcattttgtaacgttacctagggata +atcaggaagtggaattacgcaaaagtagacagtgaaatgcttagggttatagtctagtcc +aaagataaaggataaagcacgtcagagaactatattagccgaatgggaatcattgttagg +agactgtggatcatgtctaaaaagcaacgcagaaacagtcatcgaaaaaatctcgttttt +gtttgaatctaaaagagctttgatgaccgatagtacctgtatactagttactgtattacg +tgtctaatgatttcggattggggtccccagaatcagacgtcattgtagacgattcaagtt +taccaatttaatttcccagctctccttggagaactatcgccaataattgcagtcactttc +cttttctgaaacgataaagccgtcagagttctctgcaacgttggacttacctgaggttct +aacccactttcggttctaatagtagttaacgacacaacgaataacctttactgtggggct +ttcacgatattttttcgcttattattaatggttacgtcataagctggtgtccaaattaag +gttaccggcttcgcagagtagttgtatccaagtataacttccctaatcataagatcgagg +tagaaaattaatgctgtctctaaccgaacagatatgtcccactatgtggtatggacgttg +ctaattacttctgaagggaaattggtcattatggatacgtgtctaccatcaggtcggacg +cagatatggttctgtcttcagttgatccaccgttctttataggataataactgacgatta +aagattatggtaaatagattaagccaattctcttcttgtcagtgaagcatccttaactga +cttgctctgcagcccctcatacatttagctattcaaagtaccggctcgtttcaaactctc +ccacctttggaagaggttgtcaacttgataagtatatcatttacagcattttttcggacg +tacctctaatgtttcattgcagaaaattagttttttctatcgcacattttgcaagtaacg +ttagagacacaattatctgcgaatgaactgctagatctgacgaccgggagcctcgcaaat +atcaaaaaagactgacatatatcaaggagtcgttgacaagtgctggtaagtcaattggtt +tatctgtcccggcgtttcgatcttaagctgaccatgcacggcagagtaatgtcactctcg +ttcttacaagtctgtctccaagggtcggcaaaaaagacccctccattctcgagcccactc +acgatatgtagggacgacaacttgtgcggcttatgaattgtctggactgcgggcgagggt +ccatatctccgaagttagaagggacatacctttagatgataagatcaattcttattgacg +aaattcatccacaacggggaacaacttcaccctagacttacgtctgaaaagacacctagc +gtcttataaaaggtcagtgccccgtttcgtaaggctggaattacctacgcaaacttaaac +ctcgcgcccttccttacgtatcgacaagatagaggctatcgcgaatgtactacggaggca +tgaatcatatactagaaccaagtgcctgtgatattaacaagatgatccgacgcgagcacc +gtaattctaggcataaaactccagcaatttgggggccgaaaacaaatgacgttagctaat +taattatatgacatgatcaaaggaggtcaatcacgcatcgagttcgacgtatattcattg +aacttcgtgcgtttgaaagaaacttttatgaaggcaaaattgatcctgtctcctatttca +tgcgtacctcctagttgataattccccgagcagtggttaggacacttttgtcggtatcaa +gttccggtctcaaaacgtaaaattctgtaatctgtatggatggtctgtgaattagttaat +ttttatgaagtcgtcgagacgcagttcctattgatttattctaaacggagatgtgcttcg +tgggactcggaagtagatctgtgtttatgattattgctactttagatgctgactgttaac +tccgtgttgtttttcaaccgtatatcacaaccgaattggatagaacctatagtttcaagt +tctgccacaaggtatcatatttacagttagtgctggttgcttctttcaaacgtggtgagt +ttgtgctatcacgtcaacggtagagctcagtggaccgagtgcgcgttcaaccctgttcca +gagagggtgtgatagcacatataccacgctcgtcgaggcgttcatgatagtttgcaagag +ccggtgttaaacacatattattattgttatccaactaatcggacctatgcataaagcatt +gtctaaacagaataattgcctatatacggtagttttagtgatttatatcttagtatcagt +tagagcttcgaactcttcaggttcctcatatttaacgttcttcgaaagcgaaaacttcta +caaacgaatgtaagcggttttccaagtagtacctataaatcacagaaagatctgtctcag +tatagttgaaatggtattcagctagtgacgtgtaccaattatcatagttcactcaagcaa +gacgctcattaacgaatatagacaagacactatatcatataataaaaaagaacatggtgc +tcgaacatagttgaattcaccatattgaaggggaatgctgacatgtaattcgctactaga +cgatcaattccctacttgtcaaagttgaactggtacgttcttggaattaaatatgattgc +gctggaccaaattgcgacttcttgagtttcagggcaaacgattgagccggaggatgtccg +tctcttacctttcttgcttatgataaacgacggtccctgtacatcactgggaattctcag +caaaaataattgggtaaatcgagactcgatgtattcggccacaaaggtgttagacgttaa +agattattcaacggggcgataataggatcataaccggtatgcaagcgcattgaaagagcc +atgagatccttatccgataaacgctgcacggtatgtgcagccttattgtcgatcacgaat +ttataaatgtagtctgggctgtaagttgaagacctaagttataatgaagtgcaataccaa +atcgattcatagtggattatcagactcaagatatctcctgataaattacagttgttaaga +tacggataaaatgagatttaagattagcagcctctaatctgtttcaatcccgttggaatg +tggtatgcgatcaaggttaagttaaaatcaagcctgtcttcagtcttgattcttgttctg +ccatcgcatgcggtctacgtgagttaatatgtagcttacgttctagcttgtgctaatctg +agtatagattcgtagaggaatattatcaagcttccacgcctcaacgtacgtgtattggtc +acacaagacactaaaagtggaagtagcgtaaactatagtctagttgttaaatgctcagtt +cttgttatattcgatatactcttggctaatttatgtctgagtatataaaattaatgatat +taacttgcatttcacggatcccttagaaaaagattttgaccgagcgcattataaacggtt +acaccgaatcaatagaagcatacccaatagctttctttgaatttattgcctgcgcaactt +ggctgactctctagatccgaataattctatatggtcgtgacgaaactagttcattactgt +ttaaaatgccaacatgtcttttgggccgataatggctctttgcaaaattactcaatgata +cgattgatcaaagcggtagttgctagtggtagcatgtaagtctatcaaatgtctgattat +ccgaaaatcttccaaaagagtccacgtaccatatctatctcatagcgacgcgaggggaac +cttatctaactatcattccatttaccgggtgactctcgatgcaggatccgattgggataa +attgcccagaaatggctcattcctgactaagggtaaggccgttctcagcaagggaacccc +gcgaatctaggcttataccatctagattgttaactacttgcctgtagttctacagccata +ctggacagttgtttctaaatgatcgggattcatgctagcactcctctgaatgcaccgcgt +aagtttaactattacgtccgtgggcagataaggatggaggctgtatgtatcttaactgtt +acctaatatggctggtaattatcaaagtaaggaccttaatgccatagcgctagcaatcgc +tttgtatactgaccatgtgccaacctctcttaatctgtaaaatataatgtcttagctaac +tgtggacgatcatgtctctgcctagagcttcgctgtatcaattcctatagccagcgtact +agtgacacaacaacaccgtgtgagaaaagatattagtccttacgtctgtctctctacagc +ttattgatgaggattgaacatggacatatagctccccctcaaaagcagatgctacctctt +tattccattctcgaacatttgccgaacttaatttcgacaaacctgaggtcacgtcttaat +ttatcggtaacgtcacgtccctttgagactggataaatatattaccaggggccaacgagc +aattgttggaggcgcttctataatacaaggtgtcttgtcaaagaaagacggcgtgcgtct +cgtgcaactcacttaaccaatattaatgtgaaacccccctctctcacatcttatgcggtg +tactgccctggtacatttcctgtacaggactccaacagtgtagattcctaagatagctgt +tggagttgcctcacgccagatcgaaaaactgaataaactagtgagctgagctgcagaaat +accgcttaattacttatgactagttcaaagggacctacgtgatgtcagacattgcaagga +agaaattaggtttgtgcgtcattttggctggactagcactccttacttcccctactattc +aaatgtcgtaaacagcatgagacaggatcgtgctgacatttaaggtctattgggaacgag +gctacctttggtcgcgcgctcgcgttctccgaatgaccgaaatgcatgagcacagtatgc +aattgcttatagatctaaggtctggtcgttgaaaccaagcacgtaggcctgggaaatcag +ttcttcctcagcaactacacaaaagcgtccaagcattagtacttgtagtaaatgtccgaa +cctatgcgctcatttgaaagtcaaaaaatatttttaagcagtaggcacctaacccgattc +ctctacttagtagctttctttgattctcagaattgactgcaatatcactgcacaattctg +tgccattactagacttctctgtattaacgtctcatcttactaacactcgcctaggacaca +tctgagagtgaagtatttcaatacatttactgaaatcttcagttctaaaatccccgaata +aggctcttatcggtttggccaacacaagaaaaaaacttcttgcaccactcaccttcatac +gcaggagcctggggaacttagtaataactatttcggcagacaaagcttataacaagttgc +cggcgcgtataatatttaaaagaccccttgagctgctcaattaaaacgctcacctggtat +aggctattagatagtgccgtcttagtaaggggcgggaattatcggataaactgatatttt +gataaaataaccgacttgttcacgacataagtcactaaggagattttatctttctccaaa +gtatatcttccttggataatttcaaagcgctgcaatttaagttctgttactagtttatgc +tgctgggaggtgaccggaaggcgtagtaatctagaggcaaattataagaagttcatcata +tcattttcgactacaaaaacaaggtgttgtatgccggcgcattgtgtaaactggacgagt +accctagatggaaaattatacgttaagccaagatttcgatgtaatgataattacctacac +atttttgctatccataggaacaagagctgttctataggctcgtggcatacgaacatttgc +tgccgctatgaatattggaagctcttcaactacagactctattcttaattgccgtcgaaa +atgggccgaatcggctattattaatactcggtttttccgaggggattgttgtcgacagtc +gtaattattattaatattgatgttggtgaggtcatttaaatacaaccttgcagacaatga +ataagggatccaatctctcatactccttttacaattgctcatgcccctatgcaaacctta +tgccgccacacctccgcaactctctcttctgaactgtaagtagcttcattactggtttga +gactatactgaagctgatgacattctaaaatggctattttcgaatgtgattcataatgtt +tatcgtttgggatggcagaatcacgttatttttgatatagcccgggtattctattgtata +gaacgtatgctacaagtcattccccgaagaagactagaagtaaacaacatgcgaccatcg +ttaagccacgcaaggctgtagctttatttcccgataacctatcttccataaatagcggac +agcaggatactgacgctcaacatcagtggttatggtctaatttttaacttttaataaggt +aacttcagcaggcatacacagtaactctttaatttataatcaaattagaagtctgacact +tcttatatttttctatcatccaacgcgatcgcccattagcttattgtgttactaataacg +tatctaaaccaatccttttcaagctactgcctatattgtcaatatatacaaacaacagga +tagtaggctgcttaaaaaatattgtcaaccgtgtacgctttacaatacccggaaatcaca +aactttgtagacaacgagtgaaatttatacactacgaagggccagcgtacaagacccatg +aattaggcgatatgtttattctgacatattggtttatccttaatctgtcgctgtaaaatg +aagccgcccccatccctgcgaattttttttcgaagattcacgactgaaatataaatacgt +ttggctatatttatgttggagggaggcaatagcctttactgttaaccgaagatttagcca +gtgagtgtgacactaaaacactggaataaatgcaggcgttcttctgggtaaaaggtttag +tcaatctcgcctataagttcatatagctctggatataattatctggcccatgcatttatc +atggcgcttggtgccctgtgtgaagccggcctctcatattgaaggtccgaagtattccat +gtacattaagatcactctctcattcatgcatcttggcttaacaaatctggttgtccaagc +tttccaggcacgtatggtacaaattcggatcgaatacttataaaaatgatatgttaaact +gtctaaaacgctcatctacaaagtaaagtgcactaaccaatagagtctcaagaccgtgta +atgctggtgcactgaatgtgtaatacggttagaagggattagttatgttacaaatccatt +gaaaacttaagaagcattgcgtgctcggagggtgcatcttttatcaagagactaacatta +ttttcaacgacgtacatgctttacaatagggtacttatcaaacgccgagaaacgcgccta +tagtgatgttatgattatgacccgatatccattggaccgaattttatgtaggttcccagc +gtactcgcgtaatatctcggtattgccataatgtaatacttgtcggtctctcccagatga +aaaagcgttacagagtatttcaatgaaaaacagcgcgcaacgtcaatacctttaggggta +acggccgctgatttcatatagatatacgataagttggtatagctctactaggtggcatcc +acaatcgttgcatttactatagctggttacaatcataatctataccgttccttacatact +accatagcgggatagcgtttttttgccgttgattgggtttaagaggatgtcagtctcatt +atatccgattcggtgggagagccgttgttttcaaatcgcacactttgtgacataatgtac +aagataacaaaactgatataagatataaactgtcaatatcaccttgacacttgaatcaaa +gtaaattaactcgcaaatataatttgactaattgggtgcagatttctcaattaataaaaa +aatggcaccggatgggcttacaagccccttatcattcacttgtatcatgatttccaagaa +caatagaatttgctagcaagtatgaacagagattcgaattgcatccacagtacgccggag +cgtttattttaatgtggatatgacgatgtactgttggcggcatttgctagtaaccggtcc +ttatttacgtagcgcacacgtaagcatgtctgggagaaatatggtggtacaatctcagag +aaagattacagtttggtttaaataggacttatcgggtcggaagtggaacttaataagcag +tacacaattgggcaacagacgtcttgcctattacaataggattacaatgcgttagatttc +agacacgttcgtgtttggctattcgtcaattccctaaatagttagacgatcaactattat +caaagtgattctttgttcatcctccattcatgtaacagatggcacactacgcataacgcc +gaggaattttaacgagatttaagagagcagttcgggcacaacccacttgactttataaca +gctcggcagcataaacggtaatatgtgacaaatttccaaacgttataagaacgtatgtgt +acttagaaaactaagtggttcatgttcaacagatgtgacgcagcaagcctaacttatcta +ttggttttgctataaaagaacaaagttacacagaatcctaagggcttgtttcacacttat +gcctagtgcttcaccatcttaaaatagcgaaaccggcacgaatcaaaccttaaaacaatg +cgcagatattggtgatggtgactccgggtatgataatggtaactgttgaccagcgcccac +ctcatcgaagtatagaaagtggttaggataaggatgagaccgaacttatttccggccata +actttagattttctacctagtacacaacatcagggcggacacgaaaccgccatcacatca +tataccaggtttaatttgcttaatgggggaagtgtcaacgaaccttcgaactttagcagg +catatggccattatatatggccccagagcagaatgctacagcagacaaaatttggattta +tgtagtttaatacctatcaaacttggtgtgaccatacttgtctaacgacagtgcacaaag +tgtaagttacaattattactactcagcagcttctgcaatgataaaatcttatcatacacg +tcacatatgataatatctacttagggggaacgggctccacaacctacatagtactcaata +cttacactattcgacaggcacaccaaacctgtacagtcccaaaagattgagtcaactttg +cagtactgcagatcacagtaatagcttagttagcgagtcaaaattagttttctacgagac +tgcacgaccgtgcaaatttccgatgtgttggctacaaatagcaacgtatgaatttgtttg +aagccacgtaaactgtacaaccttagagataagtctcaggctactaaaaacacgttgtgg +cactaacaggatcatggttgattcttacttattcggctgaccggcccaataagtaacctt +caactagaacagaataatcgggagtagtttaattcagtcaaggtgcaggtctcattgtaa +ctaacaagctctgtgtaaccaagttaaaatcgttttcttagcggattccctacttatgga +tttgagctcgtccacaatattcgatacaagaagtttgtggtccgtaacaacgaaatttta +attacgctgtgcagcctcatccaaggaattaatagaaggttgatggtaggctccgaacgc +tccatgattataatcaagtggactgtgcagtaaacgaggaaggtatcctgacgtcgtggt +gttcgtttttgttatttgtgccctatacgagtagataaaccatgaacagcacagtgtgaa +cccatggttgattttaggctaccttatttttaatttccgttacacagaaacgaattccac +aactaacatgccattaatttttcgatatcttataaaagatggtcgaaattcattcattta +ttttttttcggttctcgaaagtcaactaagctgtcgcgttttgtttctctttagaggtaa +aagtggctttgatctcctacgtttggatactagtcaaccattactccatttgatccgtga +gtatcacctgtctaacatccagcattatgactcctcggcgaagaaaagacacacttctta +gagtcgatgtgtattagctagggacacagttgtttaatacgatagtgagcccagggaggg +cagtgcgtcccccagtagatttattcagctagtgtaagtataagatatctcacccacgag +gttcaagtgatatgcagtcttagaataatacttatcctgaatttcgatattatgggtact +tcaataatccgctagcgctactttatgtctcgttggacagcaggacacatggcagtctta +aacactaaagacatcacctgaatgaatgtaatgggattacaagaatcaatgaggtattat +atacgacgtaggaaactctggatatatacagtaatctagttacgccatcgcacttcattc +ctctggaaacttagaagacatcagctgtacgtggaggaaccagacccccgtatgtagcca +aatagaaccaaagttgcttatacaaacacacccaatgacaatggaccgctggagttcgta +aactcggaacgtagtactgcacaaacccagcatttagcaataggagctacgtatgcaact +cccacgtggtaataccttcaagctatcaatatataggtgcctagctaatcgcattcgcaa +gcagtattcaagcttgtaaaccagtataataattacagaggctctatgaaacccaacttt +ccagctaaaagtcccaattaaatggttatttcgtacttttaaagtcgcccgttctgttat +tacgcgaattgattctactccaaaattaaacacaaattatcaaccgtttcatttatattt +gtcaatgcagctgtttaaaataaggctctactaaattataattaagacacttattaccag +atttctctagttaagtttgaaccagctcgactaccgcgaaagatacattcccttctctat +ttttcagttcatctatgggtcagagaagcattgaatttattctattcaccctcgtcgttc +acagcgaatcgtcagtgtgatcagtgtatgagaaatatcctaaaccgtttagtcagacca +cacgcttagaacaagtggtctaaaaagactgccctggaaggagtaagaagtatacagctg +atccggtgtatccttcagtcatctgccctatactaattacacgacgcaaggaaaaatagg +tttattttctaggcaaacccttcataggtgactccgatgtgttacgaatcatgcttgaga +atgtgctatcgttaccgacggataataacgatctccaatgaaccaaatgtagaatgtcta +ttgattacccttttactattcgacttagagataggagatagaacctcagtgtactttttt +agccgaatgggaatctttgggaggtgaatggccataaggtcgtaaatccaaccctcttaa +agtcttccatattatatcgttgttcgtggaatcgataacagatttgttgacccatagtaa +atgtatactagtttatgttgtaagtgtagattgttttccgattgccgtccaaactttatg +tcgtaattgtagaccagtaaagttgaccaaggtaagtgcccagcgatcctgcgagatcga +tcgccaatttttccagtcactgtaagtgtaggtttagataaagccgtatgagttatatca +taagggcctcggaaagcagcttcgaaccaaagttcccttataatagtagtttaactataa +aagtatatactggtctgtcgccctttcacgatttgttttaccggtttatgaagcgttacg +tcattagagcggctccaatttaaggttaacggcttccatgtgtagttgtatacaaggata +acttaaagtatctgttcagcgagctagttaagttatcctcgatagaacacaactcagagg +tcccaagatcgggtttgcaacttgctaatttattctcaaggcaaattgggaattatcgat +acctgtataccataaggtcgctcgatgtgatgcttatgtcttctggtgatcctaccttag +ttagtgctgattaacggaacattaatgtttatcgttttgagatttagccaattctctgat +tctaactcaagatgccttatctgacgtgctatgcagcccctaagtattttacattgtaat +aggacacgctcctttaaaactcgccaaaaggtcgttgtggttctctactggttaactata +taatttacagctttgttgagctagttcctctttggtttaagtcctcaatattagttggtt +cgagcgataagttggctagttaccttagtcactatattagatccgaatgttatgcttcat +ctgaagaccgccaccctccaaaatttcttttaagactcacttattgcaaggtgtaggtga +attcggctcgtttctcaagtggtgtatctgtacacgagtttccatattttcatcaacagc +caccgcacacttatgtcactctaggtattaaaagtcgctctacaaggggacgcaattaag +aaacagacatgctagtcaaaaataaacatagcgaggcaccactaattcggccgcttatca +atgggatgctctgcgcgagacgcgccagagctcagtagttagttcggacatacatttact +tcagatgatcaattagttttctacaaatgcttactctaccccgaaaaaagtcaccagact +cttacgtctctttagtatccttccgtcttatataaggtcagtcccccgtttcggtaccct +ggaatttactaagaataatgaaacagcccccaaggacgtacgtttacaaatgatagacca +gatcgcctagcttattccgacgcatgttgcatagaattgaaccaacggaatgtgagagta +actagatgagccgaccacagcacccgtttgcgtcgcagaatacgcctgatagttcggcca +cgaaatcatatgtcctttgagtattaagtatttgtaatgatcaatcgagctcaagcaagc +ttacacttcctcggatattcagggaacttagtgcctttgaaagatacgttgatcaacgaa +aaattgataatggctcatatggaatgcctacctcatagtgctgaattaacacagcactgc +ggacctaacttttcgaggtttcaagttcacgtctcaaaacctaataggctggaatatgta +gggatcctcggtgaatttgtgattgggtttgttgtagtactgaccaagtgaatattcttt +ttttctaaaagcagatctgctgccgggcactacgaaggagatctctgtgtatcattattg +cttcttgacatgatgactcttaaatcactgtgggtgtgcaaaacgatagcacaacccaat +tcgatagtacatattgttgatacttcgcactaaaccgttcatatttaaaggttgtgctcc +ttccttcgttaaatactggtgacttggtcctatctactattagctagacctctggggaac +cacgcccccgtaaaacctgtgcaagagagggggtcatacatcttagacatcgcgcctcca +ccagggaagcattgggtgattgaccaggtgtgtaacaaatatgattattcttatactaat +attagcaaagatgcataatgatttgtattaaatgtataattgaattgataagggtctttt +agtcagtgatagagtagtataaggtagacattagaactcttaaccggacgcagatttttc +ggtcttagtaagccaattagtcgacaaaacaaggtaagagcggttactagtagtacctat +aatgcactgaatcttcggtcgaagtatagttctaatgctatgcagattgtgacggcgaca +aatgttcagacttatatcatgaaacaagctcttgtaagtattgacaaatgaaaagattga +atatttttaaatacaaaatgcgcctacttattaggggaattaaccagattgaaggccaat +cctcacatgtaatgagataatagacgataaatgaaattcttgtaatagttgaactgctac +gtgatgggtattatatatgattgagatcctccaattgccgacgtcttgtcttgatgccca +aaagattgtcaacgaggagctccctcgcgtacctgtcgtccgtatcataaacgacgcgac +atgtacagcactccgaagtataagcaataataatgcgggtaatccagactagatcttttc +ggactcaatgcggtttcacggtaaacatgattaataccggagagtagtcgagcttatcag +cgatgcaagcgaattcattgtgccaggagatacgttgcagataaaaccggcaacgtatgt +caacaagttttggcgatctcgttgtttgtattcgacgaggcgcgggaacttcaagaacta +tcgtatattcaagtccattaccttttagtttcagactggtggagctgactaaagttatat +catcattttgtacactggtttagttaacgataatttcagatttaacatgaccagacgata +atcgctgtatatccagttggaatgtggtttgccagaaaggttaacttataatcaagcctc +tcttcagtcttgattcgtcgtatcccatccattgcgctatacctcagtgtatttggagct +gtagttataccgtgtgctaagatcagtagacatgacgagagcaatattatctaccttaca +agcatcaacggacgtctagtcggaacaaaagactctaaaactcgaacttcaggttaatat +actatagttctgtattcagcagttattcttatattcgatattatcttgcctattggatgt +ctgactttagtatattaatcatagtatctgccatgtaaaggtgccagtactaaatctgtt +tcacagtgcgaattataaacggttacaaccattaaagacaacaagaccctatagctttat +ttgaattttgtcaatgcgcaacttggagctcgcgatacatcccaattagtctatagggtc +gggacgattctacggcatttctggttataatgacaacatggattgtggcccgagaatcgc +tctttcattaattaagcaatcattacagtcttataagcgctacttccgagtggtagcagg +taactcgatataaggtcgcatgagccgaatagcttaaaaaacaggccaccgaacattgat +agagaataccgaccacagcgcaacctttgattactttcattaaattgtacggctcactcg +acatcaagcttaagattgcgataatgtgaactcaaatggatcagtactgaagaaccgtaa +cccacttcgcagaaagcgtacccagagaagatacgctgttacaatatacagggtgaaatt +attgcctgttcttcgtaaccatttcgccaaacttggttagaaatgatagccattcatgat +agaaataagctgaatgataccagtatctttaactatgtagtcagggggaagataacgatg +gtccatgtatgtttctgatatgtgacagtattggccgcgtaatttgctaacgaagctact +taatgcctttgagcttcatatagatttctttaatcaaaatcggcaaaaagatagtatgag +ctataatatatgctagtagagaactctggaccatcatctatatgaatactgattcgagcg +tgcaattactttagcctgcgtactactgactctacaaaacactctgagataagtttgtag +tcagtaagtcgctctctataaaccttttggatgaccattgtacagccacttatagatccc +aataaatagcacaggagacagagtttttcaatgctcgatcatttgccgatagtattttcg +tctaacctcagggcacctattatttgatacctaacctaacggccctttcacaatggagaa +atatatgacatcgggacaaacacaaatggtgggtggccaggagatatgacatggtggcgt +ctctaagaaacacggactccctctaggcaaactcacgtaaccaattttaatgtcaaacaa +aacgctcgaaaagattttgccgtgtaatgacctggtacattgactggtcaggaatacatc +actgtagttgccgtagtgtcctgttggtgttccatcaagacacatcgtataacgcaattt +acgacggacatcagatcaagttatacagattatttaagtatcacgtgtgcattgggacat +aagggatctcacacatgccttggaacatttttgctttgtgccgctttttcgctgcactac +caatccttacttaccagtatattcaaaggtcgttaacagaatgagaaaggttagggctct +aagttatcgtcgattgggatagacgagacatttgcgagcgccctccacggatacgaatct +cccatatcaatgtgaactggatgctatgcagtttagttcttacgtctcctagtggtaaaa +atcaaagtagcactcgcatagcagttattcagaacctaatacacaaaaccgtcaaacatt +ttctaattctaggtatgggccgatcataggagctaaggtgaaactcataaatgttttgtt +agatctagcatcctaaaaagatgcatatactgagtagctggcgtgcattctctcaattgt +atcctttttaactgaactagtcggtcccatttcgtgactgagatctattaaccgataaga +ttaataacactcgcattcgtatcagctcagagtgaagtttttcaataatttgactgatat +attaacttctaaaataaccctttaagcctcggatccgtttcccaatcacatcaaaaattc +ttattccaactatctacggattaacaacgtgcatggggatcgtagtaagaacttgttccg +atcactttgagtatatcaagttgacggcccggttattattgaatagaaacattcacctgc +taaattaaataccgcacatcggatacccgatttcagagggccgtcttactaagggcaggc +tttgttcggtttaactgagatgttcattattttacagtatgcttcaactaatatgtaacg +aaggacagtggatctgtctccatagtagatcttcagtcgtgaatttcataccgctcctat +ttaagttcgcgttcgagttgttgatcatggcacgtgaaagcaacccctagtattctagac +gaaaattttttctagttcatctgataatttgccaattcaaaaacaaccgctggtttcccg +gcgcattctctaaaatggaagtcgaacctagagccattatttgtcggtaacccatgagtt +ccttcttttcagaagttaatacactgtggtcctatacagaggaaaaacagcggttatata +cgatcgtggcataacaacattggatcaagatagcaatttggctacctattctaattctca +ctagattcggtattccactacaatatcggcagattaggattggatgaataatcggtgttt +aagtccggttgcgtctccaatctcctaatttttattaatattgatcttggtgacctattg +taaataaaaacttcaagactttgaataacggtgaaaagatagaagactcatttgaaaatg +gatcatccacagatccaaacattagcaagacactaatccccaactagctattctgatcgc +gatcgtgctgcagtactcctgtcacaatagtctgttcatgatctaattctttttgggctt +tgttcgatggtgattcagaatctttatccggtcgcttccctgtagctactttgtggggat +attgcccggggattatagggttgagatcgtttcctaaaagtatttaaaccaagtagactt +caactaaactacatcagaacatcgtgaagacaccatacgcggtacctttatttaccgata +acatttcttcaagaaataccggtaagcagcataatgaccctaaacagctcggggtatcgt +cgtagttttaaattttatttaggttactgctcaaggaataaaaactaactatttaattta +taataatattacaaggctcacactgattagatttgtctataagacttcgcgatcccccat +taccggattgtcttaagaataaactagataaaccatgcattttctagataaggcctttag +tctaattagatacaaaaaacacgatagttgcatccttaatttattgtgtcaaacctggaa +ccttttaattacccgcaaatcactttatgtcgagactacctctgaaatttattatctacc +taccgcatgaggacttgaaccatcttgtaggagttatgtttattagctaagattcgttta +tcctgtagcggtccatgtatattcaacaagcaaaaagcactcagaattgtttttagttga +gtcaagactgatatataaataagtttccctagttttttcgtggtgggacgatattgaatt +gaatcttaaccgaagagtttcccactctgtcgcacaataatacacgccaatatttccagc +cctgcttatgccttaatcggttactcaatctcccattgaagttcattttgatctgcatag +aagtttcgggcccagccttttttctgccaccttcctccaagctctgtagacgcactctaa +gattgatgctcacatgtattaattctacattaacataaatatataagtcatgcatcttcg +agtaaaatatctggttctccaacatgtcctggcacgtatcgttataatgcccatacatgt +agtattaaaatgattgggttaactggatattaagatcatcgaaattgtaaagtcaaatta +acaatactgtctcaagaccgtgtattcctcgtgctcggaagggctattacgcttacttcc +gttttggtatcttaatatgactttcaaaaattaagttgcagtgagtcctacctgcgtgca +tcggttagcaagagtataaaagttgtttaaacgaactacttgctttacaataccggtcgt +atatatcgccgtgaatccagaagattgtcttctttggattatcaaccgagatcctgtgga +ccgatgttttgggaccttcacagaggactccaggtagagctcgcttttgcattaatctaa +gaattgtacctctctaaaagatctaaaacagtgaatgtgtatttcatggaaaaacacaga +gaaacgtaaattactttaggccgaaaggcacatgagttattatacatatacgagatggtg +gtatacatcgaattcggggcatacactatagttgcattgtatttagctgctttaaataat +atgatattaccttccttacataagacattaccggcataccctggttttcaacttgtgggg +ctttttgacgatcgcactctcatttgatccgagtagggcggtgacccctgcttttcaaat +acaaaaatttcgctatgaaggtaatagattacttttcgctgttatgatagaaacggtaaa +tttaaaattgaaacttctagaaaagtaaagtaacgagaaatgattttgtgaataatgcgg +tcatgattgcgcaagtaagaaaaaaaggcaaaaggatgcgcggaatagaaacttatcagt +cacgggtatcttgatttcattcttcttgtcaattgccgacataggatgaaatcagattcc +aatgcaatacacagtaacccccacccttgattgtaatgtcgatttgaagttgtacgcgtc +gacgaagtggatagtatacgggccttttgtacggtgcgatcaactatgaatctcggcgag +ttagatggtcgtacaatctcacacatagaggtcacttgcctgtaatgacgaattttcggc +taggtactcgaactttattagaagtaaaaatgtgggcaaaagaaggattccattttacaa +gacgattacaatgagttacatgtctctcaacgtagtctttccctagtagtctttgaacta +tttaggtactccagaaaattttagcaaagggtttctgtgtgaatccgccattcatgttta +tgatggaacaataagaataacgccctcgtatgttatcgacagtgaagtcagcagttcggc +caaaaacatattcaatttagtacagatccccagaagttaagctaagtgctctaaaatggc +ctaaacggttatcaaagtaggtctaattactatactaacgggtgcatcgtaataactgct +gtcgatgcaacactatatgatagtgtcgttttgctatatatgtacaatgtgacaaagaag +ccttagcgattcttgcaaacttaggacttcggattctcaatcttaaatgtccgaaaacgc +aaagattcaaaaatttaatctatgagcagatatgcctgatggtgactacgcgtatgttaa +ggctaaatgttgacaaccgcacacataatcgaactattgatagtcgggagcataaccagg +tgaacgtactttgttcacgacatttattgacatgttctaaatacgtctcaaaatcacggc +gcactagaaaacgcaatcaaatcattgtcctggtttaagggccgtaatgccggtagtgtc +aaacttcatgagaactttagctggcttttggccagtatttagggaccaagagcactagcc +ttaagctgaatattttgccatttatctactgttataactttaaaacttggtggcaccaga +cttgtcgatacacacgcatcaatctgtaacgtaaaaggtttactaagaacaagcgtagga +attgagtttatattatatttaaactaaaagatgatattagcttctgagggcgatagggct +ccaaatcataaagaggaatatattattacacgattagaaacccacaacatacctcgaatc +gcccaaaagtttgacgaaacttggcagtactccacatctcagtaatacagttgggagagt +ctcaaatgttgttttattactcaatgaaccaccctcataatttcactgctgttccattaa +atttgcaaacgatcatttgctttgaagaaacgtaaaatcgacaaaattacagataagtag +atgcataataaaaaaaactgctcgctataacacgatcatcgtgcattcttacttaggagc +atcacccgcacaataacgtaccttaaactacaacactattagaccgagtactgtaattca +cgaaagctcaagctcgcattgtaaagaacttgctctctcgtaaaatgtgataatagtttg +cggagaggattcaattattttccattgcacctactccactagattcgataaaagaaggtg +gtcctcccttaaaaagaaatgttaagtaacatcggaaccataagcaaagcatgtaagtga +accgtcatccttccctaagaaacataaaggtttttaataatgtcgactgtgaactataac +tgcatcctttcctgacctactccggttccttgttgttatttctgaacgagaccagtagat +aaacaatgtaaaccacagtgggtaccaatggtgcatgtgacgctaccgttgttttaagtg +cccgtacaaacataagaagtcataatcttacttgaaattaattttgccttttattttttt +tcaggctcgaaattaatgatttgttttttttgaccttctagttacgctaatatgcggtcg +cctgtggtttctattgagtcctataacgggatgggatctaatacgtttggttactagtaa +acaaggtataaatttgataccggagtatcaactgtataacatcaagctttatgactcata +cgcgaagtaatgacacaaggctttcaggagatcgcgagtacagagccactaaggggtgta +ttacgatagtgacaccaccgagcgcactcactccccaagtagatttatgatcctacgcta +agtattagatatataaccaaagaggttctagtcagtgcaactcttagaataataattagc +cggttttgcctttttaggcctaatgcaatattcagctagcccttatgtatctcgcgttcc +acagcaccactcatggcacgcgtttaaactaatcaaatataatctatgaatgttatgcca +gtacttgaataaatcaggttttttataagtccttgcatactctcgttatatactgttaga +gtcttaccccatagaaattctttcatctgcaaacttagaagaattctcagctacggggag +cataaagtccccaggatgttgacaaatacaacaaatgtggcttatacaaacactccatat +gaaaatcgaaccctcgtggtagttttagccgaaccttgtacggataaatccctccatttt +ccaatagcagatacctatcctactacctcgtggtattaaattaaagcttgaaatatagag +ctgcatagcttatccaattcccaagcacgagtctaccgtcgtaaccacgatttgatttac +agacgctagagcaaacccatctttaaacatataagtaaaaattaaagggtgagtgcgtac +gtgtttactagcaacttcgcttattaagacaattgtttataagccataattaaaaacata +tgttcaacaggttcattgatatttgtaattgcacaggtttttaataaggatctacgtaag +tataatgaacaaactttttaccagagttatattctgtactttgaaaatgctcctctaccg +ccttagagactttcaattagattttttgcagttaatctatgcgtaagtgaaccatgcaag +ggatgcgattcaaccgcctcgtgctaaccctatcgtctgtctcataactgtaggtctaat +ataattttcagttttcgaacacataaccctttgaaaatctgctatttaatgtctcacctg +catgcactatcttctatactgctcagaacggctatacgtcactatgctccaagtgacgat +ttaaacgaagcaaggaataataggtttattttagtgcaaaacaattaagtgcggactacg +tgctctttacaataagccttgtgattgggctataggttaagtcccatattaacgatctcc +aatgtacaaaatcgacaatcgctttgcattacccggttactagtcgaattacagatagct +gttagatactcactctaattttggacaacaatcccaatcttggggtcgtctatcgcctga +agctcgtaaatccttccatcttaaacgattacatattatagacttgttcggggtagagat +atcacagttgtgcaaacattgtaaatcgatactagtttatgttggtagtctagttgcttt +taccattccccgaaaaacttgatctactatttcgacaacagtaaacttgaactaggtaag +tgaaaacagagaatgcctcatagtgccactatttgtccactatatgtaagtgtagcttta +cataatccactatgactgagatcattacggcctaggaaagcagcgtagaaaaaaagggcc +cggatattacgactgtaactataaaactagttactggtagcgcgccatgtatagatttgt +tttaccggttgtggttgcgttaacgaatttcagccgcgaaaattgatccgttaaccagtc +catctcgacttctataaaacgataaagtaaagttgatgttcagcctccttcttatggttg +catcgagagtacactactcagtgggaaatagatcggggttcctacttcagattgtattat +ctaggcaattgccgattgtgccatacctggataaaataagctacctacatgtgatgctta +tctattatcgtcatactaccttagggtgtcctgttgaacgctacattaatctttagccgt +ttgagatgttccaatggataggagtctaacgcatgatgaagtttaggaaggcagagcatc +ccactaagtatgtgacagtgtatttcgaaacgagacgttataaatagaaaaaaggtcctt +ctggttctattctgctgaactattgaatggaaagattggttgacctacgtactatttgct +tgaagtcatcaatttgacggggtgagagacatatggtgcatactttacggactctatatt +ttagatcagaagcttagcagtcttctctacaccccctcacgacataattgcttttaagaa +tctatgtttgattcctctacgggaattcggatccgttcgcatgtgcggtttatctaaacc +aggggacatatgttcagctaaagcatacgaacactttgctaactagacgtatgtatagta +gctataaatcccgacgatatttacaaaaagaaatgagactcaaatatatacatagcgacc +ctacacttattcgcaccctgatctaggcgatcctagcacccacacccgaaagtgagcact +agtgtcttccgtattaaatttactgcagttgagattttagttgtctactaaggattactc +taacccgtaataaggatcaagactcggtactagctttactatcattccctatgtgttttc +ctaactcacaagggtacgtaccagcctatgtaattacaataatgataaagacacaaagga +agtaactttacaaatgagtctccagttacactagcttagtccctcccatcttgctttgaa +gtctaaatacgcaatctctgaggatatacagcagaagaacactcataacgttggagtcca +agaattagactcatagggcccccaacatttaatatgtactgtgagtttgaaggtgttcta +ttgttaattcctgctcttgatacatgacacgtactccgtgtttaaggcttcggactgact +ttctttcataagttgagcaacgaaaatttcagaatcgataagttggattcactaactaat +acggctgattgaaaactccactccggacctatatggtcgacctttatacgtaaccgatat +aaaacttataggctggtatatcgagccttcctagcgcaatttcggatggggtttcttcta +ctactcaacaacggaatagtctttgtttagtaaaccagagctcaggacgcccaatacgta +ggagagcgctgtggagcatgtgtcattatggactggagcactcttaaatcactctgcgtg +tgctaaacgatagatcataacatgtcctgagtaaattttcttgatacgtcgcaatatacc +gttattagttaaacgttctcatccgtcatgcgtgaaatacggctgtcgtgctcagatata +ctattagcgactcatctcgcctaacacgcacacgtataaactcggaatgactgccgctct +tacatattagaaatacagactacaccacggaagcattgggtcattctcaaccgctgtata +aaagatgattagtcttataataagattaccaaagaggcagaatcatgggtagtaaatcta +ttattcaagtgattaccgtcgtgtaggcagggagtgaggacgagatggtactcaggacaa +atattaaccggacgaagtggtttacgtcgtactttcactattagtagtaaatacaaggta +acaccggggaatagtactaaatataatgatatctatcttcgggagaacgagtcgtctatt +gctttgaacattctcaaggcgtaaaatgtgctgacttatagcatgatacaaccgattgtt +acttttgtctattcaaaagattgaatagttttttatacaaaagccgcatacttatgacgg +ctagtatacagtttcatcccctagcatcaatgctatggacagtattgaacttataggaaa +ttcttctaatagggcaaatccgtcgtgatgcctattttttttcagtcacatcctcaaatg +gcactagtattgtcgggatcccattaacaggctcaaccacgagctcacgcgaggacatgt +agtccgtatctttaacgaagcgacagcgacagaactcccatggataaccaattataaggc +ccgtaatcctctagacatcgtttaccaataaatccgctttctccgtaatcatgttgaata +ccccagagtagtccagatgataaccgatgaaacacaagtctttctcaatgcacttacggt +gaacttattaccgccaacgtagctcatcaaggttgcgacatctagttgtgtgtttgcgac +gagcccagcgaacttcatcaactttcgtatattcaacgccttgtaattttactttaagac +gcctggtgatgtagattcttagataatcagtttgttatcggctgtactttaccataattt +cacaggtttcaggtcaagaagattatagctgtatatacagttccatgctcggtgcacaga +aacgtgatcggataataatcaatcgcttatgtcgtctttaggcgtatccaatacatgccc +cgataccgcagtgtatttcgacatgtaggtataccgtcgcatttgagctcgagtcaggac +gtcagctagattagattccttaatagaatataccgacctctagtccgaactaaactatag +ataacgccaacttcaggttaattgtctagtcgtctgtttgcagatgggattcttagatga +gtgagtatcggccatattggttcgagcactttagtttttgatgcataggatatgcaatgt +atagctgaaagtactttatctgtttcaaactcacattgattaaaccggtaaacctttaaa +gactacaagaaaatattcagtgagggcaattttgtcaatcacaatcttccagctagagat +acttcacaatttgtcttgaggctacgcaacattagacggattttcgcgttttattgaaat +aatcgaggggcccaagagtatccatagttcattttgtaagatttctttacaggcttatta +cagcttcttcagactcctacatgcttacgagttatatgctagcatgtgaacaatagatta +atatacaggaaaacgtacattgagagagatgaccctacacagcgcaaccgttgagtactt +tcattaaagggtaacgctctcgagacagcatccttaagatggccttattgtcaaatcatt +tgcagaagtacgcaagatccctaaccaacgtagaagaatccctacaaacacatgagacgc +ggtgaaaatagacagggtgttagtattcaatcttcggagtatcaatttcgccaatcttgg +tgagaaagcataccctttcttcagagaaagaagatcaatcataacactatctttaacgag +gtacgcacgcgcatcattacctgcctccatggatctttaggatagcggaaagtattggca +gcgtattgtgatttcgttcctactttatcaatttcacattcatatacatgtcttttatca +aaatcgccaataagataggatgagctatattagatgctagtagagttcgcgccaacatca +tcgataggaatactcaggacagcgtgataggacttttcaatccctaatactctctataat +tataactctctcttaagtttggaggcagtaacgcgctctatataatcagtttgctgcacc +attcttcagcctctgatacatacaaataaattccacagcagtaagagggtttaattgaga +catcttgggaacttaggattttactctaacatcaccgaaacgattattggataccgtacc +taaacgaactttctcaaggcagtaatataggacatccgcaataacacaaatgctgcctcc +ccaggagttatgtcttcctggaggctatatcttacacccactcactataggcaaactaaa +gtttaaatgttgattgtctaaaaaaaagatagataagagttggccggcgtagcacatgcg +aaagtgaatcgtaagctataattctctggacttgaagttctgtcctgttcctctgcaaga +aacaaacttcctttaaagctatttacgacgcacatctcagcaagttataaacatgttgga +agtttctagtcggaattcccaaagaacggatctatctaatgcattcctacatttttcctg +tctgccgatggtgccatcctattcaaagaatttcttaaaagtagattaaatgggactttt +aacaatgagtaaccttacgcctctaagggttcctcgagtgccatacaccagtcaggtccg +agccacatacacggagaacattctaacatagcattctcaactcgatcatttgcaggttac +ttctttcctatcctagtgctaaaaatcatacttgcaatcccatagcacggattaagaacc +taagaaacaattcagtaaaacatgttcgaattcttggtatgggaacatcattgcagctat +ggtctaacgcattaatgtttgggtacatcttccatcatataaacaggaagagtctgacga +cagggagtgcttgcgatcatgtctatcattgtgaaatcaaattgtagctcacatgtcgtc +tatgagagcgtgtatccgataagatttagaaaaatagaagtcgtataagatctcactgaa +cttttgaatgaatgtgaagcatatatgatctgctttaataaaactttatccataggatac +gtttccaaatcaattcaataattattagtcaaaatagataaggatgaacaacctgaaggc +cgatcggacgtagaaagtggtcccatcactttgagttgatattgttgaaccacacgttat +tatggttttcaaacagtctcaggatattgtatatacagataatccgataccagttgtctg +acgcccctcttacgtaccccaccctttgtgacgtttaaagcagttgttcagtattttaaa +ctaggcggcaactaatttggaaagaagcacagtggatatgtctaaattcttgttattcag +gcctgaatttaatacaccgcatagttaacttcgcggtagagttgttcatcatgcctcctc +taagctaccacttctatgatacaccaatagttgttctacggaatctgataattggccaag +tcataaacttccgctgcgttcaacccccttgctcgaatatccaactcgaaaagacagcct +tttggtgtccggaacaaatcagttacttcttttctgatgttaattctctgtggtcagata +cagaccaaaaactccgcggatttaccatcctccaagaacaaatttgcatcaacatagcat +tttggctacatattctaagtctcaatagtttaggttttcaactacattatcccaacatta +ggattggaggaataatagctgggtaagtccccttgcgtctacaatcgactattttttatg +aatatgcttctgccgcacctatggttattaaaaaagtcatgactttgaagaaccctgaaa +agatagatgaatcaggtgtaatggcagcagccaaagagcatataattagcaacactctaa +gaacattatagatatgatgatagcgatcgtcatgatgttatccggtcacaatagtagctt +catcagctaattcgttttgccagtggtgacttgcgctggaagaatcgttatacggtccct +tccctcttgatacggtgggggcttattcaaccgcgtggattgggttgtcatacttgcatt +aaacgatgtaaaccatctagtagtcaactatactaaatcacaaaatagtgatcaatacat +acccgcttcatggttttaaccatttaattgattaaagatattccgctaagaaccattatc +tacctaaactgatcgccgtatcctagtagtttgaaatttgatgtaccgtaatgatcaacg +aagtaaaacgttatattgtatgtagaataataggtcttggagctaaatgatgtgattggt +agtgaagacttacccttacaactttaccggtttctcggaagaatatactagagaatcaat +gcatgggctacataagcactttagtctaatgagataaaaaatacacgagtcttccatcat +gaattttttgtcgaaaaactcgaacctggtaatttaaaccatatatctttatgtcgtcaa +taactctcatatgttttatataacttcccaatcacgacttgtaactgcttgttcgactga +gctgtttgagctatgaggccgggatccggttgagctacatctatttgctacaagaaaaat +gaaagcacatttgttgggagttctggctacactcatagagaaataagtggcccgagtggg +tgcggcctgcctccatattcaagtgtatcttaaaccaagtggttccaacgctcgcgctaa +agaattaaagcctttatttcctccacggagtagcccgtaatccggttcgaaagagaccat +tgaagttaattttcatatccagtgaagtttaggcacaagcatgtgttctgccacatgcct +caaagcgctcttcaaccaagatatgattcatcctaacttcgatgaatgcgtctgtaacat +aaatatagaaggaatgattcggcgagttaattttcgccttctccaacatggcatccctac +gttcgttataaggaccatacatgtaggttttaaaggtttgcggttaatcgatatttacat +catagaaattctatagtcaaatttacaagactctagatactcactcgttgcagccggcta +ggaagcgctttgtaccttacttcccttttcgttgcgtaatatgaatttcatatagtaagt +tcaaggcactcatacctccgtgaagagggtagatagactattaaagttgtttaatagtac +gtattgatggaaatgacccgtaggagatttaccactcaatccacaagattcgctgctgtg +cattatcaaaacagtgcatgtcgaaacatgggttgggtccttcaaacacgaatccaggta +gagatacctttgcaattttt diff --git a/vendor/regex/examples/regexdna-output.txt b/vendor/regex/examples/regexdna-output.txt new file mode 100644 index 0000000000..d36baa5be8 --- /dev/null +++ b/vendor/regex/examples/regexdna-output.txt @@ -0,0 +1,13 @@ +agggtaaa|tttaccct 0 +[cgt]gggtaaa|tttaccc[acg] 3 +a[act]ggtaaa|tttacc[agt]t 9 +ag[act]gtaaa|tttac[agt]ct 8 +agg[act]taaa|ttta[agt]cct 10 +aggg[acg]aaa|ttt[cgt]ccct 3 +agggt[cgt]aa|tt[acg]accct 4 +agggta[cgt]a|t[acg]taccct 3 +agggtaa[cgt]|[acg]ttaccct 5 + +101745 +100000 +133640 diff --git a/vendor/regex/examples/shootout-regex-dna-bytes.rs b/vendor/regex/examples/shootout-regex-dna-bytes.rs new file mode 100644 index 0000000000..773fd9ba8d --- /dev/null +++ b/vendor/regex/examples/shootout-regex-dna-bytes.rs @@ -0,0 +1,68 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! regex { + ($re:expr) => { + ::regex::bytes::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = Vec::with_capacity(51 * (1 << 20)); + io::stdin().read_to_end(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (regex!("B"), &b"(c|g|t)"[..]), + (regex!("D"), &b"(a|g|t)"[..]), + (regex!("H"), &b"(a|c|t)"[..]), + (regex!("K"), &b"(g|t)"[..]), + (regex!("M"), &b"(a|c)"[..]), + (regex!("N"), &b"(a|c|g|t)"[..]), + (regex!("R"), &b"(a|g)"[..]), + (regex!("S"), &b"(c|g)"[..]), + (regex!("V"), &b"(a|c|g)"[..]), + (regex!("W"), &b"(a|t)"[..]), + (regex!("Y"), &b"(c|t)"[..]), + ]; + let mut seq = seq; + for (re, replacement) in substs { + seq = re.replace_all(&seq, replacement).into_owned(); + } + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/vendor/regex/examples/shootout-regex-dna-cheat.rs b/vendor/regex/examples/shootout-regex-dna-cheat.rs new file mode 100644 index 0000000000..1bde7ab1ff --- /dev/null +++ b/vendor/regex/examples/shootout-regex-dna-cheat.rs @@ -0,0 +1,90 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +// This technically solves the problem posed in the `regex-dna` benchmark, but +// it cheats by combining all of the replacements into a single regex and +// replacing them with a single linear scan. i.e., it re-implements +// `replace_all`. As a result, this is around 25% faster. ---AG + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (b'B', "(c|g|t)"), + (b'D', "(a|g|t)"), + (b'H', "(a|c|t)"), + (b'K', "(g|t)"), + (b'M', "(a|c)"), + (b'N', "(a|c|g|t)"), + (b'R', "(a|g)"), + (b'S', "(c|g)"), + (b'V', "(a|c|g)"), + (b'W', "(a|t)"), + (b'Y', "(c|t)"), + ]; // combined into one regex in `replace_all` + let seq = replace_all(&seq, substs); + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} + +fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { + let mut replacements = vec![""; 256]; + let mut alternates = vec![]; + for (re, replacement) in substs { + replacements[re as usize] = replacement; + alternates.push((re as char).to_string()); + } + + let re = regex!(&alternates.join("|")); + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); + } + new.push_str(&text[last_match..]); + new +} diff --git a/vendor/regex/examples/shootout-regex-dna-replace.rs b/vendor/regex/examples/shootout-regex-dna-replace.rs new file mode 100644 index 0000000000..20694e06f3 --- /dev/null +++ b/vendor/regex/examples/shootout-regex-dna-replace.rs @@ -0,0 +1,17 @@ +use std::io::{self, Read}; + +macro_rules! regex { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).build().unwrap().into_regex() + }}; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + println!("original: {}, replaced: {}", ilen, seq.len()); +} diff --git a/vendor/regex/examples/shootout-regex-dna-single-cheat.rs b/vendor/regex/examples/shootout-regex-dna-single-cheat.rs new file mode 100644 index 0000000000..70a979c6d4 --- /dev/null +++ b/vendor/regex/examples/shootout-regex-dna-single-cheat.rs @@ -0,0 +1,75 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + for re in variants { + println!("{} {}", re.to_string(), re.find_iter(&seq).count()); + } + + let substs = vec![ + (b'B', "(c|g|t)"), + (b'D', "(a|g|t)"), + (b'H', "(a|c|t)"), + (b'K', "(g|t)"), + (b'M', "(a|c)"), + (b'N', "(a|c|g|t)"), + (b'R', "(a|g)"), + (b'S', "(c|g)"), + (b'V', "(a|c|g)"), + (b'W', "(a|t)"), + (b'Y', "(c|t)"), + ]; // combined into one regex in `replace_all` + let seq = replace_all(&seq, substs); + + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} + +fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { + let mut replacements = vec![""; 256]; + let mut alternates = vec![]; + for (re, replacement) in substs { + replacements[re as usize] = replacement; + alternates.push((re as char).to_string()); + } + + let re = regex!(&alternates.join("|")); + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for m in re.find_iter(text) { + new.push_str(&text[last_match..m.start()]); + new.push_str(replacements[text.as_bytes()[m.start()] as usize]); + last_match = m.end(); + } + new.push_str(&text[last_match..]); + new +} diff --git a/vendor/regex/examples/shootout-regex-dna-single.rs b/vendor/regex/examples/shootout-regex-dna-single.rs new file mode 100644 index 0000000000..b474059600 --- /dev/null +++ b/vendor/regex/examples/shootout-regex-dna-single.rs @@ -0,0 +1,57 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(50 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + for re in variants { + println!("{} {}", re.to_string(), re.find_iter(&seq).count()); + } + + let substs = vec![ + (regex!("B"), "(c|g|t)"), + (regex!("D"), "(a|g|t)"), + (regex!("H"), "(a|c|t)"), + (regex!("K"), "(g|t)"), + (regex!("M"), "(a|c)"), + (regex!("N"), "(a|c|g|t)"), + (regex!("R"), "(a|g)"), + (regex!("S"), "(c|g)"), + (regex!("V"), "(a|c|g)"), + (regex!("W"), "(a|t)"), + (regex!("Y"), "(c|t)"), + ]; + let mut seq = seq; + for (re, replacement) in substs { + seq = re.replace_all(&seq, replacement).into_owned(); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/vendor/regex/examples/shootout-regex-dna.rs b/vendor/regex/examples/shootout-regex-dna.rs new file mode 100644 index 0000000000..b96518e4c4 --- /dev/null +++ b/vendor/regex/examples/shootout-regex-dna.rs @@ -0,0 +1,68 @@ +// The Computer Language Benchmarks Game +// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ +// +// contributed by the Rust Project Developers +// contributed by TeXitoi +// contributed by BurntSushi + +use std::io::{self, Read}; +use std::sync::Arc; +use std::thread; + +macro_rules! regex { + ($re:expr) => { + ::regex::Regex::new($re).unwrap() + }; +} + +fn main() { + let mut seq = String::with_capacity(51 * (1 << 20)); + io::stdin().read_to_string(&mut seq).unwrap(); + let ilen = seq.len(); + + seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); + let clen = seq.len(); + let seq_arc = Arc::new(seq.clone()); + + let variants = vec![ + regex!("agggtaaa|tttaccct"), + regex!("[cgt]gggtaaa|tttaccc[acg]"), + regex!("a[act]ggtaaa|tttacc[agt]t"), + regex!("ag[act]gtaaa|tttac[agt]ct"), + regex!("agg[act]taaa|ttta[agt]cct"), + regex!("aggg[acg]aaa|ttt[cgt]ccct"), + regex!("agggt[cgt]aa|tt[acg]accct"), + regex!("agggta[cgt]a|t[acg]taccct"), + regex!("agggtaa[cgt]|[acg]ttaccct"), + ]; + let mut counts = vec![]; + for variant in variants { + let seq = seq_arc.clone(); + let restr = variant.to_string(); + let future = thread::spawn(move || variant.find_iter(&seq).count()); + counts.push((restr, future)); + } + + let substs = vec![ + (regex!("B"), "(c|g|t)"), + (regex!("D"), "(a|g|t)"), + (regex!("H"), "(a|c|t)"), + (regex!("K"), "(g|t)"), + (regex!("M"), "(a|c)"), + (regex!("N"), "(a|c|g|t)"), + (regex!("R"), "(a|g)"), + (regex!("S"), "(c|g)"), + (regex!("V"), "(a|c|g)"), + (regex!("W"), "(a|t)"), + (regex!("Y"), "(c|t)"), + ]; + let mut seq = seq; + for (re, replacement) in substs { + seq = re.replace_all(&seq, replacement).into_owned(); + } + + for (variant, count) in counts { + println!("{} {}", variant, count.join().unwrap()); + } + println!("\n{}\n{}\n{}", ilen, clen, seq.len()); +} diff --git a/vendor/regex/rustfmt.toml b/vendor/regex/rustfmt.toml new file mode 100644 index 0000000000..aa37a218b9 --- /dev/null +++ b/vendor/regex/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/vendor/regex/src/backtrack.rs b/vendor/regex/src/backtrack.rs new file mode 100644 index 0000000000..4d83856ca0 --- /dev/null +++ b/vendor/regex/src/backtrack.rs @@ -0,0 +1,282 @@ +// This is the backtracking matching engine. It has the same exact capability +// as the full NFA simulation, except it is artificially restricted to small +// regexes on small inputs because of its memory requirements. +// +// In particular, this is a *bounded* backtracking engine. It retains worst +// case linear time by keeping track of the states that it has visited (using a +// bitmap). Namely, once a state is visited, it is never visited again. Since a +// state is keyed by `(instruction index, input index)`, we have that its time +// complexity is `O(mn)` (i.e., linear in the size of the search text). +// +// The backtracking engine can beat out the NFA simulation on small +// regexes/inputs because it doesn't have to keep track of multiple copies of +// the capture groups. In benchmarks, the backtracking engine is roughly twice +// as fast as the full NFA simulation. Note though that its performance doesn't +// scale, even if you're willing to live with the memory requirements. Namely, +// the bitset has to be zeroed on each execution, which becomes quite expensive +// on large bitsets. + +use crate::exec::ProgramCache; +use crate::input::{Input, InputAt}; +use crate::prog::{InstPtr, Program}; +use crate::re_trait::Slot; + +type Bits = u32; + +const BIT_SIZE: usize = 32; +const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB + +/// Returns true iff the given regex and input should be executed by this +/// engine with reasonable memory usage. +pub fn should_exec(num_insts: usize, text_len: usize) -> bool { + // Total memory usage in bytes is determined by: + // + // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) + // + // The actual limit picked is pretty much a heuristic. + // See: https://github.com/rust-lang/regex/issues/215 + let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; + size <= MAX_SIZE_BYTES +} + +/// A backtracking matching engine. +#[derive(Debug)] +pub struct Bounded<'a, 'm, 'r, 's, I> { + prog: &'r Program, + input: I, + matches: &'m mut [bool], + slots: &'s mut [Slot], + m: &'a mut Cache, +} + +/// Shared cached state between multiple invocations of a backtracking engine +/// in the same thread. +#[derive(Clone, Debug)] +pub struct Cache { + jobs: Vec, + visited: Vec, +} + +impl Cache { + /// Create new empty cache for the backtracking engine. + pub fn new(_prog: &Program) -> Self { + Cache { jobs: vec![], visited: vec![] } + } +} + +/// A job is an explicit unit of stack space in the backtracking engine. +/// +/// The "normal" representation is a single state transition, which corresponds +/// to an NFA state and a character in the input. However, the backtracking +/// engine must keep track of old capture group values. We use the explicit +/// stack to do it. +#[derive(Clone, Copy, Debug)] +enum Job { + Inst { ip: InstPtr, at: InputAt }, + SaveRestore { slot: usize, old_pos: Option }, +} + +impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { + /// Execute the backtracking matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &'m mut [bool], + slots: &'s mut [Slot], + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.backtrack; + let start = input.at(start); + let mut b = Bounded { prog, input, matches, slots, m: cache }; + b.exec_(start, end) + } + + /// Clears the cache such that the backtracking engine can be executed + /// on some input of fixed length. + fn clear(&mut self) { + // Reset the job memory so that we start fresh. + self.m.jobs.clear(); + + // Now we need to clear the bit state set. + // We do this by figuring out how much space we need to keep track + // of the states we've visited. + // Then we reset all existing allocated space to 0. + // Finally, we request more space if we need it. + // + // This is all a little circuitous, but doing this using unchecked + // operations doesn't seem to have a measurable impact on performance. + // (Probably because backtracking is limited to such small + // inputs/regexes in the first place.) + let visited_len = + (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) + / BIT_SIZE; + self.m.visited.truncate(visited_len); + for v in &mut self.m.visited { + *v = 0; + } + if visited_len > self.m.visited.len() { + let len = self.m.visited.len(); + self.m.visited.reserve_exact(visited_len - len); + for _ in 0..(visited_len - len) { + self.m.visited.push(0); + } + } + } + + /// Start backtracking at the given position in the input, but also look + /// for literal prefixes. + fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { + self.clear(); + // If this is an anchored regex at the beginning of the input, then + // we're either already done or we only need to try backtracking once. + if self.prog.is_anchored_start { + return if !at.is_start() { false } else { self.backtrack(at) }; + } + let mut matched = false; + loop { + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + matched = self.backtrack(at) || matched; + if matched && self.prog.matches.len() == 1 { + return true; + } + if at.pos() >= end { + break; + } + at = self.input.at(at.next_pos()); + } + matched + } + + /// The main backtracking loop starting at the given input position. + fn backtrack(&mut self, start: InputAt) -> bool { + // N.B. We use an explicit stack to avoid recursion. + // To avoid excessive pushing and popping, most transitions are handled + // in the `step` helper function, which only pushes to the stack when + // there's a capture or a branch. + let mut matched = false; + self.m.jobs.push(Job::Inst { ip: 0, at: start }); + while let Some(job) = self.m.jobs.pop() { + match job { + Job::Inst { ip, at } => { + if self.step(ip, at) { + // Only quit if we're matching one regex. + // If we're matching a regex set, then mush on and + // try to find other matches (if we want them). + if self.prog.matches.len() == 1 { + return true; + } + matched = true; + } + } + Job::SaveRestore { slot, old_pos } => { + if slot < self.slots.len() { + self.slots[slot] = old_pos; + } + } + } + } + matched + } + + fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { + use crate::prog::Inst::*; + loop { + // This loop is an optimization to avoid constantly pushing/popping + // from the stack. Namely, if we're pushing a job only to run it + // next, avoid the push and just mutate `ip` (and possibly `at`) + // in place. + if self.has_visited(ip, at) { + return false; + } + match self.prog[ip] { + Match(slot) => { + if slot < self.matches.len() { + self.matches[slot] = true; + } + return true; + } + Save(ref inst) => { + if let Some(&old_pos) = self.slots.get(inst.slot) { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. + self.m.jobs.push(Job::SaveRestore { + slot: inst.slot, + old_pos, + }); + self.slots[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); + ip = inst.goto1; + } + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } else { + return false; + } + } + Char(ref inst) => { + if inst.c == at.char() { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + continue; + } + } + return false; + } + } + } + } + + fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { + let k = ip * (self.input.len() + 1) + at.pos(); + let k1 = k / BIT_SIZE; + let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); + if self.m.visited[k1] & k2 == 0 { + self.m.visited[k1] |= k2; + false + } else { + true + } + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} diff --git a/vendor/regex/src/compile.rs b/vendor/regex/src/compile.rs new file mode 100644 index 0000000000..90ca25015f --- /dev/null +++ b/vendor/regex/src/compile.rs @@ -0,0 +1,1264 @@ +use std::collections::HashMap; +use std::fmt; +use std::iter; +use std::result; +use std::sync::Arc; + +use regex_syntax::hir::{self, Hir}; +use regex_syntax::is_word_byte; +use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; + +use crate::prog::{ + EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, + InstSave, InstSplit, Program, +}; + +use crate::Error; + +type Result = result::Result; +type ResultOrEmpty = result::Result, Error>; + +#[derive(Debug)] +struct Patch { + hole: Hole, + entry: InstPtr, +} + +/// A compiler translates a regular expression AST to a sequence of +/// instructions. The sequence of instructions represents an NFA. +// `Compiler` is only public via the `internal` module, so avoid deriving +// `Debug`. +#[allow(missing_debug_implementations)] +pub struct Compiler { + insts: Vec, + compiled: Program, + capture_name_idx: HashMap, + num_exprs: usize, + size_limit: usize, + suffix_cache: SuffixCache, + utf8_seqs: Option, + byte_classes: ByteClassSet, + // This keeps track of extra bytes allocated while compiling the regex + // program. Currently, this corresponds to two things. First is the heap + // memory allocated by Unicode character classes ('InstRanges'). Second is + // a "fake" amount of memory used by empty sub-expressions, so that enough + // empty sub-expressions will ultimately trigger the compiler to bail + // because of a size limit restriction. (That empty sub-expressions don't + // add to heap memory usage is more-or-less an implementation detail.) In + // the second case, if we don't bail, then an excessively large repetition + // on an empty sub-expression can result in the compiler using a very large + // amount of CPU time. + extra_inst_bytes: usize, +} + +impl Compiler { + /// Create a new regular expression compiler. + /// + /// Various options can be set before calling `compile` on an expression. + pub fn new() -> Self { + Compiler { + insts: vec![], + compiled: Program::new(), + capture_name_idx: HashMap::new(), + num_exprs: 0, + size_limit: 10 * (1 << 20), + suffix_cache: SuffixCache::new(1000), + utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')), + byte_classes: ByteClassSet::new(), + extra_inst_bytes: 0, + } + } + + /// The size of the resulting program is limited by size_limit. If + /// the program approximately exceeds the given size (in bytes), then + /// compilation will stop and return an error. + pub fn size_limit(mut self, size_limit: usize) -> Self { + self.size_limit = size_limit; + self + } + + /// If bytes is true, then the program is compiled as a byte based + /// automaton, which incorporates UTF-8 decoding into the machine. If it's + /// false, then the automaton is Unicode scalar value based, e.g., an + /// engine utilizing such an automaton is responsible for UTF-8 decoding. + /// + /// The specific invariant is that when returning a byte based machine, + /// the neither the `Char` nor `Ranges` instructions are produced. + /// Conversely, when producing a Unicode scalar value machine, the `Bytes` + /// instruction is never produced. + /// + /// Note that `dfa(true)` implies `bytes(true)`. + pub fn bytes(mut self, yes: bool) -> Self { + self.compiled.is_bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.compiled.only_utf8 = yes; + self + } + + /// When set, the machine returned is suitable for use in the DFA matching + /// engine. + /// + /// In particular, this ensures that if the regex is not anchored in the + /// beginning, then a preceding `.*?` is included in the program. (The NFA + /// based engines handle the preceding `.*?` explicitly, which is difficult + /// or impossible in the DFA engine.) + pub fn dfa(mut self, yes: bool) -> Self { + self.compiled.is_dfa = yes; + self + } + + /// When set, the machine returned is suitable for matching text in + /// reverse. In particular, all concatenations are flipped. + pub fn reverse(mut self, yes: bool) -> Self { + self.compiled.is_reverse = yes; + self + } + + /// Compile a regular expression given its AST. + /// + /// The compiler is guaranteed to succeed unless the program exceeds the + /// specified size limit. If the size limit is exceeded, then compilation + /// stops and returns an error. + pub fn compile(mut self, exprs: &[Hir]) -> result::Result { + debug_assert!(!exprs.is_empty()); + self.num_exprs = exprs.len(); + if exprs.len() == 1 { + self.compile_one(&exprs[0]) + } else { + self.compile_many(exprs) + } + } + + fn compile_one(mut self, expr: &Hir) -> result::Result { + // If we're compiling a forward DFA and we aren't anchored, then + // add a `.*?` before the first capture group. + // Other matching engines handle this by baking the logic into the + // matching engine itself. + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; + self.compiled.is_anchored_start = expr.is_anchored_start(); + self.compiled.is_anchored_end = expr.is_anchored_end(); + if self.compiled.needs_dotstar() { + dotstar_patch = self.c_dotstar()?; + self.compiled.start = dotstar_patch.entry; + } + self.compiled.captures = vec![None]; + let patch = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); + if self.compiled.needs_dotstar() { + self.fill(dotstar_patch.hole, patch.entry); + } else { + self.compiled.start = patch.entry; + } + self.fill_to_next(patch.hole); + self.compiled.matches = vec![self.insts.len()]; + self.push_compiled(Inst::Match(0)); + self.compile_finish() + } + + fn compile_many( + mut self, + exprs: &[Hir], + ) -> result::Result { + debug_assert!(exprs.len() > 1); + + self.compiled.is_anchored_start = + exprs.iter().all(|e| e.is_anchored_start()); + self.compiled.is_anchored_end = + exprs.iter().all(|e| e.is_anchored_end()); + let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; + if self.compiled.needs_dotstar() { + dotstar_patch = self.c_dotstar()?; + self.compiled.start = dotstar_patch.entry; + } else { + self.compiled.start = 0; // first instruction is always split + } + self.fill_to_next(dotstar_patch.hole); + + let mut prev_hole = Hole::None; + for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let Patch { hole, entry } = + self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + prev_hole = self.fill_split(split, Some(entry), None); + } + let i = exprs.len() - 1; + let Patch { hole, entry } = + self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst()); + self.fill(prev_hole, entry); + self.fill_to_next(hole); + self.compiled.matches.push(self.insts.len()); + self.push_compiled(Inst::Match(i)); + self.compile_finish() + } + + fn compile_finish(mut self) -> result::Result { + self.compiled.insts = + self.insts.into_iter().map(|inst| inst.unwrap()).collect(); + self.compiled.byte_classes = self.byte_classes.byte_classes(); + self.compiled.capture_name_idx = Arc::new(self.capture_name_idx); + Ok(self.compiled) + } + + /// Compile expr into self.insts, returning a patch on success, + /// or an error if we run out of memory. + /// + /// All of the c_* methods of the compiler share the contract outlined + /// here. + /// + /// The main thing that a c_* method does is mutate `self.insts` + /// to add a list of mostly compiled instructions required to execute + /// the given expression. `self.insts` contains MaybeInsts rather than + /// Insts because there is some backpatching required. + /// + /// The `Patch` value returned by each c_* method provides metadata + /// about the compiled instructions emitted to `self.insts`. The + /// `entry` member of the patch refers to the first instruction + /// (the entry point), while the `hole` member contains zero or + /// more offsets to partial instructions that need to be backpatched. + /// The c_* routine can't know where its list of instructions are going to + /// jump to after execution, so it is up to the caller to patch + /// these jumps to point to the right place. So compiling some + /// expression, e, we would end up with a situation that looked like: + /// + /// ```text + /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...] + /// ^ ^ ^ + /// | \ / + /// entry \ / + /// hole + /// ``` + /// + /// To compile two expressions, e1 and e2, concatenated together we + /// would do: + /// + /// ```ignore + /// let patch1 = self.c(e1); + /// let patch2 = self.c(e2); + /// ``` + /// + /// while leaves us with a situation that looks like + /// + /// ```text + /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ] + /// ^ ^ ^ ^ + /// | | | | + /// entry1 hole1 entry2 hole2 + /// ``` + /// + /// Then to merge the two patches together into one we would backpatch + /// hole1 with entry2 and return a new patch that enters at entry1 + /// and has hole2 for a hole. In fact, if you look at the c_concat + /// method you will see that it does exactly this, though it handles + /// a list of expressions rather than just the two that we use for + /// an example. + /// + /// Ok(None) is returned when an expression is compiled to no + /// instruction, and so no patch.entry value makes sense. + fn c(&mut self, expr: &Hir) -> ResultOrEmpty { + use crate::prog; + use regex_syntax::hir::HirKind::*; + + self.check_size()?; + match *expr.kind() { + Empty => self.c_empty(), + Literal(hir::Literal::Unicode(c)) => self.c_char(c), + Literal(hir::Literal::Byte(b)) => { + assert!(self.compiled.uses_bytes()); + self.c_byte(b) + } + Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), + Class(hir::Class::Bytes(ref cls)) => { + if self.compiled.uses_bytes() { + self.c_class_bytes(cls.ranges()) + } else { + assert!(cls.is_all_ascii()); + let mut char_ranges = vec![]; + for r in cls.iter() { + let (s, e) = (r.start() as char, r.end() as char); + char_ranges.push(hir::ClassUnicodeRange::new(s, e)); + } + self.c_class(&char_ranges) + } + } + Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + Anchor(hir::Anchor::StartLine) => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + Anchor(hir::Anchor::EndLine) => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) + } + Anchor(hir::Anchor::StartText) => { + self.c_empty_look(prog::EmptyLook::StartText) + } + Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + Anchor(hir::Anchor::EndText) => { + self.c_empty_look(prog::EmptyLook::EndText) + } + WordBoundary(hir::WordBoundary::Unicode) => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) + } + WordBoundary(hir::WordBoundary::UnicodeNegate) => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) + } + WordBoundary(hir::WordBoundary::Ascii) => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + WordBoundary(hir::WordBoundary::AsciiNegate) => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + Group(ref g) => match g.kind { + hir::GroupKind::NonCapturing => self.c(&g.hir), + hir::GroupKind::CaptureIndex(index) => { + if index as usize >= self.compiled.captures.len() { + self.compiled.captures.push(None); + } + self.c_capture(2 * index as usize, &g.hir) + } + hir::GroupKind::CaptureName { index, ref name } => { + if index as usize >= self.compiled.captures.len() { + let n = name.to_string(); + self.compiled.captures.push(Some(n.clone())); + self.capture_name_idx.insert(n, index as usize); + } + self.c_capture(2 * index as usize, &g.hir) + } + }, + Concat(ref es) => { + if self.compiled.is_reverse { + self.c_concat(es.iter().rev()) + } else { + self.c_concat(es) + } + } + Alternation(ref es) => self.c_alternate(&**es), + Repetition(ref rep) => self.c_repeat(rep), + } + } + + fn c_empty(&mut self) -> ResultOrEmpty { + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // Since 'empty' sub-expressions don't increase the size of + // the actual compiled object, we "fake" an increase in its + // size so that our 'check_size_limit' routine will eventually + // stop compilation if there are too many empty sub-expressions + // (e.g., via a large repetition). + self.extra_inst_bytes += std::mem::size_of::(); + Ok(None) + } + + fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty { + if self.num_exprs > 1 || self.compiled.is_dfa { + // Don't ever compile Save instructions for regex sets because + // they are never used. They are also never used in DFA programs + // because DFAs can't handle captures. + self.c(expr) + } else { + let entry = self.insts.len(); + let hole = self.push_hole(InstHole::Save { slot: first_slot }); + let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst()); + self.fill(hole, patch.entry); + self.fill_to_next(patch.hole); + let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 }); + Ok(Some(Patch { hole, entry })) + } + } + + fn c_dotstar(&mut self) -> Result { + Ok(if !self.compiled.only_utf8() { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(true)), + }))? + .unwrap() + } else { + self.c(&Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: false, + hir: Box::new(Hir::any(false)), + }))? + .unwrap() + }) + } + + fn c_char(&mut self, c: char) -> ResultOrEmpty { + if self.compiled.uses_bytes() { + if c.is_ascii() { + let b = c as u8; + let hole = + self.push_hole(InstHole::Bytes { start: b, end: b }); + self.byte_classes.set_range(b, b); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } else { + self.c_class(&[hir::ClassUnicodeRange::new(c, c)]) + } + } else { + let hole = self.push_hole(InstHole::Char { c }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + } + + fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { + use std::mem::size_of; + + assert!(!ranges.is_empty()); + if self.compiled.uses_bytes() { + Ok(Some(CompileClass { c: self, ranges }.compile()?)) + } else { + let ranges: Vec<(char, char)> = + ranges.iter().map(|r| (r.start(), r.end())).collect(); + let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 { + self.push_hole(InstHole::Char { c: ranges[0].0 }) + } else { + self.extra_inst_bytes += + ranges.len() * (size_of::() * 2); + self.push_hole(InstHole::Ranges { ranges }) + }; + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + } + + fn c_byte(&mut self, b: u8) -> ResultOrEmpty { + self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)]) + } + + fn c_class_bytes( + &mut self, + ranges: &[hir::ClassBytesRange], + ) -> ResultOrEmpty { + debug_assert!(!ranges.is_empty()); + + let first_split_entry = self.insts.len(); + let mut holes = vec![]; + let mut prev_hole = Hole::None; + for r in &ranges[0..ranges.len() - 1] { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let next = self.insts.len(); + self.byte_classes.set_range(r.start(), r.end()); + holes.push(self.push_hole(InstHole::Bytes { + start: r.start(), + end: r.end(), + })); + prev_hole = self.fill_split(split, Some(next), None); + } + let next = self.insts.len(); + let r = &ranges[ranges.len() - 1]; + self.byte_classes.set_range(r.start(), r.end()); + holes.push( + self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }), + ); + self.fill(prev_hole, next); + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) + } + + fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty { + let hole = self.push_hole(InstHole::EmptyLook { look }); + Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) + } + + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty + where + I: IntoIterator, + { + let mut exprs = exprs.into_iter(); + let Patch { mut hole, entry } = loop { + match exprs.next() { + None => return self.c_empty(), + Some(e) => { + if let Some(p) = self.c(e)? { + break p; + } + } + } + }; + for e in exprs { + if let Some(p) = self.c(e)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + + fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty { + debug_assert!( + exprs.len() >= 2, + "alternates must have at least 2 exprs" + ); + + // Initial entry point is always the first split. + let first_split_entry = self.insts.len(); + + // Save up all of the holes from each alternate. They will all get + // patched to point to the same location. + let mut holes = vec![]; + + // true indicates that the hole is a split where we want to fill + // the second branch. + let mut prev_hole = (Hole::None, false); + for e in &exprs[0..exprs.len() - 1] { + if prev_hole.1 { + let next = self.insts.len(); + self.fill_split(prev_hole.0, None, Some(next)); + } else { + self.fill_to_next(prev_hole.0); + } + let split = self.push_split_hole(); + if let Some(Patch { hole, entry }) = self.c(e)? { + holes.push(hole); + prev_hole = (self.fill_split(split, Some(entry), None), false); + } else { + let (split1, split2) = split.dup_one(); + holes.push(split1); + prev_hole = (split2, true); + } + } + if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? { + holes.push(hole); + if prev_hole.1 { + self.fill_split(prev_hole.0, None, Some(entry)); + } else { + self.fill(prev_hole.0, entry); + } + } else { + // We ignore prev_hole.1. When it's true, it means we have two + // empty branches both pushing prev_hole.0 into holes, so both + // branches will go to the same place anyway. + holes.push(prev_hole.0); + } + Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry })) + } + + fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { + use regex_syntax::hir::RepetitionKind::*; + match rep.kind { + ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), + ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), + OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), + Range(hir::RepetitionRange::Exactly(min_max)) => { + self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) + } + Range(hir::RepetitionRange::AtLeast(min)) => { + self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) + } + Range(hir::RepetitionRange::Bounded(min, max)) => { + self.c_repeat_range(&rep.hir, rep.greedy, min, max) + } + } + } + + fn c_repeat_zero_or_one( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let split_entry = self.insts.len(); + let split = self.push_split_hole(); + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + let holes = vec![hole_rep, split_hole]; + Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry })) + } + + fn c_repeat_zero_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let split_entry = self.insts.len(); + let split = self.push_split_hole(); + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + + self.fill(hole_rep, split_entry); + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + Ok(Some(Patch { hole: split_hole, entry: split_entry })) + } + + fn c_repeat_one_or_more( + &mut self, + expr: &Hir, + greedy: bool, + ) -> ResultOrEmpty { + let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? { + Some(p) => p, + None => return Ok(None), + }; + self.fill_to_next(hole_rep); + let split = self.push_split_hole(); + + let split_hole = if greedy { + self.fill_split(split, Some(entry_rep), None) + } else { + self.fill_split(split, None, Some(entry_rep)) + }; + Ok(Some(Patch { hole: split_hole, entry: entry_rep })) + } + + fn c_repeat_range_min_or_more( + &mut self, + expr: &Hir, + greedy: bool, + min: u32, + ) -> ResultOrEmpty { + let min = u32_to_usize(min); + // Using next_inst() is ok, because we can't return it (concat would + // have to return Some(_) while c_repeat_range_min_or_more returns + // None). + let patch_concat = self + .c_concat(iter::repeat(expr).take(min))? + .unwrap_or_else(|| self.next_inst()); + if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? { + self.fill(patch_concat.hole, patch_rep.entry); + Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry })) + } else { + Ok(None) + } + } + + fn c_repeat_range( + &mut self, + expr: &Hir, + greedy: bool, + min: u32, + max: u32, + ) -> ResultOrEmpty { + let (min, max) = (u32_to_usize(min), u32_to_usize(max)); + debug_assert!(min <= max); + let patch_concat = self.c_concat(iter::repeat(expr).take(min))?; + if min == max { + return Ok(patch_concat); + } + // Same reasoning as in c_repeat_range_min_or_more (we know that min < + // max at this point). + let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst()); + let initial_entry = patch_concat.entry; + // It is much simpler to compile, e.g., `a{2,5}` as: + // + // aaa?a?a? + // + // But you end up with a sequence of instructions like this: + // + // 0: 'a' + // 1: 'a', + // 2: split(3, 4) + // 3: 'a' + // 4: split(5, 6) + // 5: 'a' + // 6: split(7, 8) + // 7: 'a' + // 8: MATCH + // + // This is *incredibly* inefficient because the splits end + // up forming a chain, which has to be resolved everything a + // transition is followed. + let mut holes = vec![]; + let mut prev_hole = patch_concat.hole; + for _ in min..max { + self.fill_to_next(prev_hole); + let split = self.push_split_hole(); + let Patch { hole, entry } = match self.c(expr)? { + Some(p) => p, + None => return self.pop_split_hole(), + }; + prev_hole = hole; + if greedy { + holes.push(self.fill_split(split, Some(entry), None)); + } else { + holes.push(self.fill_split(split, None, Some(entry))); + } + } + holes.push(prev_hole); + Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry })) + } + + /// Can be used as a default value for the c_* functions when the call to + /// c_function is followed by inserting at least one instruction that is + /// always executed after the ones written by the c* function. + fn next_inst(&self) -> Patch { + Patch { hole: Hole::None, entry: self.insts.len() } + } + + fn fill(&mut self, hole: Hole, goto: InstPtr) { + match hole { + Hole::None => {} + Hole::One(pc) => { + self.insts[pc].fill(goto); + } + Hole::Many(holes) => { + for hole in holes { + self.fill(hole, goto); + } + } + } + } + + fn fill_to_next(&mut self, hole: Hole) { + let next = self.insts.len(); + self.fill(hole, next); + } + + fn fill_split( + &mut self, + hole: Hole, + goto1: Option, + goto2: Option, + ) -> Hole { + match hole { + Hole::None => Hole::None, + Hole::One(pc) => match (goto1, goto2) { + (Some(goto1), Some(goto2)) => { + self.insts[pc].fill_split(goto1, goto2); + Hole::None + } + (Some(goto1), None) => { + self.insts[pc].half_fill_split_goto1(goto1); + Hole::One(pc) + } + (None, Some(goto2)) => { + self.insts[pc].half_fill_split_goto2(goto2); + Hole::One(pc) + } + (None, None) => unreachable!( + "at least one of the split \ + holes must be filled" + ), + }, + Hole::Many(holes) => { + let mut new_holes = vec![]; + for hole in holes { + new_holes.push(self.fill_split(hole, goto1, goto2)); + } + if new_holes.is_empty() { + Hole::None + } else if new_holes.len() == 1 { + new_holes.pop().unwrap() + } else { + Hole::Many(new_holes) + } + } + } + } + + fn push_compiled(&mut self, inst: Inst) { + self.insts.push(MaybeInst::Compiled(inst)); + } + + fn push_hole(&mut self, inst: InstHole) -> Hole { + let hole = self.insts.len(); + self.insts.push(MaybeInst::Uncompiled(inst)); + Hole::One(hole) + } + + fn push_split_hole(&mut self) -> Hole { + let hole = self.insts.len(); + self.insts.push(MaybeInst::Split); + Hole::One(hole) + } + + fn pop_split_hole(&mut self) -> ResultOrEmpty { + self.insts.pop(); + Ok(None) + } + + fn check_size(&self) -> result::Result<(), Error> { + use std::mem::size_of; + + let size = + self.extra_inst_bytes + (self.insts.len() * size_of::()); + if size > self.size_limit { + Err(Error::CompiledTooBig(self.size_limit)) + } else { + Ok(()) + } + } +} + +#[derive(Debug)] +enum Hole { + None, + One(InstPtr), + Many(Vec), +} + +impl Hole { + fn dup_one(self) -> (Self, Self) { + match self { + Hole::One(pc) => (Hole::One(pc), Hole::One(pc)), + Hole::None | Hole::Many(_) => { + unreachable!("must be called on single hole") + } + } + } +} + +#[derive(Clone, Debug)] +enum MaybeInst { + Compiled(Inst), + Uncompiled(InstHole), + Split, + Split1(InstPtr), + Split2(InstPtr), +} + +impl MaybeInst { + fn fill(&mut self, goto: InstPtr) { + let maybeinst = match *self { + MaybeInst::Split => MaybeInst::Split1(goto), + MaybeInst::Uncompiled(ref inst) => { + MaybeInst::Compiled(inst.fill(goto)) + } + MaybeInst::Split1(goto1) => { + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1, + goto2: goto, + })) + } + MaybeInst::Split2(goto2) => { + MaybeInst::Compiled(Inst::Split(InstSplit { + goto1: goto, + goto2, + })) + } + _ => unreachable!( + "not all instructions were compiled! \ + found uncompiled instruction: {:?}", + self + ), + }; + *self = maybeinst; + } + + fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { + let filled = match *self { + MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }), + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Compiled(filled); + } + + fn half_fill_split_goto1(&mut self, goto1: InstPtr) { + let half_filled = match *self { + MaybeInst::Split => goto1, + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Split1(half_filled); + } + + fn half_fill_split_goto2(&mut self, goto2: InstPtr) { + let half_filled = match *self { + MaybeInst::Split => goto2, + _ => unreachable!( + "must be called on Split instruction, \ + instead it was called on: {:?}", + self + ), + }; + *self = MaybeInst::Split2(half_filled); + } + + fn unwrap(self) -> Inst { + match self { + MaybeInst::Compiled(inst) => inst, + _ => unreachable!( + "must be called on a compiled instruction, \ + instead it was called on: {:?}", + self + ), + } + } +} + +#[derive(Clone, Debug)] +enum InstHole { + Save { slot: usize }, + EmptyLook { look: EmptyLook }, + Char { c: char }, + Ranges { ranges: Vec<(char, char)> }, + Bytes { start: u8, end: u8 }, +} + +impl InstHole { + fn fill(&self, goto: InstPtr) -> Inst { + match *self { + InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }), + InstHole::EmptyLook { look } => { + Inst::EmptyLook(InstEmptyLook { goto, look }) + } + InstHole::Char { c } => Inst::Char(InstChar { goto, c }), + InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges { + goto, + ranges: ranges.clone().into_boxed_slice(), + }), + InstHole::Bytes { start, end } => { + Inst::Bytes(InstBytes { goto, start, end }) + } + } + } +} + +struct CompileClass<'a, 'b> { + c: &'a mut Compiler, + ranges: &'b [hir::ClassUnicodeRange], +} + +impl<'a, 'b> CompileClass<'a, 'b> { + fn compile(mut self) -> Result { + let mut holes = vec![]; + let mut initial_entry = None; + let mut last_split = Hole::None; + let mut utf8_seqs = self.c.utf8_seqs.take().unwrap(); + self.c.suffix_cache.clear(); + + for (i, range) in self.ranges.iter().enumerate() { + let is_last_range = i + 1 == self.ranges.len(); + utf8_seqs.reset(range.start(), range.end()); + let mut it = (&mut utf8_seqs).peekable(); + loop { + let utf8_seq = match it.next() { + None => break, + Some(utf8_seq) => utf8_seq, + }; + if is_last_range && it.peek().is_none() { + let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; + holes.push(hole); + self.c.fill(last_split, entry); + last_split = Hole::None; + if initial_entry.is_none() { + initial_entry = Some(entry); + } + } else { + if initial_entry.is_none() { + initial_entry = Some(self.c.insts.len()); + } + self.c.fill_to_next(last_split); + last_split = self.c.push_split_hole(); + let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?; + holes.push(hole); + last_split = + self.c.fill_split(last_split, Some(entry), None); + } + } + } + self.c.utf8_seqs = Some(utf8_seqs); + Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() }) + } + + fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result { + if self.c.compiled.is_reverse { + self.c_utf8_seq_(seq) + } else { + self.c_utf8_seq_(seq.into_iter().rev()) + } + } + + fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result + where + I: IntoIterator, + { + // The initial instruction for each UTF-8 sequence should be the same. + let mut from_inst = ::std::usize::MAX; + let mut last_hole = Hole::None; + for byte_range in seq { + let key = SuffixCacheKey { + from_inst, + start: byte_range.start, + end: byte_range.end, + }; + { + let pc = self.c.insts.len(); + if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) { + from_inst = cached_pc; + continue; + } + } + self.c.byte_classes.set_range(byte_range.start, byte_range.end); + if from_inst == ::std::usize::MAX { + last_hole = self.c.push_hole(InstHole::Bytes { + start: byte_range.start, + end: byte_range.end, + }); + } else { + self.c.push_compiled(Inst::Bytes(InstBytes { + goto: from_inst, + start: byte_range.start, + end: byte_range.end, + })); + } + from_inst = self.c.insts.len().checked_sub(1).unwrap(); + debug_assert!(from_inst < ::std::usize::MAX); + } + debug_assert!(from_inst < ::std::usize::MAX); + Ok(Patch { hole: last_hole, entry: from_inst }) + } +} + +/// `SuffixCache` is a simple bounded hash map for caching suffix entries in +/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}. +/// The set of byte ranges looks like this: +/// +/// [0-7F] +/// [C2-DF][80-BF] +/// [E0][A0-BF][80-BF] +/// [E1-EC][80-BF][80-BF] +/// [ED][80-9F][80-BF] +/// [EE-EF][80-BF][80-BF] +/// +/// Each line above translates to one alternate in the compiled regex program. +/// However, all but one of the alternates end in the same suffix, which is +/// a waste of an instruction. The suffix cache facilitates reusing them across +/// alternates. +/// +/// Note that a HashMap could be trivially used for this, but we don't need its +/// overhead. Some small bounded space (LRU style) is more than enough. +/// +/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html), +/// except it uses hashes as original indices and then compares full keys for +/// validation against `dense` array. +#[derive(Debug)] +struct SuffixCache { + sparse: Box<[usize]>, + dense: Vec, +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +struct SuffixCacheEntry { + key: SuffixCacheKey, + pc: InstPtr, +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +struct SuffixCacheKey { + from_inst: InstPtr, + start: u8, + end: u8, +} + +impl SuffixCache { + fn new(size: usize) -> Self { + SuffixCache { + sparse: vec![0usize; size].into(), + dense: Vec::with_capacity(size), + } + } + + fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option { + let hash = self.hash(&key); + let pos = &mut self.sparse[hash]; + if let Some(entry) = self.dense.get(*pos) { + if entry.key == key { + return Some(entry.pc); + } + } + *pos = self.dense.len(); + self.dense.push(SuffixCacheEntry { key, pc }); + None + } + + fn clear(&mut self) { + self.dense.clear(); + } + + fn hash(&self, suffix: &SuffixCacheKey) -> usize { + // Basic FNV-1a hash as described: + // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function + const FNV_PRIME: u64 = 1_099_511_628_211; + let mut h = 14_695_981_039_346_656_037; + h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME); + (h as usize) % self.sparse.len() + } +} + +struct ByteClassSet([bool; 256]); + +impl ByteClassSet { + fn new() -> Self { + ByteClassSet([false; 256]) + } + + fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0[start as usize - 1] = true; + } + self.0[end as usize] = true; + } + + fn set_word_boundary(&mut self) { + // We need to mark all ranges of bytes whose pairs result in + // evaluating \b differently. + let iswb = is_word_byte; + let mut b1: u16 = 0; + let mut b2: u16; + while b1 <= 255 { + b2 = b1 + 1; + while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { + b2 += 1; + } + self.set_range(b1 as u8, (b2 - 1) as u8); + b1 = b2; + } + } + + fn byte_classes(&self) -> Vec { + // N.B. If you're debugging the DFA, it's useful to simply return + // `(0..256).collect()`, which effectively removes the byte classes + // and makes the transitions easier to read. + // (0usize..256).map(|x| x as u8).collect() + let mut byte_classes = vec![0; 256]; + let mut class = 0u8; + let mut i = 0; + loop { + byte_classes[i] = class as u8; + if i >= 255 { + break; + } + if self.0[i] { + class = class.checked_add(1).unwrap(); + } + i += 1; + } + byte_classes + } +} + +impl fmt::Debug for ByteClassSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish() + } +} + +fn u32_to_usize(n: u32) -> usize { + // In case usize is less than 32 bits, we need to guard against overflow. + // On most platforms this compiles to nothing. + // TODO Use `std::convert::TryFrom` once it's stable. + if (n as u64) > (::std::usize::MAX as u64) { + panic!("BUG: {} is too big to be pointer sized", n) + } + n as usize +} + +#[cfg(test)] +mod tests { + use super::ByteClassSet; + + #[test] + fn byte_classes() { + let mut set = ByteClassSet::new(); + set.set_range(b'a', b'z'); + let classes = set.byte_classes(); + assert_eq!(classes[0], 0); + assert_eq!(classes[1], 0); + assert_eq!(classes[2], 0); + assert_eq!(classes[b'a' as usize - 1], 0); + assert_eq!(classes[b'a' as usize], 1); + assert_eq!(classes[b'm' as usize], 1); + assert_eq!(classes[b'z' as usize], 1); + assert_eq!(classes[b'z' as usize + 1], 2); + assert_eq!(classes[254], 2); + assert_eq!(classes[255], 2); + + let mut set = ByteClassSet::new(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.byte_classes(); + assert_eq!(classes[0], 0); + assert_eq!(classes[1], 0); + assert_eq!(classes[2], 0); + assert_eq!(classes[3], 1); + assert_eq!(classes[4], 2); + assert_eq!(classes[5], 2); + assert_eq!(classes[6], 2); + assert_eq!(classes[7], 3); + assert_eq!(classes[255], 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassSet::new(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.byte_classes().len(), 256); + } +} diff --git a/vendor/regex/src/dfa.rs b/vendor/regex/src/dfa.rs new file mode 100644 index 0000000000..dc9952120e --- /dev/null +++ b/vendor/regex/src/dfa.rs @@ -0,0 +1,1945 @@ +/*! +The DFA matching engine. + +A DFA provides faster matching because the engine is in exactly one state at +any point in time. In the NFA, there may be multiple active states, and +considerable CPU cycles are spent shuffling them around. In finite automata +speak, the DFA follows epsilon transitions in the regex far less than the NFA. + +A DFA is a classic trade off between time and space. The NFA is slower, but +its memory requirements are typically small and predictable. The DFA is faster, +but given the right regex and the right input, the number of states in the +DFA can grow exponentially. To mitigate this space problem, we do two things: + +1. We implement an *online* DFA. That is, the DFA is constructed from the NFA + during a search. When a new state is computed, it is stored in a cache so + that it may be reused. An important consequence of this implementation + is that states that are never reached for a particular input are never + computed. (This is impossible in an "offline" DFA which needs to compute + all possible states up front.) +2. If the cache gets too big, we wipe it and continue matching. + +In pathological cases, a new state can be created for every byte of input. +(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.) +In this case, performance regresses to slightly slower than the full NFA +simulation, in large part because the cache becomes useless. If the cache +is wiped too frequently, the DFA quits and control falls back to one of the +NFA simulations. + +Because of the "lazy" nature of this DFA, the inner matching loop is +considerably more complex than one might expect out of a DFA. A number of +tricks are employed to make it fast. Tread carefully. + +N.B. While this implementation is heavily commented, Russ Cox's series of +articles on regexes is strongly recommended: +(As is the DFA implementation in RE2, which heavily influenced this +implementation.) +*/ + +use std::collections::HashMap; +use std::fmt; +use std::iter::repeat; +use std::mem; +use std::sync::Arc; + +use crate::exec::ProgramCache; +use crate::prog::{Inst, Program}; +use crate::sparse::SparseSet; + +/// Return true if and only if the given program can be executed by a DFA. +/// +/// Generally, a DFA is always possible. A pathological case where it is not +/// possible is if the number of NFA states exceeds `u32::MAX`, in which case, +/// this function will return false. +/// +/// This function will also return false if the given program has any Unicode +/// instructions (Char or Ranges) since the DFA operates on bytes only. +pub fn can_exec(insts: &Program) -> bool { + use crate::prog::Inst::*; + // If for some reason we manage to allocate a regex program with more + // than i32::MAX instructions, then we can't execute the DFA because we + // use 32 bit instruction pointer deltas for memory savings. + // If i32::MAX is the largest positive delta, + // then -i32::MAX == i32::MIN + 1 is the largest negative delta, + // and we are OK to use 32 bits. + if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize { + return false; + } + for inst in insts { + match *inst { + Char(_) | Ranges(_) => return false, + EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {} + } + } + true +} + +/// A reusable cache of DFA states. +/// +/// This cache is reused between multiple invocations of the same regex +/// program. (It is not shared simultaneously between threads. If there is +/// contention, then new caches are created.) +#[derive(Debug)] +pub struct Cache { + /// Group persistent DFA related cache state together. The sparse sets + /// listed below are used as scratch space while computing uncached states. + inner: CacheInner, + /// qcur and qnext are ordered sets with constant time + /// addition/membership/clearing-whole-set and linear time iteration. They + /// are used to manage the sets of NFA states in DFA states when computing + /// cached DFA states. In particular, the order of the NFA states matters + /// for leftmost-first style matching. Namely, when computing a cached + /// state, the set of NFA states stops growing as soon as the first Match + /// instruction is observed. + qcur: SparseSet, + qnext: SparseSet, +} + +/// `CacheInner` is logically just a part of Cache, but groups together fields +/// that aren't passed as function parameters throughout search. (This split +/// is mostly an artifact of the borrow checker. It is happily paid.) +#[derive(Debug)] +struct CacheInner { + /// A cache of pre-compiled DFA states, keyed by the set of NFA states + /// and the set of empty-width flags set at the byte in the input when the + /// state was observed. + /// + /// A StatePtr is effectively a `*State`, but to avoid various inconvenient + /// things, we just pass indexes around manually. The performance impact of + /// this is probably an instruction or two in the inner loop. However, on + /// 64 bit, each StatePtr is half the size of a *State. + compiled: StateMap, + /// The transition table. + /// + /// The transition table is laid out in row-major order, where states are + /// rows and the transitions for each state are columns. At a high level, + /// given state `s` and byte `b`, the next state can be found at index + /// `s * 256 + b`. + /// + /// This is, of course, a lie. A StatePtr is actually a pointer to the + /// *start* of a row in this table. When indexing in the DFA's inner loop, + /// this removes the need to multiply the StatePtr by the stride. Yes, it + /// matters. This reduces the number of states we can store, but: the + /// stride is rarely 256 since we define transitions in terms of + /// *equivalence classes* of bytes. Each class corresponds to a set of + /// bytes that never discriminate a distinct path through the DFA from each + /// other. + trans: Transitions, + /// A set of cached start states, which are limited to the number of + /// permutations of flags set just before the initial byte of input. (The + /// index into this vec is a `EmptyFlags`.) + /// + /// N.B. A start state can be "dead" (i.e., no possible match), so we + /// represent it with a StatePtr. + start_states: Vec, + /// Stack scratch space used to follow epsilon transitions in the NFA. + /// (This permits us to avoid recursion.) + /// + /// The maximum stack size is the number of NFA states. + stack: Vec, + /// The total number of times this cache has been flushed by the DFA + /// because of space constraints. + flush_count: u64, + /// The total heap size of the DFA's cache. We use this to determine when + /// we should flush the cache. + size: usize, + /// Scratch space used when building instruction pointer lists for new + /// states. This helps amortize allocation. + insts_scratch_space: Vec, +} + +/// The transition table. +/// +/// It is laid out in row-major order, with states as rows and byte class +/// transitions as columns. +/// +/// The transition table is responsible for producing valid `StatePtrs`. A +/// `StatePtr` points to the start of a particular row in this table. When +/// indexing to find the next state this allows us to avoid a multiplication +/// when computing an index into the table. +#[derive(Clone)] +struct Transitions { + /// The table. + table: Vec, + /// The stride. + num_byte_classes: usize, +} + +/// Fsm encapsulates the actual execution of the DFA. +#[derive(Debug)] +pub struct Fsm<'a> { + /// prog contains the NFA instruction opcodes. DFA execution uses either + /// the `dfa` instructions or the `dfa_reverse` instructions from + /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have + /// Unicode opcodes that cannot be executed by the DFA.) + prog: &'a Program, + /// The start state. We record it here because the pointer may change + /// when the cache is wiped. + start: StatePtr, + /// The current position in the input. + at: usize, + /// Should we quit after seeing the first match? e.g., When the caller + /// uses `is_match` or `shortest_match`. + quit_after_match: bool, + /// The last state that matched. + /// + /// When no match has occurred, this is set to STATE_UNKNOWN. + /// + /// This is only useful when matching regex sets. The last match state + /// is useful because it contains all of the match instructions seen, + /// thereby allowing us to enumerate which regexes in the set matched. + last_match_si: StatePtr, + /// The input position of the last cache flush. We use this to determine + /// if we're thrashing in the cache too often. If so, the DFA quits so + /// that we can fall back to the NFA algorithm. + last_cache_flush: usize, + /// All cached DFA information that is persisted between searches. + cache: &'a mut CacheInner, +} + +/// The result of running the DFA. +/// +/// Generally, the result is either a match or not a match, but sometimes the +/// DFA runs too slowly because the cache size is too small. In that case, it +/// gives up with the intent of falling back to the NFA algorithm. +/// +/// The DFA can also give up if it runs out of room to create new states, or if +/// it sees non-ASCII bytes in the presence of a Unicode word boundary. +#[derive(Clone, Debug)] +pub enum Result { + Match(T), + NoMatch(usize), + Quit, +} + +impl Result { + /// Returns true if this result corresponds to a match. + pub fn is_match(&self) -> bool { + match *self { + Result::Match(_) => true, + Result::NoMatch(_) | Result::Quit => false, + } + } + + /// Maps the given function onto T and returns the result. + /// + /// If this isn't a match, then this is a no-op. + #[cfg(feature = "perf-literal")] + pub fn map U>(self, mut f: F) -> Result { + match self { + Result::Match(t) => Result::Match(f(t)), + Result::NoMatch(x) => Result::NoMatch(x), + Result::Quit => Result::Quit, + } + } + + /// Sets the non-match position. + /// + /// If this isn't a non-match, then this is a no-op. + fn set_non_match(self, at: usize) -> Result { + match self { + Result::NoMatch(_) => Result::NoMatch(at), + r => r, + } + } +} + +/// `State` is a DFA state. It contains an ordered set of NFA states (not +/// necessarily complete) and a smattering of flags. +/// +/// The flags are packed into the first byte of data. +/// +/// States don't carry their transitions. Instead, transitions are stored in +/// a single row-major table. +/// +/// Delta encoding is used to store the instruction pointers. +/// The first instruction pointer is stored directly starting +/// at data[1], and each following pointer is stored as an offset +/// to the previous one. If a delta is in the range -127..127, +/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8) +/// is coded as a flag, followed by 4 bytes encoding the delta. +#[derive(Clone, Eq, Hash, PartialEq)] +struct State { + data: Arc<[u8]>, +} + +/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes +/// an NFA state). +/// +/// Throughout this library, this is usually set to `usize`, but we force a +/// `u32` here for the DFA to save on space. +type InstPtr = u32; + +/// Adds ip to data using delta encoding with respect to prev. +/// +/// After completion, `data` will contain `ip` and `prev` will be set to `ip`. +fn push_inst_ptr(data: &mut Vec, prev: &mut InstPtr, ip: InstPtr) { + let delta = (ip as i32) - (*prev as i32); + write_vari32(data, delta); + *prev = ip; +} + +struct InstPtrs<'a> { + base: usize, + data: &'a [u8], +} + +impl<'a> Iterator for InstPtrs<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + if self.data.is_empty() { + return None; + } + let (delta, nread) = read_vari32(self.data); + let base = self.base as i32 + delta; + debug_assert!(base >= 0); + debug_assert!(nread > 0); + self.data = &self.data[nread..]; + self.base = base as usize; + Some(self.base) + } +} + +impl State { + fn flags(&self) -> StateFlags { + StateFlags(self.data[0]) + } + + fn inst_ptrs(&self) -> InstPtrs<'_> { + InstPtrs { base: 0, data: &self.data[1..] } + } +} + +/// `StatePtr` is a 32 bit pointer to the start of a row in the transition +/// table. +/// +/// It has many special values. There are two types of special values: +/// sentinels and flags. +/// +/// Sentinels corresponds to special states that carry some kind of +/// significance. There are three such states: unknown, dead and quit states. +/// +/// Unknown states are states that haven't been computed yet. They indicate +/// that a transition should be filled in that points to either an existing +/// cached state or a new state altogether. In general, an unknown state means +/// "follow the NFA's epsilon transitions." +/// +/// Dead states are states that can never lead to a match, no matter what +/// subsequent input is observed. This means that the DFA should quit +/// immediately and return the longest match it has found thus far. +/// +/// Quit states are states that imply the DFA is not capable of matching the +/// regex correctly. Currently, this is only used when a Unicode word boundary +/// exists in the regex *and* a non-ASCII byte is observed. +/// +/// The other type of state pointer is a state pointer with special flag bits. +/// There are two flags: a start flag and a match flag. The lower bits of both +/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX` +/// mask). +/// +/// The start flag means that the state is a start state, and therefore may be +/// subject to special prefix scanning optimizations. +/// +/// The match flag means that the state is a match state, and therefore the +/// current position in the input (while searching) should be recorded. +/// +/// The above exists mostly in the service of making the inner loop fast. +/// In particular, the inner *inner* loop looks something like this: +/// +/// ```ignore +/// while state <= STATE_MAX and i < len(text): +/// state = state.next[i] +/// ``` +/// +/// This is nice because it lets us execute a lazy DFA as if it were an +/// entirely offline DFA (i.e., with very few instructions). The loop will +/// quit only when we need to examine a case that needs special attention. +type StatePtr = u32; + +/// An unknown state means that the state has not been computed yet, and that +/// the only way to progress is to compute it. +const STATE_UNKNOWN: StatePtr = 1 << 31; + +/// A dead state means that the state has been computed and it is known that +/// once it is entered, no future match can ever occur. +const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1; + +/// A quit state means that the DFA came across some input that it doesn't +/// know how to process correctly. The DFA should quit and another matching +/// engine should be run in its place. +const STATE_QUIT: StatePtr = STATE_DEAD + 1; + +/// A start state is a state that the DFA can start in. +/// +/// Note that start states have their lower bits set to a state pointer. +const STATE_START: StatePtr = 1 << 30; + +/// A match state means that the regex has successfully matched. +/// +/// Note that match states have their lower bits set to a state pointer. +const STATE_MATCH: StatePtr = 1 << 29; + +/// The maximum state pointer. This is useful to mask out the "valid" state +/// pointer from a state with the "start" or "match" bits set. +/// +/// It doesn't make sense to use this with unknown, dead or quit state +/// pointers, since those pointers are sentinels and never have their lower +/// bits set to anything meaningful. +const STATE_MAX: StatePtr = STATE_MATCH - 1; + +/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the +/// special EOF sentinel value. +#[derive(Copy, Clone, Debug)] +struct Byte(u16); + +/// A set of flags for zero-width assertions. +#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)] +struct EmptyFlags { + start: bool, + end: bool, + start_line: bool, + end_line: bool, + word_boundary: bool, + not_word_boundary: bool, +} + +/// A set of flags describing various configurations of a DFA state. This is +/// represented by a `u8` so that it is compact. +#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)] +struct StateFlags(u8); + +impl Cache { + /// Create new empty cache for the DFA engine. + pub fn new(prog: &Program) -> Self { + // We add 1 to account for the special EOF byte. + let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; + let starts = vec![STATE_UNKNOWN; 256]; + let mut cache = Cache { + inner: CacheInner { + compiled: StateMap::new(num_byte_classes), + trans: Transitions::new(num_byte_classes), + start_states: starts, + stack: vec![], + flush_count: 0, + size: 0, + insts_scratch_space: vec![], + }, + qcur: SparseSet::new(prog.insts.len()), + qnext: SparseSet::new(prog.insts.len()), + }; + cache.inner.reset_size(); + cache + } +} + +impl CacheInner { + /// Resets the cache size to account for fixed costs, such as the program + /// and stack sizes. + fn reset_size(&mut self) { + self.size = (self.start_states.len() * mem::size_of::()) + + (self.stack.len() * mem::size_of::()); + } +} + +impl<'a> Fsm<'a> { + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn forward( + prog: &'a Program, + cache: &ProgramCache, + quit_after_match: bool, + text: &[u8], + at: usize, + ) -> Result { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn reverse( + prog: &'a Program, + cache: &ProgramCache, + quit_after_match: bool, + text: &[u8], + at: usize, + ) -> Result { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa_reverse; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn forward_many( + prog: &'a Program, + cache: &ProgramCache, + matches: &mut [bool], + text: &[u8], + at: usize, + ) -> Result { + debug_assert!(matches.len() == prog.matches.len()); + let mut cache = cache.borrow_mut(); + let cache = &mut cache.dfa; + let mut dfa = Fsm { + prog, + start: 0, // filled in below + at, + quit_after_match: false, + last_match_si: STATE_UNKNOWN, + last_cache_flush: at, + cache: &mut cache.inner, + }; + let (empty_flags, state_flags) = dfa.start_flags(text, at); + dfa.start = + match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) { + None => return Result::Quit, + Some(STATE_DEAD) => return Result::NoMatch(at), + Some(si) => si, + }; + debug_assert!(dfa.start != STATE_UNKNOWN); + let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text); + if result.is_match() { + if matches.len() == 1 { + matches[0] = true; + } else { + debug_assert!(dfa.last_match_si != STATE_UNKNOWN); + debug_assert!(dfa.last_match_si != STATE_DEAD); + for ip in dfa.state(dfa.last_match_si).inst_ptrs() { + if let Inst::Match(slot) = dfa.prog[ip] { + matches[slot] = true; + } + } + } + } + result + } + + /// Executes the DFA on a forward NFA. + /// + /// {qcur,qnext} are scratch ordered sets which may be non-empty. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_at( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + text: &[u8], + ) -> Result { + // For the most part, the DFA is basically: + // + // last_match = null + // while current_byte != EOF: + // si = current_state.next[current_byte] + // if si is match + // last_match = si + // return last_match + // + // However, we need to deal with a few things: + // + // 1. This is an *online* DFA, so the current state's next list + // may not point to anywhere yet, so we must go out and compute + // them. (They are then cached into the current state's next list + // to avoid re-computation.) + // 2. If we come across a state that is known to be dead (i.e., never + // leads to a match), then we can quit early. + // 3. If the caller just wants to know if a match occurs, then we + // can quit as soon as we know we have a match. (Full leftmost + // first semantics require continuing on.) + // 4. If we're in the start state, then we can use a pre-computed set + // of prefix literals to skip quickly along the input. + // 5. After the input is exhausted, we run the DFA on one symbol + // that stands for EOF. This is useful for handling empty width + // assertions. + // 6. We can't actually do state.next[byte]. Instead, we have to do + // state.next[byte_classes[byte]], which permits us to keep the + // 'next' list very small. + // + // Since there's a bunch of extra stuff we need to consider, we do some + // pretty hairy tricks to get the inner loop to run as fast as + // possible. + debug_assert!(!self.prog.is_reverse); + + // The last match is the currently known ending match position. It is + // reported as an index to the most recent byte that resulted in a + // transition to a match state and is always stored in capture slot `1` + // when searching forwards. Its maximum value is `text.len()`. + let mut result = Result::NoMatch(self.at); + let (mut prev_si, mut next_si) = (self.start, self.start); + let mut at = self.at; + while at < text.len() { + // This is the real inner loop. We take advantage of special bits + // set in the state pointer to determine whether a state is in the + // "common" case or not. Specifically, the common case is a + // non-match non-start non-dead state that has already been + // computed. So long as we remain in the common case, this inner + // loop will chew through the input. + // + // We also unroll the loop 4 times to amortize the cost of checking + // whether we've consumed the entire input. We are also careful + // to make sure that `prev_si` always represents the previous state + // and `next_si` always represents the next state after the loop + // exits, even if it isn't always true inside the loop. + while next_si <= STATE_MAX && at < text.len() { + // Argument for safety is in the definition of next_si. + prev_si = unsafe { self.next_si(next_si, text, at) }; + at += 1; + if prev_si > STATE_MAX || at + 2 >= text.len() { + mem::swap(&mut prev_si, &mut next_si); + break; + } + next_si = unsafe { self.next_si(prev_si, text, at) }; + at += 1; + if next_si > STATE_MAX { + break; + } + prev_si = unsafe { self.next_si(next_si, text, at) }; + at += 1; + if prev_si > STATE_MAX { + mem::swap(&mut prev_si, &mut next_si); + break; + } + next_si = unsafe { self.next_si(prev_si, text, at) }; + at += 1; + } + if next_si & STATE_MATCH > 0 { + // A match state is outside of the common case because it needs + // special case analysis. In particular, we need to record the + // last position as having matched and possibly quit the DFA if + // we don't need to keep matching. + next_si &= !STATE_MATCH; + result = Result::Match(at - 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + prev_si = next_si; + + // This permits short-circuiting when matching a regex set. + // In particular, if this DFA state contains only match states, + // then it's impossible to extend the set of matches since + // match states are final. Therefore, we can quit. + if self.prog.matches.len() > 1 { + let state = self.state(next_si); + let just_matches = + state.inst_ptrs().all(|ip| self.prog[ip].is_match()); + if just_matches { + return result; + } + } + + // Another inner loop! If the DFA stays in this particular + // match state, then we can rip through all of the input + // very quickly, and only recording the match location once + // we've left this particular state. + let cur = at; + while (next_si & !STATE_MATCH) == prev_si + && at + 2 < text.len() + { + // Argument for safety is in the definition of next_si. + next_si = unsafe { + self.next_si(next_si & !STATE_MATCH, text, at) + }; + at += 1; + } + if at > cur { + result = Result::Match(at - 2); + } + } else if next_si & STATE_START > 0 { + // A start state isn't in the common case because we may + // want to do quick prefix scanning. If the program doesn't + // have a detected prefix, then start states are actually + // considered common and this case is never reached. + debug_assert!(self.has_prefix()); + next_si &= !STATE_START; + prev_si = next_si; + at = match self.prefix_at(text, at) { + None => return Result::NoMatch(text.len()), + Some(i) => i, + }; + } else if next_si >= STATE_UNKNOWN { + if next_si == STATE_QUIT { + return Result::Quit; + } + // Finally, this corresponds to the case where the transition + // entered a state that can never lead to a match or a state + // that hasn't been computed yet. The latter being the "slow" + // path. + let byte = Byte::byte(text[at - 1]); + // We no longer care about the special bits in the state + // pointer. + prev_si &= STATE_MAX; + // Record where we are. This is used to track progress for + // determining whether we should quit if we've flushed the + // cache too much. + self.at = at; + next_si = match self.next_state(qcur, qnext, prev_si, byte) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(at), + Some(si) => si, + }; + debug_assert!(next_si != STATE_UNKNOWN); + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at - 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + } + prev_si = next_si; + } else { + prev_si = next_si; + } + } + + // Run the DFA once more on the special EOF sentinel value. + // We don't care about the special bits in the state pointer any more, + // so get rid of them. + prev_si &= STATE_MAX; + prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(text.len()), + Some(si) => si & !STATE_START, + }; + debug_assert!(prev_si != STATE_UNKNOWN); + if prev_si & STATE_MATCH > 0 { + prev_si &= !STATE_MATCH; + self.last_match_si = prev_si; + result = Result::Match(text.len()); + } + result + } + + /// Executes the DFA on a reverse NFA. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_at_reverse( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + text: &[u8], + ) -> Result { + // The comments in `exec_at` above mostly apply here too. The main + // difference is that we move backwards over the input and we look for + // the longest possible match instead of the leftmost-first match. + // + // N.B. The code duplication here is regrettable. Efforts to improve + // it without sacrificing performance are welcome. ---AG + debug_assert!(self.prog.is_reverse); + let mut result = Result::NoMatch(self.at); + let (mut prev_si, mut next_si) = (self.start, self.start); + let mut at = self.at; + while at > 0 { + while next_si <= STATE_MAX && at > 0 { + // Argument for safety is in the definition of next_si. + at -= 1; + prev_si = unsafe { self.next_si(next_si, text, at) }; + if prev_si > STATE_MAX || at <= 4 { + mem::swap(&mut prev_si, &mut next_si); + break; + } + at -= 1; + next_si = unsafe { self.next_si(prev_si, text, at) }; + if next_si > STATE_MAX { + break; + } + at -= 1; + prev_si = unsafe { self.next_si(next_si, text, at) }; + if prev_si > STATE_MAX { + mem::swap(&mut prev_si, &mut next_si); + break; + } + at -= 1; + next_si = unsafe { self.next_si(prev_si, text, at) }; + } + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at + 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + prev_si = next_si; + let cur = at; + while (next_si & !STATE_MATCH) == prev_si && at >= 2 { + // Argument for safety is in the definition of next_si. + at -= 1; + next_si = unsafe { + self.next_si(next_si & !STATE_MATCH, text, at) + }; + } + if at < cur { + result = Result::Match(at + 2); + } + } else if next_si >= STATE_UNKNOWN { + if next_si == STATE_QUIT { + return Result::Quit; + } + let byte = Byte::byte(text[at]); + prev_si &= STATE_MAX; + self.at = at; + next_si = match self.next_state(qcur, qnext, prev_si, byte) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(at), + Some(si) => si, + }; + debug_assert!(next_si != STATE_UNKNOWN); + if next_si & STATE_MATCH > 0 { + next_si &= !STATE_MATCH; + result = Result::Match(at + 1); + if self.quit_after_match { + return result; + } + self.last_match_si = next_si; + } + prev_si = next_si; + } else { + prev_si = next_si; + } + } + + // Run the DFA once more on the special EOF sentinel value. + prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) { + None => return Result::Quit, + Some(STATE_DEAD) => return result.set_non_match(0), + Some(si) => si, + }; + debug_assert!(prev_si != STATE_UNKNOWN); + if prev_si & STATE_MATCH > 0 { + prev_si &= !STATE_MATCH; + self.last_match_si = prev_si; + result = Result::Match(0); + } + result + } + + /// next_si transitions to the next state, where the transition input + /// corresponds to text[i]. + /// + /// This elides bounds checks, and is therefore not safe. + #[cfg_attr(feature = "perf-inline", inline(always))] + unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr { + // What is the argument for safety here? + // We have three unchecked accesses that could possibly violate safety: + // + // 1. The given byte of input (`text[i]`). + // 2. The class of the byte of input (`classes[text[i]]`). + // 3. The transition for the class (`trans[si + cls]`). + // + // (1) is only safe when calling next_si is guarded by + // `i < text.len()`. + // + // (2) is the easiest case to guarantee since `text[i]` is always a + // `u8` and `self.prog.byte_classes` always has length `u8::MAX`. + // (See `ByteClassSet.byte_classes` in `compile.rs`.) + // + // (3) is only safe if (1)+(2) are safe. Namely, the transitions + // of every state are defined to have length equal to the number of + // byte classes in the program. Therefore, a valid class leads to a + // valid transition. (All possible transitions are valid lookups, even + // if it points to a state that hasn't been computed yet.) (3) also + // relies on `si` being correct, but StatePtrs should only ever be + // retrieved from the transition table, which ensures they are correct. + debug_assert!(i < text.len()); + let b = *text.get_unchecked(i); + debug_assert!((b as usize) < self.prog.byte_classes.len()); + let cls = *self.prog.byte_classes.get_unchecked(b as usize); + self.cache.trans.next_unchecked(si, cls as usize) + } + + /// Computes the next state given the current state and the current input + /// byte (which may be EOF). + /// + /// If STATE_DEAD is returned, then there is no valid state transition. + /// This implies that no permutation of future input can lead to a match + /// state. + /// + /// STATE_UNKNOWN can never be returned. + fn exec_byte( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + mut si: StatePtr, + b: Byte, + ) -> Option { + use crate::prog::Inst::*; + + // Initialize a queue with the current DFA state's NFA states. + qcur.clear(); + for ip in self.state(si).inst_ptrs() { + qcur.insert(ip); + } + + // Before inspecting the current byte, we may need to also inspect + // whether the position immediately preceding the current byte + // satisfies the empty assertions found in the current state. + // + // We only need to do this step if there are any empty assertions in + // the current state. + let is_word_last = self.state(si).flags().is_word(); + let is_word = b.is_ascii_word(); + if self.state(si).flags().has_empty() { + // Compute the flags immediately preceding the current byte. + // This means we only care about the "end" or "end line" flags. + // (The "start" flags are computed immediately following the + // current byte and are handled below.) + let mut flags = EmptyFlags::default(); + if b.is_eof() { + flags.end = true; + flags.end_line = true; + } else if b.as_byte().map_or(false, |b| b == b'\n') { + flags.end_line = true; + } + if is_word_last == is_word { + flags.not_word_boundary = true; + } else { + flags.word_boundary = true; + } + // Now follow epsilon transitions from every NFA state, but make + // sure we only follow transitions that satisfy our flags. + qnext.clear(); + for &ip in &*qcur { + self.follow_epsilons(usize_to_u32(ip), qnext, flags); + } + mem::swap(qcur, qnext); + } + + // Now we set flags for immediately after the current byte. Since start + // states are processed separately, and are the only states that can + // have the StartText flag set, we therefore only need to worry about + // the StartLine flag here. + // + // We do also keep track of whether this DFA state contains a NFA state + // that is a matching state. This is precisely how we delay the DFA + // matching by one byte in order to process the special EOF sentinel + // byte. Namely, if this DFA state containing a matching NFA state, + // then it is the *next* DFA state that is marked as a match. + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n'); + if b.is_ascii_word() { + state_flags.set_word(); + } + // Now follow all epsilon transitions again, but only after consuming + // the current byte. + qnext.clear(); + for &ip in &*qcur { + match self.prog[ip as usize] { + // These states never happen in a byte-based program. + Char(_) | Ranges(_) => unreachable!(), + // These states are handled when following epsilon transitions. + Save(_) | Split(_) | EmptyLook(_) => {} + Match(_) => { + state_flags.set_match(); + if !self.continue_past_first_match() { + break; + } else if self.prog.matches.len() > 1 + && !qnext.contains(ip as usize) + { + // If we are continuing on to find other matches, + // then keep a record of the match states we've seen. + qnext.insert(ip); + } + } + Bytes(ref inst) => { + if b.as_byte().map_or(false, |b| inst.matches(b)) { + self.follow_epsilons( + inst.goto as InstPtr, + qnext, + empty_flags, + ); + } + } + } + } + + let cache = if b.is_eof() && self.prog.matches.len() > 1 { + // If we're processing the last byte of the input and we're + // matching a regex set, then make the next state contain the + // previous states transitions. We do this so that the main + // matching loop can extract all of the match instructions. + mem::swap(qcur, qnext); + // And don't cache this state because it's totally bunk. + false + } else { + true + }; + + // We've now built up the set of NFA states that ought to comprise the + // next DFA state, so try to find it in the cache, and if it doesn't + // exist, cache it. + // + // N.B. We pass `&mut si` here because the cache may clear itself if + // it has gotten too full. When that happens, the location of the + // current state may change. + let mut next = + match self.cached_state(qnext, state_flags, Some(&mut si)) { + None => return None, + Some(next) => next, + }; + if (self.start & !STATE_START) == next { + // Start states can never be match states since all matches are + // delayed by one byte. + debug_assert!(!self.state(next).flags().is_match()); + next = self.start_ptr(next); + } + if next <= STATE_MAX && self.state(next).flags().is_match() { + next |= STATE_MATCH; + } + debug_assert!(next != STATE_UNKNOWN); + // And now store our state in the current state's next list. + if cache { + let cls = self.byte_class(b); + self.cache.trans.set_next(si, cls, next); + } + Some(next) + } + + /// Follows the epsilon transitions starting at (and including) `ip`. The + /// resulting states are inserted into the ordered set `q`. + /// + /// Conditional epsilon transitions (i.e., empty width assertions) are only + /// followed if they are satisfied by the given flags, which should + /// represent the flags set at the current location in the input. + /// + /// If the current location corresponds to the empty string, then only the + /// end line and/or end text flags may be set. If the current location + /// corresponds to a real byte in the input, then only the start line + /// and/or start text flags may be set. + /// + /// As an exception to the above, when finding the initial state, any of + /// the above flags may be set: + /// + /// If matching starts at the beginning of the input, then start text and + /// start line should be set. If the input is empty, then end text and end + /// line should also be set. + /// + /// If matching starts after the beginning of the input, then only start + /// line should be set if the preceding byte is `\n`. End line should never + /// be set in this case. (Even if the following byte is a `\n`, it will + /// be handled in a subsequent DFA state.) + fn follow_epsilons( + &mut self, + ip: InstPtr, + q: &mut SparseSet, + flags: EmptyFlags, + ) { + use crate::prog::EmptyLook::*; + use crate::prog::Inst::*; + + // We need to traverse the NFA to follow epsilon transitions, so avoid + // recursion with an explicit stack. + self.cache.stack.push(ip); + while let Some(mut ip) = self.cache.stack.pop() { + // Try to munch through as many states as possible without + // pushes/pops to the stack. + loop { + // Don't visit states we've already added. + if q.contains(ip as usize) { + break; + } + q.insert(ip as usize); + match self.prog[ip as usize] { + Char(_) | Ranges(_) => unreachable!(), + Match(_) | Bytes(_) => { + break; + } + EmptyLook(ref inst) => { + // Only follow empty assertion states if our flags + // satisfy the assertion. + match inst.look { + StartLine if flags.start_line => { + ip = inst.goto as InstPtr; + } + EndLine if flags.end_line => { + ip = inst.goto as InstPtr; + } + StartText if flags.start => { + ip = inst.goto as InstPtr; + } + EndText if flags.end => { + ip = inst.goto as InstPtr; + } + WordBoundaryAscii if flags.word_boundary => { + ip = inst.goto as InstPtr; + } + NotWordBoundaryAscii + if flags.not_word_boundary => + { + ip = inst.goto as InstPtr; + } + WordBoundary if flags.word_boundary => { + ip = inst.goto as InstPtr; + } + NotWordBoundary if flags.not_word_boundary => { + ip = inst.goto as InstPtr; + } + StartLine | EndLine | StartText | EndText + | WordBoundaryAscii | NotWordBoundaryAscii + | WordBoundary | NotWordBoundary => { + break; + } + } + } + Save(ref inst) => { + ip = inst.goto as InstPtr; + } + Split(ref inst) => { + self.cache.stack.push(inst.goto2 as InstPtr); + ip = inst.goto1 as InstPtr; + } + } + } + } + } + + /// Find a previously computed state matching the given set of instructions + /// and is_match bool. + /// + /// The given set of instructions should represent a single state in the + /// NFA along with all states reachable without consuming any input. + /// + /// The is_match bool should be true if and only if the preceding DFA state + /// contains an NFA matching state. The cached state produced here will + /// then signify a match. (This enables us to delay a match by one byte, + /// in order to account for the EOF sentinel byte.) + /// + /// If the cache is full, then it is wiped before caching a new state. + /// + /// The current state should be specified if it exists, since it will need + /// to be preserved if the cache clears itself. (Start states are + /// always saved, so they should not be passed here.) It takes a mutable + /// pointer to the index because if the cache is cleared, the state's + /// location may change. + fn cached_state( + &mut self, + q: &SparseSet, + mut state_flags: StateFlags, + current_state: Option<&mut StatePtr>, + ) -> Option { + // If we couldn't come up with a non-empty key to represent this state, + // then it is dead and can never lead to a match. + // + // Note that inst_flags represent the set of empty width assertions + // in q. We use this as an optimization in exec_byte to determine when + // we should follow epsilon transitions at the empty string preceding + // the current byte. + let key = match self.cached_state_key(q, &mut state_flags) { + None => return Some(STATE_DEAD), + Some(v) => v, + }; + // In the cache? Cool. Done. + if let Some(si) = self.cache.compiled.get_ptr(&key) { + return Some(si); + } + // If the cache has gotten too big, wipe it. + if self.approximate_size() > self.prog.dfa_size_limit + && !self.clear_cache_and_save(current_state) + { + // Ooops. DFA is giving up. + return None; + } + // Allocate room for our state and add it. + self.add_state(key) + } + + /// Produces a key suitable for describing a state in the DFA cache. + /// + /// The key invariant here is that equivalent keys are produced for any two + /// sets of ordered NFA states (and toggling of whether the previous NFA + /// states contain a match state) that do not discriminate a match for any + /// input. + /// + /// Specifically, q should be an ordered set of NFA states and is_match + /// should be true if and only if the previous NFA states contained a match + /// state. + fn cached_state_key( + &mut self, + q: &SparseSet, + state_flags: &mut StateFlags, + ) -> Option { + use crate::prog::Inst::*; + + // We need to build up enough information to recognize pre-built states + // in the DFA. Generally speaking, this includes every instruction + // except for those which are purely epsilon transitions, e.g., the + // Save and Split instructions. + // + // Empty width assertions are also epsilon transitions, but since they + // are conditional, we need to make them part of a state's key in the + // cache. + + let mut insts = + mem::replace(&mut self.cache.insts_scratch_space, vec![]); + insts.clear(); + // Reserve 1 byte for flags. + insts.push(0); + + let mut prev = 0; + for &ip in q { + let ip = usize_to_u32(ip); + match self.prog[ip as usize] { + Char(_) | Ranges(_) => unreachable!(), + Save(_) | Split(_) => {} + Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip), + EmptyLook(_) => { + state_flags.set_empty(); + push_inst_ptr(&mut insts, &mut prev, ip) + } + Match(_) => { + push_inst_ptr(&mut insts, &mut prev, ip); + if !self.continue_past_first_match() { + break; + } + } + } + } + // If we couldn't transition to any other instructions and we didn't + // see a match when expanding NFA states previously, then this is a + // dead state and no amount of additional input can transition out + // of this state. + let opt_state = if insts.len() == 1 && !state_flags.is_match() { + None + } else { + let StateFlags(f) = *state_flags; + insts[0] = f; + Some(State { data: Arc::from(&*insts) }) + }; + self.cache.insts_scratch_space = insts; + opt_state + } + + /// Clears the cache, but saves and restores current_state if it is not + /// none. + /// + /// The current state must be provided here in case its location in the + /// cache changes. + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache_and_save( + &mut self, + current_state: Option<&mut StatePtr>, + ) -> bool { + if self.cache.compiled.is_empty() { + // Nothing to clear... + return true; + } + match current_state { + None => self.clear_cache(), + Some(si) => { + let cur = self.state(*si).clone(); + if !self.clear_cache() { + return false; + } + // The unwrap is OK because we just cleared the cache and + // therefore know that the next state pointer won't exceed + // STATE_MAX. + *si = self.restore_state(cur).unwrap(); + true + } + } + } + + /// Wipes the state cache, but saves and restores the current start state. + /// + /// This returns false if the cache is not cleared and the DFA should + /// give up. + fn clear_cache(&mut self) -> bool { + // Bail out of the DFA if we're moving too "slowly." + // A heuristic from RE2: assume the DFA is too slow if it is processing + // 10 or fewer bytes per state. + // Additionally, we permit the cache to be flushed a few times before + // caling it quits. + let nstates = self.cache.compiled.len(); + if self.cache.flush_count >= 3 + && self.at >= self.last_cache_flush + && (self.at - self.last_cache_flush) <= 10 * nstates + { + return false; + } + // Update statistics tracking cache flushes. + self.last_cache_flush = self.at; + self.cache.flush_count += 1; + + // OK, actually flush the cache. + let start = self.state(self.start & !STATE_START).clone(); + let last_match = if self.last_match_si <= STATE_MAX { + Some(self.state(self.last_match_si).clone()) + } else { + None + }; + self.cache.reset_size(); + self.cache.trans.clear(); + self.cache.compiled.clear(); + for s in &mut self.cache.start_states { + *s = STATE_UNKNOWN; + } + // The unwraps are OK because we just cleared the cache and therefore + // know that the next state pointer won't exceed STATE_MAX. + let start_ptr = self.restore_state(start).unwrap(); + self.start = self.start_ptr(start_ptr); + if let Some(last_match) = last_match { + self.last_match_si = self.restore_state(last_match).unwrap(); + } + true + } + + /// Restores the given state back into the cache, and returns a pointer + /// to it. + fn restore_state(&mut self, state: State) -> Option { + // If we've already stored this state, just return a pointer to it. + // None will be the wiser. + if let Some(si) = self.cache.compiled.get_ptr(&state) { + return Some(si); + } + self.add_state(state) + } + + /// Returns the next state given the current state si and current byte + /// b. {qcur,qnext} are used as scratch space for storing ordered NFA + /// states. + /// + /// This tries to fetch the next state from the cache, but if that fails, + /// it computes the next state, caches it and returns a pointer to it. + /// + /// The pointer can be to a real state, or it can be STATE_DEAD. + /// STATE_UNKNOWN cannot be returned. + /// + /// None is returned if a new state could not be allocated (i.e., the DFA + /// ran out of space and thinks it's running too slowly). + fn next_state( + &mut self, + qcur: &mut SparseSet, + qnext: &mut SparseSet, + si: StatePtr, + b: Byte, + ) -> Option { + if si == STATE_DEAD { + return Some(STATE_DEAD); + } + match self.cache.trans.next(si, self.byte_class(b)) { + STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b), + STATE_QUIT => None, + nsi => Some(nsi), + } + } + + /// Computes and returns the start state, where searching begins at + /// position `at` in `text`. If the state has already been computed, + /// then it is pulled from the cache. If the state hasn't been cached, + /// then it is computed, cached and a pointer to it is returned. + /// + /// This may return STATE_DEAD but never STATE_UNKNOWN. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn start_state( + &mut self, + q: &mut SparseSet, + empty_flags: EmptyFlags, + state_flags: StateFlags, + ) -> Option { + // Compute an index into our cache of start states based on the set + // of empty/state flags set at the current position in the input. We + // don't use every flag since not all flags matter. For example, since + // matches are delayed by one byte, start states can never be match + // states. + let flagi = { + (((empty_flags.start as u8) << 0) + | ((empty_flags.end as u8) << 1) + | ((empty_flags.start_line as u8) << 2) + | ((empty_flags.end_line as u8) << 3) + | ((empty_flags.word_boundary as u8) << 4) + | ((empty_flags.not_word_boundary as u8) << 5) + | ((state_flags.is_word() as u8) << 6)) as usize + }; + match self.cache.start_states[flagi] { + STATE_UNKNOWN => {} + si => return Some(si), + } + q.clear(); + let start = usize_to_u32(self.prog.start); + self.follow_epsilons(start, q, empty_flags); + // Start states can never be match states because we delay every match + // by one byte. Given an empty string and an empty match, the match + // won't actually occur until the DFA processes the special EOF + // sentinel byte. + let sp = match self.cached_state(q, state_flags, None) { + None => return None, + Some(sp) => self.start_ptr(sp), + }; + self.cache.start_states[flagi] = sp; + Some(sp) + } + + /// Computes the set of starting flags for the given position in text. + /// + /// This should only be used when executing the DFA forwards over the + /// input. + fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) { + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start = at == 0; + empty_flags.end = text.is_empty(); + empty_flags.start_line = at == 0 || text[at - 1] == b'\n'; + empty_flags.end_line = text.is_empty(); + + let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word(); + if is_word_last { + state_flags.set_word(); + } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } + (empty_flags, state_flags) + } + + /// Computes the set of starting flags for the given position in text. + /// + /// This should only be used when executing the DFA in reverse over the + /// input. + fn start_flags_reverse( + &self, + text: &[u8], + at: usize, + ) -> (EmptyFlags, StateFlags) { + let mut empty_flags = EmptyFlags::default(); + let mut state_flags = StateFlags::default(); + empty_flags.start = at == text.len(); + empty_flags.end = text.is_empty(); + empty_flags.start_line = at == text.len() || text[at] == b'\n'; + empty_flags.end_line = text.is_empty(); + + let is_word_last = + at < text.len() && Byte::byte(text[at]).is_ascii_word(); + let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word(); + if is_word_last { + state_flags.set_word(); + } + if is_word == is_word_last { + empty_flags.not_word_boundary = true; + } else { + empty_flags.word_boundary = true; + } + (empty_flags, state_flags) + } + + /// Returns a reference to a State given a pointer to it. + fn state(&self, si: StatePtr) -> &State { + self.cache.compiled.get_state(si).unwrap() + } + + /// Adds the given state to the DFA. + /// + /// This allocates room for transitions out of this state in + /// self.cache.trans. The transitions can be set with the returned + /// StatePtr. + /// + /// If None is returned, then the state limit was reached and the DFA + /// should quit. + fn add_state(&mut self, state: State) -> Option { + // This will fail if the next state pointer exceeds STATE_PTR. In + // practice, the cache limit will prevent us from ever getting here, + // but maybe callers will set the cache size to something ridiculous... + let si = match self.cache.trans.add() { + None => return None, + Some(si) => si, + }; + // If the program has a Unicode word boundary, then set any transitions + // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a + // transition, then it will quit and an alternative matching engine + // will take over. + if self.prog.has_unicode_word_boundary { + for b in 128..256 { + let cls = self.byte_class(Byte::byte(b as u8)); + self.cache.trans.set_next(si, cls, STATE_QUIT); + } + } + // Finally, put our actual state on to our heap of states and index it + // so we can find it later. + self.cache.size += self.cache.trans.state_heap_size() + + state.data.len() + + (2 * mem::size_of::()) + + mem::size_of::(); + self.cache.compiled.insert(state, si); + // Transition table and set of states and map should all be in sync. + debug_assert!( + self.cache.compiled.len() == self.cache.trans.num_states() + ); + Some(si) + } + + /// Quickly finds the next occurrence of any literal prefixes in the regex. + /// If there are no literal prefixes, then the current position is + /// returned. If there are literal prefixes and one could not be found, + /// then None is returned. + /// + /// This should only be called when the DFA is in a start state. + fn prefix_at(&self, text: &[u8], at: usize) -> Option { + self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s) + } + + /// Returns the number of byte classes required to discriminate transitions + /// in each state. + /// + /// invariant: num_byte_classes() == len(State.next) + fn num_byte_classes(&self) -> usize { + // We add 1 to account for the special EOF byte. + (self.prog.byte_classes[255] as usize + 1) + 1 + } + + /// Given an input byte or the special EOF sentinel, return its + /// corresponding byte class. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn byte_class(&self, b: Byte) -> usize { + match b.as_byte() { + None => self.num_byte_classes() - 1, + Some(b) => self.u8_class(b), + } + } + + /// Like byte_class, but explicitly for u8s. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn u8_class(&self, b: u8) -> usize { + self.prog.byte_classes[b as usize] as usize + } + + /// Returns true if the DFA should continue searching past the first match. + /// + /// Leftmost first semantics in the DFA are preserved by not following NFA + /// transitions after the first match is seen. + /// + /// On occasion, we want to avoid leftmost first semantics to find either + /// the longest match (for reverse search) or all possible matches (for + /// regex sets). + fn continue_past_first_match(&self) -> bool { + self.prog.is_reverse || self.prog.matches.len() > 1 + } + + /// Returns true if there is a prefix we can quickly search for. + fn has_prefix(&self) -> bool { + !self.prog.is_reverse + && !self.prog.prefixes.is_empty() + && !self.prog.is_anchored_start + } + + /// Sets the STATE_START bit in the given state pointer if and only if + /// we have a prefix to scan for. + /// + /// If there's no prefix, then it's a waste to treat the start state + /// specially. + fn start_ptr(&self, si: StatePtr) -> StatePtr { + if self.has_prefix() { + si | STATE_START + } else { + si + } + } + + /// Approximate size returns the approximate heap space currently used by + /// the DFA. It is used to determine whether the DFA's state cache needs to + /// be wiped. Namely, it is possible that for certain regexes on certain + /// inputs, a new state could be created for every byte of input. (This is + /// bad for memory use, so we bound it with a cache.) + fn approximate_size(&self) -> usize { + self.cache.size + self.prog.approximate_size() + } +} + +/// An abstraction for representing a map of states. The map supports two +/// different ways of state lookup. One is fast constant time access via a +/// state pointer. The other is a hashmap lookup based on the DFA's +/// constituent NFA states. +/// +/// A DFA state internally uses an Arc such that we only need to store the +/// set of NFA states on the heap once, even though we support looking up +/// states by two different means. A more natural way to express this might +/// use raw pointers, but an Arc is safe and effectively achieves the same +/// thing. +#[derive(Debug)] +struct StateMap { + /// The keys are not actually static but rely on always pointing to a + /// buffer in `states` which will never be moved except when clearing + /// the map or on drop, in which case the keys of this map will be + /// removed before + map: HashMap, + /// Our set of states. Note that `StatePtr / num_byte_classes` indexes + /// this Vec rather than just a `StatePtr`. + states: Vec, + /// The number of byte classes in the DFA. Used to index `states`. + num_byte_classes: usize, +} + +impl StateMap { + fn new(num_byte_classes: usize) -> StateMap { + StateMap { map: HashMap::new(), states: vec![], num_byte_classes } + } + + fn len(&self) -> usize { + self.states.len() + } + + fn is_empty(&self) -> bool { + self.states.is_empty() + } + + fn get_ptr(&self, state: &State) -> Option { + self.map.get(state).cloned() + } + + fn get_state(&self, si: StatePtr) -> Option<&State> { + self.states.get(si as usize / self.num_byte_classes) + } + + fn insert(&mut self, state: State, si: StatePtr) { + self.map.insert(state.clone(), si); + self.states.push(state); + } + + fn clear(&mut self) { + self.map.clear(); + self.states.clear(); + } +} + +impl Transitions { + /// Create a new transition table. + /// + /// The number of byte classes corresponds to the stride. Every state will + /// have `num_byte_classes` slots for transitions. + fn new(num_byte_classes: usize) -> Transitions { + Transitions { table: vec![], num_byte_classes } + } + + /// Returns the total number of states currently in this table. + fn num_states(&self) -> usize { + self.table.len() / self.num_byte_classes + } + + /// Allocates room for one additional state and returns a pointer to it. + /// + /// If there's no more room, None is returned. + fn add(&mut self) -> Option { + let si = self.table.len(); + if si > STATE_MAX as usize { + return None; + } + self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes)); + Some(usize_to_u32(si)) + } + + /// Clears the table of all states. + fn clear(&mut self) { + self.table.clear(); + } + + /// Sets the transition from (si, cls) to next. + fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) { + self.table[si as usize + cls] = next; + } + + /// Returns the transition corresponding to (si, cls). + fn next(&self, si: StatePtr, cls: usize) -> StatePtr { + self.table[si as usize + cls] + } + + /// The heap size, in bytes, of a single state in the transition table. + fn state_heap_size(&self) -> usize { + self.num_byte_classes * mem::size_of::() + } + + /// Like `next`, but uses unchecked access and is therefore not safe. + unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr { + debug_assert!((si as usize) < self.table.len()); + debug_assert!(cls < self.num_byte_classes); + *self.table.get_unchecked(si as usize + cls) + } +} + +impl StateFlags { + fn is_match(&self) -> bool { + self.0 & 0b0000_0001 > 0 + } + + fn set_match(&mut self) { + self.0 |= 0b0000_0001; + } + + fn is_word(&self) -> bool { + self.0 & 0b0000_0010 > 0 + } + + fn set_word(&mut self) { + self.0 |= 0b0000_0010; + } + + fn has_empty(&self) -> bool { + self.0 & 0b0000_0100 > 0 + } + + fn set_empty(&mut self) { + self.0 |= 0b0000_0100; + } +} + +impl Byte { + fn byte(b: u8) -> Self { + Byte(b as u16) + } + fn eof() -> Self { + Byte(256) + } + fn is_eof(&self) -> bool { + self.0 == 256 + } + + fn is_ascii_word(&self) -> bool { + let b = match self.as_byte() { + None => return false, + Some(b) => b, + }; + match b { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true, + _ => false, + } + } + + fn as_byte(&self) -> Option { + if self.is_eof() { + None + } else { + Some(self.0 as u8) + } + } +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let ips: Vec = self.inst_ptrs().collect(); + f.debug_struct("State") + .field("flags", &self.flags()) + .field("insts", &ips) + .finish() + } +} + +impl fmt::Debug for Transitions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut fmtd = f.debug_map(); + for si in 0..self.num_states() { + let s = si * self.num_byte_classes; + let e = s + self.num_byte_classes; + fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e])); + } + fmtd.finish() + } +} + +struct TransitionsRow<'a>(&'a [StatePtr]); + +impl<'a> fmt::Debug for TransitionsRow<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut fmtd = f.debug_map(); + for (b, si) in self.0.iter().enumerate() { + match *si { + STATE_UNKNOWN => {} + STATE_DEAD => { + fmtd.entry(&vb(b as usize), &"DEAD"); + } + si => { + fmtd.entry(&vb(b as usize), &si.to_string()); + } + } + } + fmtd.finish() + } +} + +impl fmt::Debug for StateFlags { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StateFlags") + .field("is_match", &self.is_match()) + .field("is_word", &self.is_word()) + .field("has_empty", &self.has_empty()) + .finish() + } +} + +/// Helper function for formatting a byte as a nice-to-read escaped string. +fn vb(b: usize) -> String { + use std::ascii::escape_default; + + if b > ::std::u8::MAX as usize { + "EOF".to_owned() + } else { + let escaped = escape_default(b as u8).collect::>(); + String::from_utf8_lossy(&escaped).into_owned() + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} + +#[allow(dead_code)] // useful for debugging +fn show_state_ptr(si: StatePtr) -> String { + let mut s = format!("{:?}", si & STATE_MAX); + if si == STATE_UNKNOWN { + s = format!("{} (unknown)", s); + } + if si == STATE_DEAD { + s = format!("{} (dead)", s); + } + if si == STATE_QUIT { + s = format!("{} (quit)", s); + } + if si & STATE_START > 0 { + s = format!("{} (start)", s); + } + if si & STATE_MATCH > 0 { + s = format!("{} (match)", s); + } + s +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_vari32(data: &mut Vec, n: i32) { + let mut un = (n as u32) << 1; + if n < 0 { + un = !un; + } + write_varu32(data, un) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_vari32(data: &[u8]) -> (i32, usize) { + let (un, i) = read_varu32(data); + let mut n = (un >> 1) as i32; + if un & 1 != 0 { + n = !n; + } + (n, i) +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn write_varu32(data: &mut Vec, mut n: u32) { + while n >= 0b1000_0000 { + data.push((n as u8) | 0b1000_0000); + n >>= 7; + } + data.push(n as u8); +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +fn read_varu32(data: &[u8]) -> (u32, usize) { + let mut n: u32 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return (n | ((b as u32) << shift), i + 1); + } + n |= ((b as u32) & 0b0111_1111) << shift; + shift += 7; + } + (0, 0) +} + +#[cfg(test)] +mod tests { + + use super::{ + push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32, + State, StateFlags, + }; + use quickcheck::{quickcheck, Gen, QuickCheck}; + use std::sync::Arc; + + #[test] + fn prop_state_encode_decode() { + fn p(mut ips: Vec, flags: u8) -> bool { + // It looks like our encoding scheme can't handle instruction + // pointers at or above 2**31. We should fix that, but it seems + // unlikely to occur in real code due to the amount of memory + // required for such a state machine. So for now, we just clamp + // our test data. + for ip in &mut ips { + if *ip >= 1 << 31 { + *ip = (1 << 31) - 1; + } + } + let mut data = vec![flags]; + let mut prev = 0; + for &ip in ips.iter() { + push_inst_ptr(&mut data, &mut prev, ip); + } + let state = State { data: Arc::from(&data[..]) }; + + let expected: Vec = + ips.into_iter().map(|ip| ip as usize).collect(); + let got: Vec = state.inst_ptrs().collect(); + expected == got && state.flags() == StateFlags(flags) + } + QuickCheck::new() + .gen(Gen::new(10_000)) + .quickcheck(p as fn(Vec, u8) -> bool); + } + + #[test] + fn prop_read_write_u32() { + fn p(n: u32) -> bool { + let mut buf = vec![]; + write_varu32(&mut buf, n); + let (got, nread) = read_varu32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(u32) -> bool); + } + + #[test] + fn prop_read_write_i32() { + fn p(n: i32) -> bool { + let mut buf = vec![]; + write_vari32(&mut buf, n); + let (got, nread) = read_vari32(&buf); + nread == buf.len() && got == n + } + quickcheck(p as fn(i32) -> bool); + } +} diff --git a/vendor/regex/src/error.rs b/vendor/regex/src/error.rs new file mode 100644 index 0000000000..3e0ec75210 --- /dev/null +++ b/vendor/regex/src/error.rs @@ -0,0 +1,71 @@ +use std::fmt; +use std::iter::repeat; + +/// An error that occurred during parsing or compiling a regular expression. +#[derive(Clone, PartialEq)] +pub enum Error { + /// A syntax error. + Syntax(String), + /// The compiled program exceeded the set size limit. + /// The argument is the size limit imposed. + CompiledTooBig(usize), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl ::std::error::Error for Error { + // TODO: Remove this method entirely on the next breaking semver release. + #[allow(deprecated)] + fn description(&self) -> &str { + match *self { + Error::Syntax(ref err) => err, + Error::CompiledTooBig(_) => "compiled program too big", + Error::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Syntax(ref err) => err.fmt(f), + Error::CompiledTooBig(limit) => write!( + f, + "Compiled regex exceeds size limit of {} bytes.", + limit + ), + Error::__Nonexhaustive => unreachable!(), + } + } +} + +// We implement our own Debug implementation so that we show nicer syntax +// errors when people use `Regex::new(...).unwrap()`. It's a little weird, +// but the `Syntax` variant is already storing a `String` anyway, so we might +// as well format it nicely. +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Error::Syntax(ref err) => { + let hr: String = repeat('~').take(79).collect(); + writeln!(f, "Syntax(")?; + writeln!(f, "{}", hr)?; + writeln!(f, "{}", err)?; + writeln!(f, "{}", hr)?; + write!(f, ")")?; + Ok(()) + } + Error::CompiledTooBig(limit) => { + f.debug_tuple("CompiledTooBig").field(&limit).finish() + } + Error::__Nonexhaustive => { + f.debug_tuple("__Nonexhaustive").finish() + } + } + } +} diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs new file mode 100644 index 0000000000..b9abcdc040 --- /dev/null +++ b/vendor/regex/src/exec.rs @@ -0,0 +1,1655 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::panic::AssertUnwindSafe; +use std::sync::Arc; + +#[cfg(feature = "perf-literal")] +use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use regex_syntax::hir::literal::Literals; +use regex_syntax::hir::Hir; +use regex_syntax::ParserBuilder; + +use crate::backtrack; +use crate::compile::Compiler; +#[cfg(feature = "perf-dfa")] +use crate::dfa; +use crate::error::Error; +use crate::input::{ByteInput, CharInput}; +use crate::literal::LiteralSearcher; +use crate::pikevm; +use crate::pool::{Pool, PoolGuard}; +use crate::prog::Program; +use crate::re_builder::RegexOptions; +use crate::re_bytes; +use crate::re_set; +use crate::re_trait::{Locations, RegularExpression, Slot}; +use crate::re_unicode; +use crate::utf8::next_utf8; + +/// `Exec` manages the execution of a regular expression. +/// +/// In particular, this manages the various compiled forms of a single regular +/// expression and the choice of which matching engine to use to execute a +/// regular expression. +#[derive(Debug)] +pub struct Exec { + /// All read only state. + ro: Arc, + /// A pool of reusable values for the various matching engines. + /// + /// Note that boxing this value is not strictly necessary, but it is an + /// easy way to ensure that T does not bloat the stack sized used by a pool + /// in the case where T is big. And this turns out to be the case at the + /// time of writing for regex's use of this pool. At the time of writing, + /// the size of a Regex on the stack is 856 bytes. Boxing this value + /// reduces that size to 16 bytes. + pool: Box>, +} + +/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This +/// means it is no longer Sync, but we can now avoid the overhead of +/// synchronization to fetch the cache. +#[derive(Debug)] +pub struct ExecNoSync<'c> { + /// All read only state. + ro: &'c Arc, + /// Caches for the various matching engines. + cache: PoolGuard<'c, ProgramCache>, +} + +/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. +#[derive(Debug)] +pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); + +/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such +/// state is determined at compile time and never changes during search. +#[derive(Debug)] +struct ExecReadOnly { + /// The original regular expressions given by the caller to compile. + res: Vec, + /// A compiled program that is used in the NFA simulation and backtracking. + /// It can be byte-based or Unicode codepoint based. + /// + /// N.B. It is not possibly to make this byte-based from the public API. + /// It is only used for testing byte based programs in the NFA simulations. + nfa: Program, + /// A compiled byte based program for DFA execution. This is only used + /// if a DFA can be executed. (Currently, only word boundary assertions are + /// not supported.) Note that this program contains an embedded `.*?` + /// preceding the first capture group, unless the regex is anchored at the + /// beginning. + dfa: Program, + /// The same as above, except the program is reversed (and there is no + /// preceding `.*?`). This is used by the DFA to find the starting location + /// of matches. + dfa_reverse: Program, + /// A set of suffix literals extracted from the regex. + /// + /// Prefix literals are stored on the `Program`, since they are used inside + /// the matching engines. + suffixes: LiteralSearcher, + /// An Aho-Corasick automaton with leftmost-first match semantics. + /// + /// This is only set when the entire regex is a simple unanchored + /// alternation of literals. We could probably use it more circumstances, + /// but this is already hacky enough in this architecture. + /// + /// N.B. We use u32 as a state ID representation under the assumption that + /// if we were to exhaust the ID space, we probably would have long + /// surpassed the compilation size limit. + #[cfg(feature = "perf-literal")] + ac: Option>, + /// match_type encodes as much upfront knowledge about how we're going to + /// execute a search as possible. + match_type: MatchType, +} + +/// Facilitates the construction of an executor by exposing various knobs +/// to control how a regex is executed and what kinds of resources it's +/// permitted to use. +// `ExecBuilder` is only public via the `internal` module, so avoid deriving +// `Debug`. +#[allow(missing_debug_implementations)] +pub struct ExecBuilder { + options: RegexOptions, + match_type: Option, + bytes: bool, + only_utf8: bool, +} + +/// Parsed represents a set of parsed regular expressions and their detected +/// literals. +struct Parsed { + exprs: Vec, + prefixes: Literals, + suffixes: Literals, + bytes: bool, +} + +impl ExecBuilder { + /// Create a regex execution builder. + /// + /// This uses default settings for everything except the regex itself, + /// which must be provided. Further knobs can be set by calling methods, + /// and then finally, `build` to actually create the executor. + pub fn new(re: &str) -> Self { + Self::new_many(&[re]) + } + + /// Like new, but compiles the union of the given regular expressions. + /// + /// Note that when compiling 2 or more regular expressions, capture groups + /// are completely unsupported. (This means both `find` and `captures` + /// won't work.) + pub fn new_many(res: I) -> Self + where + S: AsRef, + I: IntoIterator, + { + let mut opts = RegexOptions::default(); + opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); + Self::new_options(opts) + } + + /// Create a regex execution builder. + pub fn new_options(opts: RegexOptions) -> Self { + ExecBuilder { + options: opts, + match_type: None, + bytes: false, + only_utf8: true, + } + } + + /// Set the matching engine to be automatically determined. + /// + /// This is the default state and will apply whatever optimizations are + /// possible, such as running a DFA. + /// + /// This overrides whatever was previously set via the `nfa` or + /// `bounded_backtracking` methods. + pub fn automatic(mut self) -> Self { + self.match_type = None; + self + } + + /// Sets the matching engine to use the NFA algorithm no matter what + /// optimizations are possible. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `bounded_backtracking` methods. + pub fn nfa(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); + self + } + + /// Sets the matching engine to use a bounded backtracking engine no + /// matter what optimizations are possible. + /// + /// One must use this with care, since the bounded backtracking engine + /// uses memory proportion to `len(regex) * len(text)`. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `nfa` methods. + pub fn bounded_backtracking(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); + self + } + + /// Compiles byte based programs for use with the NFA matching engines. + /// + /// By default, the NFA engines match on Unicode scalar values. They can + /// be made to use byte based programs instead. In general, the byte based + /// programs are slower because of a less efficient encoding of character + /// classes. + /// + /// Note that this does not impact DFA matching engines, which always + /// execute on bytes. + pub fn bytes(mut self, yes: bool) -> Self { + self.bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.only_utf8 = yes; + self + } + + /// Set the Unicode flag. + pub fn unicode(mut self, yes: bool) -> Self { + self.options.unicode = yes; + self + } + + /// Parse the current set of patterns into their AST and extract literals. + fn parse(&self) -> Result { + let mut exprs = Vec::with_capacity(self.options.pats.len()); + let mut prefixes = Some(Literals::empty()); + let mut suffixes = Some(Literals::empty()); + let mut bytes = false; + let is_set = self.options.pats.len() > 1; + // If we're compiling a regex set and that set has any anchored + // expressions, then disable all literal optimizations. + for pat in &self.options.pats { + let mut parser = ParserBuilder::new() + .octal(self.options.octal) + .case_insensitive(self.options.case_insensitive) + .multi_line(self.options.multi_line) + .dot_matches_new_line(self.options.dot_matches_new_line) + .swap_greed(self.options.swap_greed) + .ignore_whitespace(self.options.ignore_whitespace) + .unicode(self.options.unicode) + .allow_invalid_utf8(!self.only_utf8) + .nest_limit(self.options.nest_limit) + .build(); + let expr = + parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; + bytes = bytes || !expr.is_always_utf8(); + + if cfg!(feature = "perf-literal") { + if !expr.is_anchored_start() && expr.is_any_anchored_start() { + // Partial anchors unfortunately make it hard to use + // prefixes, so disable them. + prefixes = None; + } else if is_set && expr.is_anchored_start() { + // Regex sets with anchors do not go well with literal + // optimizations. + prefixes = None; + } + prefixes = prefixes.and_then(|mut prefixes| { + if !prefixes.union_prefixes(&expr) { + None + } else { + Some(prefixes) + } + }); + + if !expr.is_anchored_end() && expr.is_any_anchored_end() { + // Partial anchors unfortunately make it hard to use + // suffixes, so disable them. + suffixes = None; + } else if is_set && expr.is_anchored_end() { + // Regex sets with anchors do not go well with literal + // optimizations. + suffixes = None; + } + suffixes = suffixes.and_then(|mut suffixes| { + if !suffixes.union_suffixes(&expr) { + None + } else { + Some(suffixes) + } + }); + } + exprs.push(expr); + } + Ok(Parsed { + exprs, + prefixes: prefixes.unwrap_or_else(Literals::empty), + suffixes: suffixes.unwrap_or_else(Literals::empty), + bytes, + }) + } + + /// Build an executor that can run a regular expression. + pub fn build(self) -> Result { + // Special case when we have no patterns to compile. + // This can happen when compiling a regex set. + if self.options.pats.is_empty() { + let ro = Arc::new(ExecReadOnly { + res: vec![], + nfa: Program::new(), + dfa: Program::new(), + dfa_reverse: Program::new(), + suffixes: LiteralSearcher::empty(), + #[cfg(feature = "perf-literal")] + ac: None, + match_type: MatchType::Nothing, + }); + let pool = ExecReadOnly::new_pool(&ro); + return Ok(Exec { ro, pool }); + } + let parsed = self.parse()?; + let mut nfa = Compiler::new() + .size_limit(self.options.size_limit) + .bytes(self.bytes || parsed.bytes) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa_reverse = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .reverse(true) + .compile(&parsed.exprs)?; + + #[cfg(feature = "perf-literal")] + let ac = self.build_aho_corasick(&parsed); + nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); + dfa.prefixes = nfa.prefixes.clone(); + dfa.dfa_size_limit = self.options.dfa_size_limit; + dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; + + let mut ro = ExecReadOnly { + res: self.options.pats, + nfa, + dfa, + dfa_reverse, + suffixes: LiteralSearcher::suffixes(parsed.suffixes), + #[cfg(feature = "perf-literal")] + ac, + match_type: MatchType::Nothing, + }; + ro.match_type = ro.choose_match_type(self.match_type); + + let ro = Arc::new(ro); + let pool = ExecReadOnly::new_pool(&ro); + Ok(Exec { ro, pool }) + } + + #[cfg(feature = "perf-literal")] + fn build_aho_corasick(&self, parsed: &Parsed) -> Option> { + if parsed.exprs.len() != 1 { + return None; + } + let lits = match alternation_literals(&parsed.exprs[0]) { + None => return None, + Some(lits) => lits, + }; + // If we have a small number of literals, then let Teddy handle + // things (see literal/mod.rs). + if lits.len() <= 32 { + return None; + } + Some( + AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .auto_configure(&lits) + .build_with_size::(&lits) + // This should never happen because we'd long exceed the + // compilation limit for regexes first. + .expect("AC automaton too big"), + ) + } +} + +impl<'c> RegularExpression for ExecNoSyncStr<'c> { + type Text = str; + + fn slots_len(&self) -> usize { + self.0.slots_len() + } + + fn next_after_empty(&self, text: &str, i: usize) -> usize { + next_utf8(text.as_bytes(), i) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &str, start: usize) -> Option { + self.0.shortest_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.is_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + self.0.find_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn captures_read_at( + &self, + locs: &mut Locations, + text: &str, + start: usize, + ) -> Option<(usize, usize)> { + self.0.captures_read_at(locs, text.as_bytes(), start) + } +} + +impl<'c> RegularExpression for ExecNoSync<'c> { + type Text = [u8]; + + /// Returns the number of capture slots in the regular expression. (There + /// are two slots for every capture group, corresponding to possibly empty + /// start and end locations of the capture.) + fn slots_len(&self) -> usize { + self.ro.nfa.captures.len() * 2 + } + + fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { + i + 1 + } + + /// Returns the end of a match location, possibly occurring before the + /// end location of the correct leftmost-first match. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &[u8], start: usize) -> Option { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).map(|(_, e)| e) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(end) => Some(end), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len() - start, + ) { + dfa::Result::Match(_) => Some(text.len()), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(e) => Some(e), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), + MatchType::Nothing => None, + } + } + + /// Returns true if and only if the regex matches text. + /// + /// For single regular expressions, this is equivalent to calling + /// shortest_match(...).is_some(). + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &[u8], start: usize) -> bool { + if !self.is_anchor_end_match(text) { + return false; + } + // We need to do this dance because shortest_match relies on the NFA + // filling in captures[1], but a RegexSet has no captures. In other + // words, a RegexSet can't (currently) use shortest_match. ---AG + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).is_some() + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len() - start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), + MatchType::Nothing => false, + } + } + + /// Finds the start and end location of the leftmost-first match, starting + /// at the given location. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => self.find_literals(ty, text, start), + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + }, + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + MatchType::Nfa(ty) => self.find_nfa(ty, text, start), + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with find") + } + } + } + + /// Finds the start and end location of the leftmost-first match and also + /// fills in all matching capture groups. + /// + /// The number of capture slots given should be equal to the total number + /// of capture slots in the compiled program. + /// + /// Note that the first two slots always correspond to the start and end + /// locations of the overall match. + fn captures_read_at( + &self, + locs: &mut Locations, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let slots = locs.as_slots(); + for slot in slots.iter_mut() { + *slot = None; + } + // If the caller unnecessarily uses this, then we try to save them + // from themselves. + match slots.len() { + 0 => return self.find_at(text, start), + 2 => { + return self.find_at(text, start).map(|(s, e)| { + slots[0] = Some(s); + slots[1] = Some(e); + (s, e) + }); + } + _ => {} // fallthrough + } + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).and_then(|(s, e)| { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ) + }) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => { + if self.ro.nfa.is_anchored_start { + self.captures_nfa(slots, text, start) + } else { + match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.captures_nfa(slots, text, start) + } + } + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + MatchType::Nfa(ty) => { + self.captures_nfa_type(ty, slots, text, start, text.len()) + } + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with captures") + } + } + } +} + +impl<'c> ExecNoSync<'c> { + /// Finds the leftmost-first match using only literal search. + #[cfg(feature = "perf-literal")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_literals( + &self, + ty: MatchLiteralType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + use self::MatchLiteralType::*; + match ty { + Unanchored => { + let lits = &self.ro.nfa.prefixes; + lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) + } + AnchoredStart => { + let lits = &self.ro.nfa.prefixes; + if start == 0 || !self.ro.nfa.is_anchored_start { + lits.find_start(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } else { + None + } + } + AnchoredEnd => { + let lits = &self.ro.suffixes; + lits.find_end(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } + AhoCorasick => self + .ro + .ac + .as_ref() + .unwrap() + .find(&text[start..]) + .map(|m| (start + m.start(), start + m.end())), + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_forward( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + let end = match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + start, + ) { + NoMatch(i) => return NoMatch(i), + Quit => return Quit, + Match(end) if start == end => return Match((start, start)), + Match(end) => end, + }; + // Now run the DFA in reverse to find the start of the match. + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + end - start, + ) { + Match(s) => Match((start + s, end)), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA, + /// but assumes the regex is anchored at the end and therefore starts at + /// the end of the regex and matches in reverse. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_anchored_reverse( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + text.len() - start, + ) { + Match(s) => Match((start + s, text.len())), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the end of the shortest match using only the DFA. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result { + dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result { + match self.exec_dfa_reverse_suffix(text, start) { + None => self.shortest_dfa(text, start), + Some(r) => r.map(|(_, end)| end), + } + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. It also reports the start of the match. + /// + /// Note that if None is returned, then the optimization gave up to avoid + /// worst case quadratic behavior. A forward scanning DFA should be tried + /// next. + /// + /// If a match is returned and the full leftmost-first match is desired, + /// then a forward scan starting from the beginning of the match must be + /// done. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_dfa_reverse_suffix( + &self, + text: &[u8], + original_start: usize, + ) -> Option> { + use crate::dfa::Result::*; + + let lcs = self.ro.suffixes.lcs(); + debug_assert!(lcs.len() >= 1); + let mut start = original_start; + let mut end = start; + let mut last_literal = start; + while end <= text.len() { + last_literal += match lcs.find(&text[last_literal..]) { + None => return Some(NoMatch(text.len())), + Some(i) => i, + }; + end = last_literal + lcs.len(); + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..end], + end - start, + ) { + Match(0) | NoMatch(0) => return None, + Match(i) => return Some(Match((start + i, end))), + NoMatch(i) => { + start += i; + last_literal += 1; + continue; + } + Quit => return Some(Quit), + }; + } + Some(NoMatch(text.len())) + } + + /// Finds the leftmost-first match (start and end) using only the DFA + /// by scanning for suffix literals. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + + let match_start = match self.exec_dfa_reverse_suffix(text, start) { + None => return self.find_dfa_forward(text, start), + Some(Match((start, _))) => start, + Some(r) => return r, + }; + // At this point, we've found a match. The only way to quit now + // without a match is if the DFA gives up (seems unlikely). + // + // Now run the DFA forwards to find the proper end of the match. + // (The suffix literal match can only indicate the earliest + // possible end location, which may appear before the end of the + // leftmost-first match.) + match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + match_start, + ) { + NoMatch(_) => panic!("BUG: reverse match implies forward match"), + Quit => Quit, + Match(e) => Match((match_start, e)), + } + } + + /// Executes the NFA engine to return whether there is a match or not. + /// + /// Ideally, we could use shortest_nfa(...).is_some() and get the same + /// performance characteristics, but regex sets don't have captures, which + /// shortest_nfa depends on. + #[cfg(feature = "perf-dfa")] + fn match_nfa(&self, text: &[u8], start: usize) -> bool { + self.match_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like match_nfa, but allows specification of the type of NFA engine. + fn match_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> bool { + self.exec_nfa( + ty, + &mut [false], + &mut [], + true, + false, + text, + start, + text.len(), + ) + } + + /// Finds the shortest match using an NFA. + #[cfg(feature = "perf-dfa")] + fn shortest_nfa(&self, text: &[u8], start: usize) -> Option { + self.shortest_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like shortest_nfa, but allows specification of the type of NFA engine. + fn shortest_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + true, + true, + text, + start, + text.len(), + ) { + slots[1] + } else { + None + } + } + + /// Like find, but executes an NFA engine. + fn find_nfa( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + false, + false, + text, + start, + text.len(), + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + /// Like find_nfa, but fills in captures. + /// + /// `slots` should have length equal to `2 * nfa.captures.len()`. + #[cfg(feature = "perf-dfa")] + fn captures_nfa( + &self, + slots: &mut [Slot], + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + start, + text.len(), + ) + } + + /// Like captures_nfa, but allows specification of type of NFA engine. + fn captures_nfa_type( + &self, + ty: MatchNfaType, + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> Option<(usize, usize)> { + if self.exec_nfa( + ty, + &mut [false], + slots, + false, + false, + text, + start, + end, + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + fn exec_nfa( + &self, + mut ty: MatchNfaType, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + quit_after_match_with_pos: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + use self::MatchNfaType::*; + if let Auto = ty { + if backtrack::should_exec(self.ro.nfa.len(), text.len()) { + ty = Backtrack; + } else { + ty = PikeVM; + } + } + // The backtracker can't return the shortest match position as it is + // implemented today. So if someone calls `shortest_match` and we need + // to run an NFA, then use the PikeVM. + if quit_after_match_with_pos || ty == PikeVM { + self.exec_pikevm( + matches, + slots, + quit_after_match, + text, + start, + end, + ) + } else { + self.exec_backtrack(matches, slots, text, start, end) + } + } + + /// Always run the NFA algorithm. + fn exec_pikevm( + &self, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + CharInput::new(text), + start, + end, + ) + } + } + + /// Always runs the NFA using bounded backtracking. + fn exec_backtrack( + &self, + matches: &mut [bool], + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + CharInput::new(text), + start, + end, + ) + } + } + + /// Finds which regular expressions match the given text. + /// + /// `matches` should have length equal to the number of regexes being + /// searched. + /// + /// This is only useful when one wants to know which regexes in a set + /// match some text. + pub fn many_matches_at( + &self, + matches: &mut [bool], + text: &[u8], + start: usize, + ) -> bool { + use self::MatchType::*; + if !self.is_anchor_end_match(text) { + return false; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + Literal(ty) => { + debug_assert_eq!(matches.len(), 1); + matches[0] = self.find_literals(ty, text, start).is_some(); + matches[0] + } + #[cfg(feature = "perf-dfa")] + Dfa | DfaAnchoredReverse | DfaMany => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + Nfa(ty) => self.exec_nfa( + ty, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + Nothing => false, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_anchor_end_match(&self, text: &[u8]) -> bool { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { + true + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { + // Only do this check if the haystack is big (>1MB). + if text.len() > (1 << 20) && ro.nfa.is_anchored_end { + let lcs = ro.suffixes.lcs(); + if lcs.len() >= 1 && !lcs.is_suffix(text) { + return false; + } + } + true + } + + imp(&self.ro, text) + } + + pub fn capture_name_idx(&self) -> &Arc> { + &self.ro.nfa.capture_name_idx + } +} + +impl<'c> ExecNoSyncStr<'c> { + pub fn capture_name_idx(&self) -> &Arc> { + self.0.capture_name_idx() + } +} + +impl Exec { + /// Get a searcher that isn't Sync. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher(&self) -> ExecNoSync<'_> { + ExecNoSync { + ro: &self.ro, // a clone is too expensive here! (and not needed) + cache: self.pool.get(), + } + } + + /// Get a searcher that isn't Sync and can match on &str. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { + ExecNoSyncStr(self.searcher()) + } + + /// Build a Regex from this executor. + pub fn into_regex(self) -> re_unicode::Regex { + re_unicode::Regex::from(self) + } + + /// Build a RegexSet from this executor. + pub fn into_regex_set(self) -> re_set::unicode::RegexSet { + re_set::unicode::RegexSet::from(self) + } + + /// Build a Regex from this executor that can match arbitrary bytes. + pub fn into_byte_regex(self) -> re_bytes::Regex { + re_bytes::Regex::from(self) + } + + /// Build a RegexSet from this executor that can match arbitrary bytes. + pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { + re_set::bytes::RegexSet::from(self) + } + + /// The original regular expressions given by the caller that were + /// compiled. + pub fn regex_strings(&self) -> &[String] { + &self.ro.res + } + + /// Return a slice of capture names. + /// + /// Any capture that isn't named is None. + pub fn capture_names(&self) -> &[Option] { + &self.ro.nfa.captures + } + + /// Return a reference to named groups mapping (from group name to + /// group position). + pub fn capture_name_idx(&self) -> &Arc> { + &self.ro.nfa.capture_name_idx + } +} + +impl Clone for Exec { + fn clone(&self) -> Exec { + let pool = ExecReadOnly::new_pool(&self.ro); + Exec { ro: self.ro.clone(), pool } + } +} + +impl ExecReadOnly { + fn choose_match_type(&self, hint: Option) -> MatchType { + if let Some(MatchType::Nfa(_)) = hint { + return hint.unwrap(); + } + // If the NFA is empty, then we'll never match anything. + if self.nfa.insts.is_empty() { + return MatchType::Nothing; + } + if let Some(literalty) = self.choose_literal_match_type() { + return literalty; + } + if let Some(dfaty) = self.choose_dfa_match_type() { + return dfaty; + } + // We're so totally hosed. + MatchType::Nfa(MatchNfaType::Auto) + } + + /// If a plain literal scan can be used, then a corresponding literal + /// search type is returned. + fn choose_literal_match_type(&self) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly) -> Option { + None + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly) -> Option { + // If our set of prefixes is complete, then we can use it to find + // a match in lieu of a regex engine. This doesn't quite work well + // in the presence of multiple regexes, so only do it when there's + // one. + // + // TODO(burntsushi): Also, don't try to match literals if the regex + // is partially anchored. We could technically do it, but we'd need + // to create two sets of literals: all of them and then the subset + // that aren't anchored. We would then only search for all of them + // when at the beginning of the input and use the subset in all + // other cases. + if ro.res.len() != 1 { + return None; + } + if ro.ac.is_some() { + return Some(MatchType::Literal( + MatchLiteralType::AhoCorasick, + )); + } + if ro.nfa.prefixes.complete() { + return if ro.nfa.is_anchored_start { + Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) + } else { + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + if ro.suffixes.complete() { + return if ro.nfa.is_anchored_end { + Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) + } else { + // This case shouldn't happen. When the regex isn't + // anchored, then complete prefixes should imply complete + // suffixes. + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + None + } + + imp(self) + } + + /// If a DFA scan can be used, then choose the appropriate DFA strategy. + fn choose_dfa_match_type(&self) -> Option { + #[cfg(not(feature = "perf-dfa"))] + fn imp(_: &ExecReadOnly) -> Option { + None + } + + #[cfg(feature = "perf-dfa")] + fn imp(ro: &ExecReadOnly) -> Option { + if !dfa::can_exec(&ro.dfa) { + return None; + } + // Regex sets require a slightly specialized path. + if ro.res.len() >= 2 { + return Some(MatchType::DfaMany); + } + // If the regex is anchored at the end but not the start, then + // just match in reverse from the end of the haystack. + if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { + return Some(MatchType::DfaAnchoredReverse); + } + #[cfg(feature = "perf-literal")] + { + // If there's a longish suffix literal, then it might be faster + // to look for that first. + if ro.should_suffix_scan() { + return Some(MatchType::DfaSuffix); + } + } + // Fall back to your garden variety forward searching lazy DFA. + Some(MatchType::Dfa) + } + + imp(self) + } + + /// Returns true if the program is amenable to suffix scanning. + /// + /// When this is true, as a heuristic, we assume it is OK to quickly scan + /// for suffix literals and then do a *reverse* DFA match from any matches + /// produced by the literal scan. (And then followed by a forward DFA + /// search, since the previously found suffix literal maybe not actually be + /// the end of a match.) + /// + /// This is a bit of a specialized optimization, but can result in pretty + /// big performance wins if 1) there are no prefix literals and 2) the + /// suffix literals are pretty rare in the text. (1) is obviously easy to + /// account for but (2) is harder. As a proxy, we assume that longer + /// strings are generally rarer, so we only enable this optimization when + /// we have a meaty suffix. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + fn should_suffix_scan(&self) -> bool { + if self.suffixes.is_empty() { + return false; + } + let lcs_len = self.suffixes.lcs().char_len(); + lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() + } + + fn new_pool(ro: &Arc) -> Box> { + let ro = ro.clone(); + Box::new(Pool::new(Box::new(move || { + AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) + }))) + } +} + +#[derive(Clone, Copy, Debug)] +enum MatchType { + /// A single or multiple literal search. This is only used when the regex + /// can be decomposed into a literal search. + #[cfg(feature = "perf-literal")] + Literal(MatchLiteralType), + /// A normal DFA search. + #[cfg(feature = "perf-dfa")] + Dfa, + /// A reverse DFA search starting from the end of a haystack. + #[cfg(feature = "perf-dfa")] + DfaAnchoredReverse, + /// A reverse DFA search with suffix literal scanning. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix, + /// Use the DFA on two or more regular expressions. + #[cfg(feature = "perf-dfa")] + DfaMany, + /// An NFA variant. + Nfa(MatchNfaType), + /// No match is ever possible, so don't ever try to search. + Nothing, +} + +#[derive(Clone, Copy, Debug)] +#[cfg(feature = "perf-literal")] +enum MatchLiteralType { + /// Match literals anywhere in text. + Unanchored, + /// Match literals only at the start of text. + AnchoredStart, + /// Match literals only at the end of text. + AnchoredEnd, + /// Use an Aho-Corasick automaton. This requires `ac` to be Some on + /// ExecReadOnly. + AhoCorasick, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MatchNfaType { + /// Choose between Backtrack and PikeVM. + Auto, + /// NFA bounded backtracking. + /// + /// (This is only set by tests, since it never makes sense to always want + /// backtracking.) + Backtrack, + /// The Pike VM. + /// + /// (This is only set by tests, since it never makes sense to always want + /// the Pike VM.) + PikeVM, +} + +/// `ProgramCache` maintains reusable allocations for each matching engine +/// available to a particular program. +/// +/// We declare this as unwind safe since it's a cache that's only used for +/// performance purposes. If a panic occurs, it is (or should be) always safe +/// to continue using the same regex object. +pub type ProgramCache = AssertUnwindSafe>; + +#[derive(Debug)] +pub struct ProgramCacheInner { + pub pikevm: pikevm::Cache, + pub backtrack: backtrack::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa: dfa::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa_reverse: dfa::Cache, +} + +impl ProgramCacheInner { + fn new(ro: &ExecReadOnly) -> Self { + ProgramCacheInner { + pikevm: pikevm::Cache::new(&ro.nfa), + backtrack: backtrack::Cache::new(&ro.nfa), + #[cfg(feature = "perf-dfa")] + dfa: dfa::Cache::new(&ro.dfa), + #[cfg(feature = "perf-dfa")] + dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), + } + } +} + +/// Alternation literals checks if the given HIR is a simple alternation of +/// literals, and if so, returns them. Otherwise, this returns None. +#[cfg(feature = "perf-literal")] +fn alternation_literals(expr: &Hir) -> Option>> { + use regex_syntax::hir::{HirKind, Literal}; + + // This is pretty hacky, but basically, if `is_alternation_literal` is + // true, then we can make several assumptions about the structure of our + // HIR. This is what justifies the `unreachable!` statements below. + // + // This code should be refactored once we overhaul this crate's + // optimization pipeline, because this is a terribly inflexible way to go + // about things. + + if !expr.is_alternation_literal() { + return None; + } + let alts = match *expr.kind() { + HirKind::Alternation(ref alts) => alts, + _ => return None, // one literal isn't worth it + }; + + let extendlit = |lit: &Literal, dst: &mut Vec| match *lit { + Literal::Unicode(c) => { + let mut buf = [0; 4]; + dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); + } + Literal::Byte(b) => { + dst.push(b); + } + }; + + let mut lits = vec![]; + for alt in alts { + let mut lit = vec![]; + match *alt.kind() { + HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Concat(ref exprs) => { + for e in exprs { + match *e.kind() { + HirKind::Literal(ref x) => extendlit(x, &mut lit), + _ => unreachable!("expected literal, got {:?}", e), + } + } + } + _ => unreachable!("expected literal or concat, got {:?}", alt), + } + lits.push(lit); + } + Some(lits) +} + +#[cfg(test)] +mod test { + #[test] + fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new("^S") + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new("^S") + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = vec![83, 83]; + + let s1 = backtrack_bytes_re.split(&input); + let s2 = default_bytes_re.split(&input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } + + #[test] + fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bounded_backtracking() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = "**"; + + let s1 = backtrack_bytes_re.split(input); + let s2 = default_bytes_re.split(input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } +} diff --git a/vendor/regex/src/expand.rs b/vendor/regex/src/expand.rs new file mode 100644 index 0000000000..67b514926a --- /dev/null +++ b/vendor/regex/src/expand.rs @@ -0,0 +1,239 @@ +use std::str; + +use crate::find_byte::find_byte; + +use crate::re_bytes; +use crate::re_unicode; + +pub fn expand_str( + caps: &re_unicode::Captures<'_>, + mut replacement: &str, + dst: &mut String, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement.as_bytes()) { + None => break, + Some(i) => { + dst.push_str(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { + dst.push_str("$"); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement.as_bytes()) { + Some(cap_ref) => cap_ref, + None => { + dst.push_str("$"); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or("")); + } + Ref::Named(name) => { + dst.push_str( + caps.name(name).map(|m| m.as_str()).unwrap_or(""), + ); + } + } + } + dst.push_str(replacement); +} + +pub fn expand_bytes( + caps: &re_bytes::Captures<'_>, + mut replacement: &[u8], + dst: &mut Vec, +) { + while !replacement.is_empty() { + match find_byte(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => { + dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); + } + Ref::Named(name) => { + dst.extend( + caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""), + ); + } + } + } + dst.extend(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text following the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref(replacement: &[u8]) -> Option> { + let mut i = 0; + let rep: &[u8] = replacement; + if rep.len() <= 1 || rep[0] != b'$' { + return None; + } + i += 1; + if rep[i] == b'{' { + return find_cap_ref_braced(rep, i + 1); + } + let mut cap_end = i; + while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check via an unchecked conversion or by parsing the number straight from + // &[u8]. + let cap = + str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name. +fn is_valid_cap_letter(b: u8) -> bool { + match b { + b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{find_cap_ref, CaptureRef}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + find!(find_cap_ref14, "$1-$2", c!(1, 2)); + find!(find_cap_ref15, "$1_$2", c!("1_", 3)); + find!(find_cap_ref16, "$x-$y", c!("x", 2)); + find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); +} diff --git a/vendor/regex/src/find_byte.rs b/vendor/regex/src/find_byte.rs new file mode 100644 index 0000000000..e95f72afb9 --- /dev/null +++ b/vendor/regex/src/find_byte.rs @@ -0,0 +1,18 @@ +/// Searches for the given needle in the given haystack. +/// +/// If the perf-literal feature is enabled, then this uses the super optimized +/// memchr crate. Otherwise, it uses the naive byte-at-a-time implementation. +pub fn find_byte(needle: u8, haystack: &[u8]) -> Option { + #[cfg(not(feature = "perf-literal"))] + fn imp(needle: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == needle) + } + + #[cfg(feature = "perf-literal")] + fn imp(needle: u8, haystack: &[u8]) -> Option { + use memchr::memchr; + memchr(needle, haystack) + } + + imp(needle, haystack) +} diff --git a/vendor/regex/src/freqs.rs b/vendor/regex/src/freqs.rs new file mode 100644 index 0000000000..fcffa95fb5 --- /dev/null +++ b/vendor/regex/src/freqs.rs @@ -0,0 +1,261 @@ +// NOTE: The following code was generated by "scripts/frequencies.py", do not +// edit directly + +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/vendor/regex/src/input.rs b/vendor/regex/src/input.rs new file mode 100644 index 0000000000..df6c3e0c91 --- /dev/null +++ b/vendor/regex/src/input.rs @@ -0,0 +1,432 @@ +use std::char; +use std::cmp::Ordering; +use std::fmt; +use std::ops; +use std::u32; + +use crate::literal::LiteralSearcher; +use crate::prog::InstEmptyLook; +use crate::utf8::{decode_last_utf8, decode_utf8}; + +/// Represents a location in the input. +#[derive(Clone, Copy, Debug)] +pub struct InputAt { + pos: usize, + c: Char, + byte: Option, + len: usize, +} + +impl InputAt { + /// Returns true iff this position is at the beginning of the input. + pub fn is_start(&self) -> bool { + self.pos == 0 + } + + /// Returns true iff this position is past the end of the input. + pub fn is_end(&self) -> bool { + self.c.is_none() && self.byte.is_none() + } + + /// Returns the character at this position. + /// + /// If this position is just before or after the input, then an absent + /// character is returned. + pub fn char(&self) -> Char { + self.c + } + + /// Returns the byte at this position. + pub fn byte(&self) -> Option { + self.byte + } + + /// Returns the UTF-8 width of the character at this position. + pub fn len(&self) -> usize { + self.len + } + + /// Returns whether the UTF-8 width of the character at this position + /// is zero. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the byte offset of this position. + pub fn pos(&self) -> usize { + self.pos + } + + /// Returns the byte offset of the next position in the input. + pub fn next_pos(&self) -> usize { + self.pos + self.len + } +} + +/// An abstraction over input used in the matching engines. +pub trait Input: fmt::Debug { + /// Return an encoding of the position at byte offset `i`. + fn at(&self, i: usize) -> InputAt; + + /// Return the Unicode character occurring next to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn next_char(&self, at: InputAt) -> Char; + + /// Return the Unicode character occurring previous to `at`. + /// + /// If no such character could be decoded, then `Char` is absent. + fn previous_char(&self, at: InputAt) -> Char; + + /// Return true if the given empty width instruction matches at the + /// input position given. + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + + /// Scan the input for a matching prefix. + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option; + + /// The number of bytes in the input. + fn len(&self) -> usize; + + /// Whether the input is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the given input as a sequence of bytes. + fn as_bytes(&self) -> &[u8]; +} + +impl<'a, T: Input> Input for &'a T { + fn at(&self, i: usize) -> InputAt { + (**self).at(i) + } + + fn next_char(&self, at: InputAt) -> Char { + (**self).next_char(at) + } + + fn previous_char(&self, at: InputAt) -> Char { + (**self).previous_char(at) + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + (**self).is_empty_match(at, empty) + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { + (**self).prefix_at(prefixes, at) + } + + fn len(&self) -> usize { + (**self).len() + } + + fn as_bytes(&self) -> &[u8] { + (**self).as_bytes() + } +} + +/// An input reader over characters. +#[derive(Clone, Copy, Debug)] +pub struct CharInput<'t>(&'t [u8]); + +impl<'t> CharInput<'t> { + /// Return a new character input reader for the given string. + pub fn new(s: &'t [u8]) -> CharInput<'t> { + CharInput(s) + } +} + +impl<'t> ops::Deref for CharInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.0 + } +} + +impl<'t> Input for CharInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); + InputAt { pos: i, c, byte: None, len: c.len_utf8() } + } + } + + fn next_char(&self, at: InputAt) -> Char { + at.char() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.0.len() + } + + fn as_bytes(&self) -> &[u8] { + self.0 + } +} + +/// An input reader over bytes. +#[derive(Clone, Copy, Debug)] +pub struct ByteInput<'t> { + text: &'t [u8], + only_utf8: bool, +} + +impl<'t> ByteInput<'t> { + /// Return a new byte-based input reader for the given string. + pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { + ByteInput { text, only_utf8 } + } +} + +impl<'t> ops::Deref for ByteInput<'t> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + self.text + } +} + +impl<'t> Input for ByteInput<'t> { + fn at(&self, i: usize) -> InputAt { + if i >= self.len() { + InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } + } else { + InputAt { + pos: i, + c: None.into(), + byte: self.get(i).cloned(), + len: 1, + } + } + } + + fn next_char(&self, at: InputAt) -> Char { + decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() + } + + fn previous_char(&self, at: InputAt) -> Char { + decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() + } + + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use crate::prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + at.pos() == 0 || c == '\n' + } + EndLine => { + let c = self.next_char(at); + at.pos() == self.len() || c == '\n' + } + StartText => at.pos() == 0, + EndText => at.pos() == self.len(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() == c2.is_word_byte() + } + } + } + + fn prefix_at( + &self, + prefixes: &LiteralSearcher, + at: InputAt, + ) -> Option { + prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) + } + + fn len(&self) -> usize { + self.text.len() + } + + fn as_bytes(&self) -> &[u8] { + self.text + } +} + +/// An inline representation of `Option`. +/// +/// This eliminates the need to do case analysis on `Option` to determine +/// ordinality with other characters. +/// +/// (The `Option` is not related to encoding. Instead, it is used in the +/// matching engines to represent the beginning and ending boundaries of the +/// search text.) +#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Char(u32); + +impl fmt::Debug for Char { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match char::from_u32(self.0) { + None => write!(f, "Empty"), + Some(c) => write!(f, "{:?}", c), + } + } +} + +impl Char { + /// Returns true iff the character is absent. + #[inline] + pub fn is_none(self) -> bool { + self.0 == u32::MAX + } + + /// Returns the length of the character's UTF-8 encoding. + /// + /// If the character is absent, then `1` is returned. + #[inline] + pub fn len_utf8(self) -> usize { + char::from_u32(self.0).map_or(1, |c| c.len_utf8()) + } + + /// Returns true iff the character is a word character. + /// + /// If the character is absent, then false is returned. + pub fn is_word_char(self) -> bool { + // is_word_character can panic if the Unicode data for \w isn't + // available. However, our compiler ensures that if a Unicode word + // boundary is used, then the data must also be available. If it isn't, + // then the compiler returns an error. + char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) + } + + /// Returns true iff the byte is a word byte. + /// + /// If the byte is absent, then false is returned. + pub fn is_word_byte(self) -> bool { + match char::from_u32(self.0) { + Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), + None | Some(_) => false, + } + } +} + +impl From for Char { + fn from(c: char) -> Char { + Char(c as u32) + } +} + +impl From> for Char { + fn from(c: Option) -> Char { + c.map_or(Char(u32::MAX), |c| c.into()) + } +} + +impl PartialEq for Char { + #[inline] + fn eq(&self, other: &char) -> bool { + self.0 == *other as u32 + } +} + +impl PartialEq for char { + #[inline] + fn eq(&self, other: &Char) -> bool { + *self as u32 == other.0 + } +} + +impl PartialOrd for Char { + #[inline] + fn partial_cmp(&self, other: &char) -> Option { + self.0.partial_cmp(&(*other as u32)) + } +} + +impl PartialOrd for char { + #[inline] + fn partial_cmp(&self, other: &Char) -> Option { + (*self as u32).partial_cmp(&other.0) + } +} diff --git a/vendor/regex/src/lib.rs b/vendor/regex/src/lib.rs new file mode 100644 index 0000000000..6b95739c5c --- /dev/null +++ b/vendor/regex/src/lib.rs @@ -0,0 +1,769 @@ +/*! +This crate provides a library for parsing, compiling, and executing regular +expressions. Its syntax is similar to Perl-style regular expressions, but lacks +a few features like look around and backreferences. In exchange, all searches +execute in linear time with respect to the size of the regular expression and +search text. + +This crate's documentation provides some simple examples, describes +[Unicode support](#unicode) and exhaustively lists the +[supported syntax](#syntax). + +For more specific details on the API for regular expressions, please see the +documentation for the [`Regex`](struct.Regex.html) type. + +# Usage + +This crate is [on crates.io](https://crates.io/crates/regex) and can be +used by adding `regex` to your dependencies in your project's `Cargo.toml`. + +```toml +[dependencies] +regex = "1" +``` + +# Example: find a date + +General use of regular expressions in this package involves compiling an +expression and then using it to search, split or replace text. For example, +to confirm that some text resembles a date: + +```rust +use regex::Regex; +let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +assert!(re.is_match("2014-01-01")); +``` + +Notice the use of the `^` and `$` anchors. In this crate, every expression +is executed with an implicit `.*?` at the beginning and end, which allows +it to match anywhere in the text. Anchors can be used to ensure that the +full text matches an expression. + +This example also demonstrates the utility of +[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals) +in Rust, which +are just like regular strings except they are prefixed with an `r` and do +not process any escape sequences. For example, `"\\d"` is the same +expression as `r"\d"`. + +# Example: Avoid compiling the same regex in a loop + +It is an anti-pattern to compile the same regular expression in a loop +since compilation is typically expensive. (It takes anywhere from a few +microseconds to a few **milliseconds** depending on the size of the +regex.) Not only is compilation itself expensive, but this also prevents +optimizations that reuse allocations internally to the matching engines. + +In Rust, it can sometimes be a pain to pass regular expressions around if +they're used from inside a helper function. Instead, we recommend using the +[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that +regular expressions are compiled exactly once. + +For example: + +```rust +use lazy_static::lazy_static; +use regex::Regex; + +fn some_helper_function(text: &str) -> bool { + lazy_static! { + static ref RE: Regex = Regex::new("...").unwrap(); + } + RE.is_match(text) +} + +fn main() {} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previous compilation. + +# Example: iterating over capture groups + +This crate provides convenient iterators for matching an expression +repeatedly against a search string to find successive non-overlapping +matches. For example, to find all dates in a string and be able to access +them by their component pieces: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); +let text = "2012-03-14, 2013-01-01 and 2014-07-05"; +for cap in re.captures_iter(text) { + println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); +} +// Output: +// Month: 03 Day: 14 Year: 2012 +// Month: 01 Day: 01 Year: 2013 +// Month: 07 Day: 05 Year: 2014 +# } +``` + +Notice that the year is in the capture group indexed at `1`. This is +because the *entire match* is stored in the capture group at index `0`. + +# Example: replacement with named capture groups + +Building on the previous example, perhaps we'd like to rearrange the date +formats. This can be done with text replacement. But to make the code +clearer, we can *name* our capture groups and use those names as variables +in our replacement text: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?P\d{4})-(?P\d{2})-(?P\d{2})").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +The `replace` methods are actually polymorphic in the replacement, which +provides more flexibility than is seen here. (See the documentation for +`Regex::replace` for more details.) + +Note that if your regex gets complicated, you can use the `x` flag to +enable insignificant whitespace mode, which also lets you write comments: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?x) + (?P\d{4}) # the year + - + (?P\d{2}) # the month + - + (?P\d{2}) # the day +").unwrap(); +let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +let after = re.replace_all(before, "$m/$d/$y"); +assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); +# } +``` + +If you wish to match against whitespace in this mode, you can still use `\s`, +`\n`, `\t`, etc. For escaping a single space character, you can escape it +directly with `\ `, use its hex character code `\x20` or temporarily disable +the `x` flag, e.g., `(?-x: )`. + +# Example: match multiple regular expressions simultaneously + +This demonstrates how to use a `RegexSet` to match multiple (possibly +overlapping) regular expressions in a single scan of the search text: + +```rust +use regex::RegexSet; + +let set = RegexSet::new(&[ + r"\w+", + r"\d+", + r"\pL+", + r"foo", + r"bar", + r"barfoo", + r"foobar", +]).unwrap(); + +// Iterate over and collect all of the matches. +let matches: Vec<_> = set.matches("foobar").into_iter().collect(); +assert_eq!(matches, vec![0, 2, 3, 4, 6]); + +// You can also test whether a particular regex matched: +let matches = set.matches("foobar"); +assert!(!matches.matched(5)); +assert!(matches.matched(6)); +``` + +# Pay for what you use + +With respect to searching text with a regular expression, there are three +questions that can be asked: + +1. Does the text match this expression? +2. If so, where does it match? +3. Where did the capturing groups match? + +Generally speaking, this crate could provide a function to answer only #3, +which would subsume #1 and #2 automatically. However, it can be significantly +more expensive to compute the location of capturing group matches, so it's best +not to do it if you don't need to. + +Therefore, only use what you need. For example, don't use `find` if you +only need to test if an expression matches a string. (Use `is_match` +instead.) + +# Unicode + +This implementation executes regular expressions **only** on valid UTF-8 +while exposing match locations as byte indices into the search string. (To +relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) + +Only simple case folding is supported. Namely, when matching +case-insensitively, the characters are first mapped using the "simple" case +folding rules defined by Unicode. + +Regular expressions themselves are **only** interpreted as a sequence of +Unicode scalar values. This means you can use Unicode characters directly +in your expression: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)Δ+").unwrap(); +let mat = re.find("ΔδΔ").unwrap(); +assert_eq!((mat.start(), mat.end()), (0, 6)); +# } +``` + +Most features of the regular expressions in this crate are Unicode aware. Here +are some examples: + +* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`. + (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.) +* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms + of whitespace categorized by Unicode. +* `\b` matches a Unicode word boundary. +* Negated character classes like `[^a]` match all Unicode scalar values except + for `a`. +* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only + recognize `\n` and not any of the other forms of line terminators defined + by Unicode. + +Unicode general categories, scripts, script extensions, ages and a smattering +of boolean properties are available as character classes. For example, you can +match a sequence of numerals, Greek or Cherokee letters: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); +let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +assert_eq!((mat.start(), mat.end()), (3, 23)); +# } +``` + +For a more detailed breakdown of Unicode support with respect to +[UTS#18](https://unicode.org/reports/tr18/), +please see the +[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md) +document in the root of the regex repository. + +# Opt out of Unicode support + +The `bytes` sub-module provides a `Regex` type that can be used to match +on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with +the main `Regex` type. However, this behavior can be disabled by turning +off the `u` flag, even if doing so could result in matching invalid UTF-8. +For example, when the `u` flag is disabled, `.` will match any byte instead +of any Unicode scalar value. + +Disabling the `u` flag is also possible with the standard `&str`-based `Regex` +type, but it is only allowed where the UTF-8 invariant is maintained. For +example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an +`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte +`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based +regexes. + +Finally, since Unicode support requires bundling large Unicode data +tables, this crate exposes knobs to disable the compilation of those +data tables, which can be useful for shrinking binary size and reducing +compilation times. For details on how to do that, see the section on [crate +features](#crate-features). + +# Syntax + +The syntax supported in this crate is documented below. + +Note that the regular expression parser and abstract syntax are exposed in +a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). + +## Matching one character + +
+.             any character except new line (includes new line with s flag)
+\d            digit (\p{Nd})
+\D            not digit
+\pN           One-letter name Unicode character class
+\p{Greek}     Unicode character class (general category or script)
+\PN           Negated one-letter name Unicode character class
+\P{Greek}     negated Unicode character class (general category or script)
+
+ +### Character classes + +
+[xyz]         A character class matching either x, y or z (union).
+[^xyz]        A character class matching any character except x, y and z.
+[a-z]         A character class matching any character in range a-z.
+[[:alpha:]]   ASCII character class ([A-Za-z])
+[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
+[x[^xyz]]     Nested/grouping character class (matching any character except y and z)
+[a-y&&xyz]    Intersection (matching x or y)
+[0-9&&[^4]]   Subtraction using intersection and negation (matching 0-9 except 4)
+[0-9--4]      Direct subtraction (matching 0-9 except 4)
+[a-g~~b-h]    Symmetric difference (matching `a` and `h` only)
+[\[\]]        Escaping in character classes (matching [ or ])
+
+ +Any named character class may appear inside a bracketed `[...]` character +class. For example, `[\p{Greek}[:digit:]]` matches any Greek or ASCII +digit. `[\p{Greek}&&\pL]` matches Greek letters. + +Precedence in character classes, from most binding to least: + +1. Ranges: `a-cd` == `[a-c]d` +2. Union: `ab&&bc` == `[ab]&&[bc]` +3. Intersection: `^a-z&&b` == `^[a-z&&b]` +4. Negation + +## Composites + +
+xy    concatenation (x followed by y)
+x|y   alternation (x or y, prefer x)
+
+ +## Repetitions + +
+x*        zero or more of x (greedy)
+x+        one or more of x (greedy)
+x?        zero or one of x (greedy)
+x*?       zero or more of x (ungreedy/lazy)
+x+?       one or more of x (ungreedy/lazy)
+x??       zero or one of x (ungreedy/lazy)
+x{n,m}    at least n x and at most m x (greedy)
+x{n,}     at least n x (greedy)
+x{n}      exactly n x
+x{n,m}?   at least n x and at most m x (ungreedy/lazy)
+x{n,}?    at least n x (ungreedy/lazy)
+x{n}?     exactly n x
+
+ +## Empty matches + +
+^     the beginning of text (or start-of-line with multi-line mode)
+$     the end of text (or end-of-line with multi-line mode)
+\A    only the beginning of text (even with multi-line mode enabled)
+\z    only the end of text (even with multi-line mode enabled)
+\b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
+\B    not a Unicode word boundary
+
+ +The empty regex is valid and matches the empty string. For example, the empty +regex matches `abc` at positions `0`, `1`, `2` and `3`. + +## Grouping and flags + +
+(exp)          numbered capture group (indexed by opening parenthesis)
+(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?:exp)        non-capturing group
+(?flags)       set flags within current group
+(?flags:exp)   set flags for exp (non-capturing)
+
+ +Flags are each a single character. For example, `(?x)` sets the flag `x` +and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +the `x` flag and clears the `y` flag. + +All flags are by default disabled unless stated otherwise. They are: + +
+i     case-insensitive: letters match both upper and lower case
+m     multi-line mode: ^ and $ match begin/end of line
+s     allow . to match \n
+U     swap the meaning of x* and x*?
+u     Unicode support (enabled by default)
+x     ignore whitespace and allow line comments (starting with `#`)
+
+ +Flags can be toggled within a pattern. Here's an example that matches +case-insensitively for the first part but case-sensitively for the second part: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); +let cap = re.captures("AaAaAbbBBBb").unwrap(); +assert_eq!(&cap[0], "AaAaAbb"); +# } +``` + +Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +`b`. + +Multi-line mode means `^` and `$` no longer match just at the beginning/end of +the input, but at the beginning/end of lines: + +``` +# use regex::Regex; +let re = Regex::new(r"(?m)^line \d+").unwrap(); +let m = re.find("line one\nline 2\n").unwrap(); +assert_eq!(m.as_str(), "line 2"); +``` + +Note that `^` matches after new lines, even at the end of input: + +``` +# use regex::Regex; +let re = Regex::new(r"(?m)^").unwrap(); +let m = re.find_iter("test\n").last().unwrap(); +assert_eq!((m.start(), m.end()), (5, 5)); +``` + +Here is an example that uses an ASCII word boundary instead of a Unicode +word boundary: + +```rust +# use regex::Regex; +# fn main() { +let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap(); +let cap = re.captures("$$abc$$").unwrap(); +assert_eq!(&cap[0], "abc"); +# } +``` + +## Escape sequences + +
+\*          literal *, works for any punctuation character: \.+*?()|[]{}^$
+\a          bell (\x07)
+\f          form feed (\x0C)
+\t          horizontal tab
+\n          new line
+\r          carriage return
+\v          vertical tab (\x0B)
+\123        octal character code (up to three digits) (when enabled)
+\x7F        hex character code (exactly two digits)
+\x{10FFFF}  any hex character code corresponding to a Unicode code point
+\u007F      hex character code (exactly four digits)
+\u{7F}      any hex character code corresponding to a Unicode code point
+\U0000007F  hex character code (exactly eight digits)
+\U{7F}      any hex character code corresponding to a Unicode code point
+
+ +## Perl character classes (Unicode friendly) + +These classes are based on the definitions provided in +[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties): + +
+\d     digit (\p{Nd})
+\D     not digit
+\s     whitespace (\p{White_Space})
+\S     not whitespace
+\w     word character (\p{Alphabetic} + \p{M} + \d + \p{Pc} + \p{Join_Control})
+\W     not word character
+
+ +## ASCII character classes + +
+[[:alnum:]]    alphanumeric ([0-9A-Za-z])
+[[:alpha:]]    alphabetic ([A-Za-z])
+[[:ascii:]]    ASCII ([\x00-\x7F])
+[[:blank:]]    blank ([\t ])
+[[:cntrl:]]    control ([\x00-\x1F\x7F])
+[[:digit:]]    digits ([0-9])
+[[:graph:]]    graphical ([!-~])
+[[:lower:]]    lower case ([a-z])
+[[:print:]]    printable ([ -~])
+[[:punct:]]    punctuation ([!-/:-@\[-`{-~])
+[[:space:]]    whitespace ([\t\n\v\f\r ])
+[[:upper:]]    upper case ([A-Z])
+[[:word:]]     word characters ([0-9A-Za-z_])
+[[:xdigit:]]   hex digit ([0-9A-Fa-f])
+
+ +# Crate features + +By default, this crate tries pretty hard to make regex matching both as fast +as possible and as correct as it can be, within reason. This means that there +is a lot of code dedicated to performance, the handling of Unicode data and the +Unicode data itself. Overall, this leads to more dependencies, larger binaries +and longer compile times. This trade off may not be appropriate in all cases, +and indeed, even when all Unicode and performance features are disabled, one +is still left with a perfectly serviceable regex engine that will work well +in many cases. + +This crate exposes a number of features for controlling that trade off. Some +of these features are strictly performance oriented, such that disabling them +won't result in a loss of functionality, but may result in worse performance. +Other features, such as the ones controlling the presence or absence of Unicode +data, can result in a loss of functionality. For example, if one disables the +`unicode-case` feature (described below), then compiling the regex `(?i)a` +will fail since Unicode case insensitivity is enabled by default. Instead, +callers must use `(?i-u)a` instead to disable Unicode case folding. Stated +differently, enabling or disabling any of the features below can only add or +subtract from the total set of valid regular expressions. Enabling or disabling +a feature will never modify the match semantics of a regular expression. + +All features below are enabled by default. + +### Ecosystem features + +* **std** - + When enabled, this will cause `regex` to use the standard library. Currently, + disabling this feature will always result in a compilation error. It is + intended to add `alloc`-only support to regex in the future. + +### Performance features + +* **perf** - + Enables all performance related features. This feature is enabled by default + and will always cover all features that improve performance, even if more + are added in the future. +* **perf-dfa** - + Enables the use of a lazy DFA for matching. The lazy DFA is used to compile + portions of a regex to a very fast DFA on an as-needed basis. This can + result in substantial speedups, usually by an order of magnitude on large + haystacks. The lazy DFA does not bring in any new dependencies, but it can + make compile times longer. +* **perf-inline** - + Enables the use of aggressive inlining inside match routines. This reduces + the overhead of each match. The aggressive inlining, however, increases + compile times and binary size. +* **perf-literal** - + Enables the use of literal optimizations for speeding up matches. In some + cases, literal optimizations can result in speedups of _several_ orders of + magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies. +* **perf-cache** - + This feature used to enable a faster internal cache at the cost of using + additional dependencies, but this is no longer an option. A fast internal + cache is now used unconditionally with no additional dependencies. This may + change in the future. + +### Unicode features + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). + This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. + + +# Untrusted input + +This crate can handle both untrusted regular expressions and untrusted +search text. + +Untrusted regular expressions are handled by capping the size of a compiled +regular expression. +(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).) +Without this, it would be trivial for an attacker to exhaust your system's +memory with expressions like `a{100}{100}{100}`. + +Untrusted search text is allowed because the matching engine(s) in this +crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search +text`), which means there's no way to cause exponential blow-up like with +some other regular expression engines. (We pay for this by disallowing +features like arbitrary look-ahead and backreferences.) + +When a DFA is used, pathological cases with exponential state blow-up are +avoided by constructing the DFA lazily or in an "online" manner. Therefore, +at most one new state can be created for each byte of input. This satisfies +our time complexity guarantees, but can lead to memory growth +proportional to the size of the input. As a stopgap, the DFA is only +allowed to store a fixed number of states. When the limit is reached, its +states are wiped and continues on, possibly duplicating previous work. If +the limit is reached too frequently, it gives up and hands control off to +another matching engine with fixed memory requirements. +(The DFA size limit can also be tweaked. See +[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).) +*/ + +#![deny(missing_docs)] +#![cfg_attr(feature = "pattern", feature(pattern))] +#![warn(missing_debug_implementations)] + +#[cfg(not(feature = "std"))] +compile_error!("`std` feature is currently required to build this crate"); + +// To check README's example +// TODO: Re-enable this once the MSRV is 1.43 or greater. +// See: https://github.com/rust-lang/regex/issues/684 +// See: https://github.com/rust-lang/regex/issues/685 +// #[cfg(doctest)] +// doc_comment::doctest!("../README.md"); + +#[cfg(feature = "std")] +pub use crate::error::Error; +#[cfg(feature = "std")] +pub use crate::re_builder::set_unicode::*; +#[cfg(feature = "std")] +pub use crate::re_builder::unicode::*; +#[cfg(feature = "std")] +pub use crate::re_set::unicode::*; +#[cfg(feature = "std")] +pub use crate::re_unicode::{ + escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, + Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, + SplitN, SubCaptureMatches, +}; + +/** +Match regular expressions on arbitrary bytes. + +This module provides a nearly identical API to the one found in the +top-level of this crate. There are two important differences: + +1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec` +is used where `String` would have been used. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes. + +# Example: match null terminated string + +This shows how to find all null-terminated strings in a slice of bytes: + +```rust +# use regex::bytes::Regex; +let re = Regex::new(r"(?-u)(?P[^\x00]+)\x00").unwrap(); +let text = b"foo\x00bar\x00baz\x00"; + +// Extract all of the strings without the null terminator from each match. +// The unwrap is OK here since a match requires the `cstr` capture to match. +let cstrs: Vec<&[u8]> = + re.captures_iter(text) + .map(|c| c.name("cstr").unwrap().as_bytes()) + .collect(); +assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +``` + +# Example: selectively enable Unicode support + +This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded +string (e.g., to extract a title from a Matroska file): + +```rust +# use std::str; +# use regex::bytes::Regex; +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); +let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; +let caps = re.captures(text).unwrap(); + +// Notice that despite the `.*` at the end, it will only match valid UTF-8 +// because Unicode mode was enabled with the `u` flag. Without the `u` flag, +// the `.*` would match the rest of the bytes. +let mat = caps.get(1).unwrap(); +assert_eq!((7, 10), (mat.start(), mat.end())); + +// If there was a match, Unicode mode guarantees that `title` is valid UTF-8. +let title = str::from_utf8(&caps[1]).unwrap(); +assert_eq!("☃", title); +``` + +In general, if the Unicode flag is enabled in a capture group and that capture +is part of the overall match, then the capture is *guaranteed* to be valid +UTF-8. + +# Syntax + +The supported syntax is pretty much the same as the syntax for Unicode +regular expressions with a few changes that make sense for matching arbitrary +bytes: + +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. +2. In ASCII compatible mode, neither Unicode scalar values nor Unicode +character classes are allowed. +3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) +revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps +to `[[:digit:]]` and `\s` maps to `[[:space:]]`. +4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to +determine whether a byte is a word byte or not. +5. Hexadecimal notation can be used to specify arbitrary bytes instead of +Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the +literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that +matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when +enabled. +6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the +`s` flag is additionally enabled, `.` matches any byte. + +# Performance + +In general, one should expect performance on `&[u8]` to be roughly similar to +performance on `&str`. +*/ +#[cfg(feature = "std")] +pub mod bytes { + pub use crate::re_builder::bytes::*; + pub use crate::re_builder::set_bytes::*; + pub use crate::re_bytes::*; + pub use crate::re_set::bytes::*; +} + +mod backtrack; +mod compile; +#[cfg(feature = "perf-dfa")] +mod dfa; +mod error; +mod exec; +mod expand; +mod find_byte; +mod input; +mod literal; +#[cfg(feature = "pattern")] +mod pattern; +mod pikevm; +mod pool; +mod prog; +mod re_builder; +mod re_bytes; +mod re_set; +mod re_trait; +mod re_unicode; +mod sparse; +mod utf8; + +/// The `internal` module exists to support suspicious activity, such as +/// testing different matching engines and supporting the `regex-debug` CLI +/// utility. +#[doc(hidden)] +#[cfg(feature = "std")] +pub mod internal { + pub use crate::compile::Compiler; + pub use crate::exec::{Exec, ExecBuilder}; + pub use crate::input::{Char, CharInput, Input, InputAt}; + pub use crate::literal::LiteralSearcher; + pub use crate::prog::{EmptyLook, Inst, InstRanges, Program}; +} diff --git a/vendor/regex/src/literal/imp.rs b/vendor/regex/src/literal/imp.rs new file mode 100644 index 0000000000..90b2f11606 --- /dev/null +++ b/vendor/regex/src/literal/imp.rs @@ -0,0 +1,402 @@ +use std::mem; + +use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; +use memchr::{memchr, memchr2, memchr3, memmem}; +use regex_syntax::hir::literal::{Literal, Literals}; + +/// A prefix extracted from a compiled regular expression. +/// +/// A regex prefix is a set of literal strings that *must* be matched at the +/// beginning of a regex in order for the entire regex to match. Similarly +/// for a regex suffix. +#[derive(Clone, Debug)] +pub struct LiteralSearcher { + complete: bool, + lcp: Memmem, + lcs: Memmem, + matcher: Matcher, +} + +#[derive(Clone, Debug)] +enum Matcher { + /// No literals. (Never advances through the input.) + Empty, + /// A set of four or more single byte literals. + Bytes(SingleByteSet), + /// A single substring, using vector accelerated routines when available. + Memmem(Memmem), + /// An Aho-Corasick automaton. + AC { ac: AhoCorasick, lits: Vec }, + /// A packed multiple substring searcher, using SIMD. + /// + /// Note that Aho-Corasick will actually use this packed searcher + /// internally automatically, however, there is some overhead associated + /// with going through the Aho-Corasick machinery. So using the packed + /// searcher directly results in some gains. + Packed { s: packed::Searcher, lits: Vec }, +} + +impl LiteralSearcher { + /// Returns a matcher that never matches and never advances the input. + pub fn empty() -> Self { + Self::new(Literals::empty(), Matcher::Empty) + } + + /// Returns a matcher for literal prefixes from the given set. + pub fn prefixes(lits: Literals) -> Self { + let matcher = Matcher::prefixes(&lits); + Self::new(lits, matcher) + } + + /// Returns a matcher for literal suffixes from the given set. + pub fn suffixes(lits: Literals) -> Self { + let matcher = Matcher::suffixes(&lits); + Self::new(lits, matcher) + } + + fn new(lits: Literals, matcher: Matcher) -> Self { + let complete = lits.all_complete(); + LiteralSearcher { + complete, + lcp: Memmem::new(lits.longest_common_prefix()), + lcs: Memmem::new(lits.longest_common_suffix()), + matcher, + } + } + + /// Returns true if all matches comprise the entire regular expression. + /// + /// This does not necessarily mean that a literal match implies a match + /// of the regular expression. For example, the regular expression `^a` + /// is comprised of a single complete literal `a`, but the regular + /// expression demands that it only match at the beginning of a string. + pub fn complete(&self) -> bool { + self.complete && !self.is_empty() + } + + /// Find the position of a literal in `haystack` if it exists. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { + use self::Matcher::*; + match self.matcher { + Empty => Some((0, 0)), + Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), + Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + AC { ref ac, .. } => { + ac.find(haystack).map(|m| (m.start(), m.end())) + } + Packed { ref s, .. } => { + s.find(haystack).map(|m| (m.start(), m.end())) + } + } + } + + /// Like find, except matches must start at index `0`. + pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[0..lit.len()] { + return Some((0, lit.len())); + } + } + None + } + + /// Like find, except matches must end at index `haystack.len()`. + pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[haystack.len() - lit.len()..] { + return Some((haystack.len() - lit.len(), haystack.len())); + } + } + None + } + + /// Returns an iterator over all literals to be matched. + pub fn iter(&self) -> LiteralIter<'_> { + match self.matcher { + Matcher::Empty => LiteralIter::Empty, + Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), + Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), + Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), + Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), + } + } + + /// Returns a matcher for the longest common prefix of this matcher. + pub fn lcp(&self) -> &Memmem { + &self.lcp + } + + /// Returns a matcher for the longest common suffix of this matcher. + pub fn lcs(&self) -> &Memmem { + &self.lcs + } + + /// Returns true iff this prefix is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of prefixes in this machine. + pub fn len(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.dense.len(), + Memmem(_) => 1, + AC { ref ac, .. } => ac.pattern_count(), + Packed { ref lits, .. } => lits.len(), + } + } + + /// Return the approximate heap usage of literals in bytes. + pub fn approximate_size(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.approximate_size(), + Memmem(ref single) => single.approximate_size(), + AC { ref ac, .. } => ac.heap_bytes(), + Packed { ref s, .. } => s.heap_bytes(), + } + } +} + +impl Matcher { + fn prefixes(lits: &Literals) -> Self { + let sset = SingleByteSet::prefixes(lits); + Matcher::new(lits, sset) + } + + fn suffixes(lits: &Literals) -> Self { + let sset = SingleByteSet::suffixes(lits); + Matcher::new(lits, sset) + } + + fn new(lits: &Literals, sset: SingleByteSet) -> Self { + if lits.literals().is_empty() { + return Matcher::Empty; + } + if sset.dense.len() >= 26 { + // Avoid trying to match a large number of single bytes. + // This is *very* sensitive to a frequency analysis comparison + // between the bytes in sset and the composition of the haystack. + // No matter the size of sset, if its members all are rare in the + // haystack, then it'd be worth using it. How to tune this... IDK. + // ---AG + return Matcher::Empty; + } + if sset.complete { + return Matcher::Bytes(sset); + } + if lits.literals().len() == 1 { + return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + } + + let pats = lits.literals().to_owned(); + let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; + if lits.literals().len() <= 100 && !is_aho_corasick_fast { + let mut builder = packed::Config::new() + .match_kind(packed::MatchKind::LeftmostFirst) + .builder(); + if let Some(s) = builder.extend(&pats).build() { + return Matcher::Packed { s, lits: pats }; + } + } + let ac = AhoCorasickBuilder::new() + .match_kind(aho_corasick::MatchKind::LeftmostFirst) + .dfa(true) + .build_with_size::(&pats) + .unwrap(); + Matcher::AC { ac, lits: pats } + } +} + +#[derive(Debug)] +pub enum LiteralIter<'a> { + Empty, + Bytes(&'a [u8]), + Single(&'a [u8]), + AC(&'a [Literal]), + Packed(&'a [Literal]), +} + +impl<'a> Iterator for LiteralIter<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + match *self { + LiteralIter::Empty => None, + LiteralIter::Bytes(ref mut many) => { + if many.is_empty() { + None + } else { + let next = &many[0..1]; + *many = &many[1..]; + Some(next) + } + } + LiteralIter::Single(ref mut one) => { + if one.is_empty() { + None + } else { + let next = &one[..]; + *one = &[]; + Some(next) + } + } + LiteralIter::AC(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + LiteralIter::Packed(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + } + } +} + +#[derive(Clone, Debug)] +struct SingleByteSet { + sparse: Vec, + dense: Vec, + complete: bool, + all_ascii: bool, +} + +impl SingleByteSet { + fn new() -> SingleByteSet { + SingleByteSet { + sparse: vec![false; 256], + dense: vec![], + complete: true, + all_ascii: true, + } + } + + fn prefixes(lits: &Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(0) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + fn suffixes(lits: &Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + /// Faster find that special cases certain sizes to use memchr. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find(&self, text: &[u8]) -> Option { + match self.dense.len() { + 0 => None, + 1 => memchr(self.dense[0], text), + 2 => memchr2(self.dense[0], self.dense[1], text), + 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), + _ => self._find(text), + } + } + + /// Generic find that works on any sized set. + fn _find(&self, haystack: &[u8]) -> Option { + for (i, &b) in haystack.iter().enumerate() { + if self.sparse[b as usize] { + return Some(i); + } + } + None + } + + fn approximate_size(&self) -> usize { + (self.dense.len() * mem::size_of::()) + + (self.sparse.len() * mem::size_of::()) + } +} + +/// A simple wrapper around the memchr crate's memmem implementation. +/// +/// The API this exposes mirrors the API of previous substring searchers that +/// this supplanted. +#[derive(Clone, Debug)] +pub struct Memmem { + finder: memmem::Finder<'static>, + char_len: usize, +} + +impl Memmem { + fn new(pat: &[u8]) -> Memmem { + Memmem { + finder: memmem::Finder::new(pat).into_owned(), + char_len: char_len_lossy(pat), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option { + self.finder.find(haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn is_suffix(&self, text: &[u8]) -> bool { + if text.len() < self.len() { + return false; + } + &text[text.len() - self.len()..] == self.finder.needle() + } + + pub fn len(&self) -> usize { + self.finder.needle().len() + } + + pub fn char_len(&self) -> usize { + self.char_len + } + + fn approximate_size(&self) -> usize { + self.finder.needle().len() * mem::size_of::() + } +} + +fn char_len_lossy(bytes: &[u8]) -> usize { + String::from_utf8_lossy(bytes).chars().count() +} diff --git a/vendor/regex/src/literal/mod.rs b/vendor/regex/src/literal/mod.rs new file mode 100644 index 0000000000..980f523309 --- /dev/null +++ b/vendor/regex/src/literal/mod.rs @@ -0,0 +1,55 @@ +pub use self::imp::*; + +#[cfg(feature = "perf-literal")] +mod imp; + +#[allow(missing_docs)] +#[cfg(not(feature = "perf-literal"))] +mod imp { + use regex_syntax::hir::literal::Literals; + + #[derive(Clone, Debug)] + pub struct LiteralSearcher(()); + + impl LiteralSearcher { + pub fn empty() -> Self { + LiteralSearcher(()) + } + + pub fn prefixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn suffixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn complete(&self) -> bool { + false + } + + pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn is_empty(&self) -> bool { + true + } + + pub fn len(&self) -> usize { + 0 + } + + pub fn approximate_size(&self) -> usize { + 0 + } + } +} diff --git a/vendor/regex/src/pattern.rs b/vendor/regex/src/pattern.rs new file mode 100644 index 0000000000..00549e5106 --- /dev/null +++ b/vendor/regex/src/pattern.rs @@ -0,0 +1,63 @@ +use std::str::pattern::{Pattern, SearchStep, Searcher}; + +use crate::re_unicode::{Matches, Regex}; + +#[derive(Debug)] +pub struct RegexSearcher<'r, 't> { + haystack: &'t str, + it: Matches<'r, 't>, + last_step_end: usize, + next_match: Option<(usize, usize)>, +} + +impl<'r, 't> Pattern<'t> for &'r Regex { + type Searcher = RegexSearcher<'r, 't>; + + fn into_searcher(self, haystack: &'t str) -> RegexSearcher<'r, 't> { + RegexSearcher { + haystack, + it: self.find_iter(haystack), + last_step_end: 0, + next_match: None, + } + } +} + +unsafe impl<'r, 't> Searcher<'t> for RegexSearcher<'r, 't> { + #[inline] + fn haystack(&self) -> &'t str { + self.haystack + } + + #[inline] + fn next(&mut self) -> SearchStep { + if let Some((s, e)) = self.next_match { + self.next_match = None; + self.last_step_end = e; + return SearchStep::Match(s, e); + } + match self.it.next() { + None => { + if self.last_step_end < self.haystack().len() { + let last = self.last_step_end; + self.last_step_end = self.haystack().len(); + SearchStep::Reject(last, self.haystack().len()) + } else { + SearchStep::Done + } + } + Some(m) => { + let (s, e) = (m.start(), m.end()); + if s == self.last_step_end { + self.last_step_end = e; + SearchStep::Match(s, e) + } else { + self.next_match = Some((s, e)); + let last = self.last_step_end; + self.last_step_end = s; + SearchStep::Reject(last, s) + } + } + } + } +} diff --git a/vendor/regex/src/pikevm.rs b/vendor/regex/src/pikevm.rs new file mode 100644 index 0000000000..8c9eac2d39 --- /dev/null +++ b/vendor/regex/src/pikevm.rs @@ -0,0 +1,360 @@ +// This module implements the Pike VM. That is, it guarantees linear time +// search of a regex on any text with memory use proportional to the size of +// the regex. +// +// It is equal in power to the backtracking engine in this crate, except the +// backtracking engine is typically faster on small regexes/texts at the +// expense of a bigger memory footprint. +// +// It can do more than the DFA can (specifically, record capture locations +// and execute Unicode word boundary assertions), but at a slower speed. +// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding +// epsilon transitions. That is, the Pike VM engine can be in multiple states +// at once where as the DFA is only ever in one state at a time. +// +// Therefore, the Pike VM is generally treated as the fallback when the other +// matching engines either aren't feasible to run or are insufficient. + +use std::mem; + +use crate::exec::ProgramCache; +use crate::input::{Input, InputAt}; +use crate::prog::{InstPtr, Program}; +use crate::re_trait::Slot; +use crate::sparse::SparseSet; + +/// An NFA simulation matching engine. +#[derive(Debug)] +pub struct Fsm<'r, I> { + /// The sequence of opcodes (among other things) that is actually executed. + /// + /// The program may be byte oriented or Unicode codepoint oriented. + prog: &'r Program, + /// An explicit stack used for following epsilon transitions. (This is + /// borrowed from the cache.) + stack: &'r mut Vec, + /// The input to search. + input: I, +} + +/// A cached allocation that can be reused on each execution. +#[derive(Clone, Debug)] +pub struct Cache { + /// A pair of ordered sets for tracking NFA states. + clist: Threads, + nlist: Threads, + /// An explicit stack used for following epsilon transitions. + stack: Vec, +} + +/// An ordered set of NFA states and their captures. +#[derive(Clone, Debug)] +struct Threads { + /// An ordered set of opcodes (each opcode is an NFA state). + set: SparseSet, + /// Captures for every NFA state. + /// + /// It is stored in row-major order, where the columns are the capture + /// slots and the rows are the states. + caps: Vec, + /// The number of capture slots stored per thread. (Every capture has + /// two slots.) + slots_per_thread: usize, +} + +/// A representation of an explicit stack frame when following epsilon +/// transitions. This is used to avoid recursion. +#[derive(Clone, Debug)] +enum FollowEpsilon { + /// Follow transitions at the given instruction pointer. + IP(InstPtr), + /// Restore the capture slot with the given position in the input. + Capture { slot: usize, pos: Slot }, +} + +impl Cache { + /// Create a new allocation used by the NFA machine to record execution + /// and captures. + pub fn new(_prog: &Program) -> Self { + Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } + } +} + +impl<'r, I: Input> Fsm<'r, I> { + /// Execute the NFA matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.pikevm; + cache.clist.resize(prog.len(), prog.captures.len()); + cache.nlist.resize(prog.len(), prog.captures.len()); + let at = input.at(start); + Fsm { prog, stack: &mut cache.stack, input }.exec_( + &mut cache.clist, + &mut cache.nlist, + matches, + slots, + quit_after_match, + at, + end, + ) + } + + fn exec_( + &mut self, + mut clist: &mut Threads, + mut nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + mut at: InputAt, + end: usize, + ) -> bool { + let mut matched = false; + let mut all_matched = false; + clist.set.clear(); + nlist.set.clear(); + 'LOOP: loop { + if clist.set.is_empty() { + // Three ways to bail out when our current set of threads is + // empty. + // + // 1. We have a match---so we're done exploring any possible + // alternatives. Time to quit. (We can't do this if we're + // looking for matches for multiple regexes, unless we know + // they all matched.) + // + // 2. If the expression starts with a '^' we can terminate as + // soon as the last thread dies. + if (matched && matches.len() <= 1) + || all_matched + || (!at.is_start() && self.prog.is_anchored_start) + { + break; + } + + // 3. If there's a literal prefix for the program, try to + // jump ahead quickly. If it can't be found, then we can + // bail out early. + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + } + + // This simulates a preceding '.*?' for every regex by adding + // a state starting at the current position in the input for the + // beginning of the program only if we don't already have a match. + if clist.set.is_empty() + || (!self.prog.is_anchored_start && !all_matched) + { + self.add(&mut clist, slots, 0, at); + } + // The previous call to "add" actually inspects the position just + // before the current character. For stepping through the machine, + // we can to look at the current character, so we advance the + // input. + let at_next = self.input.at(at.next_pos()); + for i in 0..clist.set.len() { + let ip = clist.set[i]; + if self.step( + &mut nlist, + matches, + slots, + clist.caps(ip), + ip, + at, + at_next, + ) { + matched = true; + all_matched = all_matched || matches.iter().all(|&b| b); + if quit_after_match { + // If we only care if a match occurs (not its + // position), then we can quit right now. + break 'LOOP; + } + if self.prog.matches.len() == 1 { + // We don't need to check the rest of the threads + // in this set because we've matched something + // ("leftmost-first"). However, we still need to check + // threads in the next set to support things like + // greedy matching. + // + // This is only true on normal regexes. For regex sets, + // we need to mush on to observe other matches. + break; + } + } + } + if at.pos() >= end { + break; + } + at = at_next; + mem::swap(clist, nlist); + nlist.set.clear(); + } + matched + } + + /// Step through the input, one token (byte or codepoint) at a time. + /// + /// nlist is the set of states that will be processed on the next token + /// in the input. + /// + /// caps is the set of captures passed by the caller of the NFA. They are + /// written to only when a match state is visited. + /// + /// thread_caps is the set of captures set for the current NFA state, ip. + /// + /// at and at_next are the current and next positions in the input. at or + /// at_next may be EOF. + fn step( + &mut self, + nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + at_next: InputAt, + ) -> bool { + use crate::prog::Inst::*; + match self.prog[ip] { + Match(match_slot) => { + if match_slot < matches.len() { + matches[match_slot] = true; + } + for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + true + } + Char(ref inst) => { + if inst.c == at.char() { + self.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + self.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + self.add(nlist, thread_caps, inst.goto, at_next); + } + } + false + } + EmptyLook(_) | Save(_) | Split(_) => false, + } + } + + /// Follows epsilon transitions and adds them for processing to nlist, + /// starting at and including ip. + fn add( + &mut self, + nlist: &mut Threads, + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + ) { + self.stack.push(FollowEpsilon::IP(ip)); + while let Some(frame) = self.stack.pop() { + match frame { + FollowEpsilon::IP(ip) => { + self.add_step(nlist, thread_caps, ip, at); + } + FollowEpsilon::Capture { slot, pos } => { + thread_caps[slot] = pos; + } + } + } + } + + /// A helper function for add that avoids excessive pushing to the stack. + fn add_step( + &mut self, + nlist: &mut Threads, + thread_caps: &mut [Option], + mut ip: usize, + at: InputAt, + ) { + // Instead of pushing and popping to the stack, we mutate ip as we + // traverse the set of states. We only push to the stack when we + // absolutely need recursion (restoring captures or following a + // branch). + use crate::prog::Inst::*; + loop { + // Don't visit states we've already added. + if nlist.set.contains(ip) { + return; + } + nlist.set.insert(ip); + match self.prog[ip] { + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } + } + Save(ref inst) => { + if inst.slot < thread_caps.len() { + self.stack.push(FollowEpsilon::Capture { + slot: inst.slot, + pos: thread_caps[inst.slot], + }); + thread_caps[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.stack.push(FollowEpsilon::IP(inst.goto2)); + ip = inst.goto1; + } + Match(_) | Char(_) | Ranges(_) | Bytes(_) => { + let t = &mut nlist.caps(ip); + for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + return; + } + } + } + } +} + +impl Threads { + fn new() -> Self { + Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } + } + + fn resize(&mut self, num_insts: usize, ncaps: usize) { + if num_insts == self.set.capacity() { + return; + } + self.slots_per_thread = ncaps * 2; + self.set = SparseSet::new(num_insts); + self.caps = vec![None; self.slots_per_thread * num_insts]; + } + + fn caps(&mut self, pc: usize) -> &mut [Option] { + let i = pc * self.slots_per_thread; + &mut self.caps[i..i + self.slots_per_thread] + } +} diff --git a/vendor/regex/src/pool.rs b/vendor/regex/src/pool.rs new file mode 100644 index 0000000000..6a6f15b194 --- /dev/null +++ b/vendor/regex/src/pool.rs @@ -0,0 +1,333 @@ +// This module provides a relatively simple thread-safe pool of reusable +// objects. For the most part, it's implemented by a stack represented by a +// Mutex>. It has one small trick: because unlocking a mutex is somewhat +// costly, in the case where a pool is accessed by the first thread that tried +// to get a value, we bypass the mutex. Here are some benchmarks showing the +// difference. +// +// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s) +// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s) +// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s) +// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s) +// +// (1) represents our baseline: the master branch at the time of writing when +// using the 'thread_local' crate to implement the pool below. +// +// (2) represents a naive pool implemented completely via Mutex>. There +// is no special trick for bypassing the mutex. +// +// (3) is the same as (2), except it uses Mutex>>. It is twice as +// fast because a Box is much smaller than the T we use with a Pool in this +// crate. So pushing and popping a Box from a Vec is quite a bit faster +// than for T. +// +// (4) is the same as (3), but with the trick for bypassing the mutex in the +// case of the first-to-get thread. +// +// Why move off of thread_local? Even though (4) is a hair faster than (1) +// above, this was not the main goal. The main goal was to move off of +// thread_local and find a way to *simply* re-capture some of its speed for +// regex's specific case. So again, why move off of it? The *primary* reason is +// because of memory leaks. See https://github.com/rust-lang/regex/issues/362 +// for example. (Why do I want it to be simple? Well, I suppose what I mean is, +// "use as much safe code as possible to minimize risk and be as sure as I can +// be that it is correct.") +// +// My guess is that the thread_local design is probably not appropriate for +// regex since its memory usage scales to the number of active threads that +// have used a regex, where as the pool below scales to the number of threads +// that simultaneously use a regex. While neither case permits contraction, +// since we own the pool data structure below, we can add contraction if a +// clear use case pops up in the wild. More pressingly though, it seems that +// there are at least some use case patterns where one might have many threads +// sitting around that might have used a regex at one point. While thread_local +// does try to reuse space previously used by a thread that has since stopped, +// its maximal memory usage still scales with the total number of active +// threads. In contrast, the pool below scales with the total number of threads +// *simultaneously* using the pool. The hope is that this uses less memory +// overall. And if it doesn't, we can hopefully tune it somehow. +// +// It seems that these sort of conditions happen frequently +// in FFI inside of other more "managed" languages. This was +// mentioned in the issue linked above, and also mentioned here: +// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users +// confirm that disabling the use of thread_local resolves the leak. +// +// There were other weaker reasons for moving off of thread_local as well. +// Namely, at the time, I was looking to reduce dependencies. And for something +// like regex, maintenance can be simpler when we own the full dependency tree. + +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; + +/// An atomic counter used to allocate thread IDs. +static COUNTER: AtomicUsize = AtomicUsize::new(1); + +thread_local!( + /// A thread local used to assign an ID to a thread. + static THREAD_ID: usize = { + let next = COUNTER.fetch_add(1, Ordering::Relaxed); + // SAFETY: We cannot permit the reuse of thread IDs since reusing a + // thread ID might result in more than one thread "owning" a pool, + // and thus, permit accessing a mutable value from multiple threads + // simultaneously without synchronization. The intent of this panic is + // to be a sanity check. It is not expected that the thread ID space + // will actually be exhausted in practice. + // + // This checks that the counter never wraps around, since atomic + // addition wraps around on overflow. + if next == 0 { + panic!("regex: thread ID allocation space exhausted"); + } + next + }; +); + +/// The type of the function used to create values in a pool when the pool is +/// empty and the caller requests one. +type CreateFn = + Box T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>; + +/// A simple thread safe pool for reusing values. +/// +/// Getting a value out comes with a guard. When that guard is dropped, the +/// value is automatically put back in the pool. +/// +/// A Pool impls Sync when T is Send (even if it's not Sync). This means +/// that T can use interior mutability. This is possible because a pool is +/// guaranteed to provide a value to exactly one thread at any time. +/// +/// Currently, a pool never contracts in size. Its size is proportional to the +/// number of simultaneous uses. +pub struct Pool { + /// A stack of T values to hand out. These are used when a Pool is + /// accessed by a thread that didn't create it. + stack: Mutex>>, + /// A function to create more T values when stack is empty and a caller + /// has requested a T. + create: CreateFn, + /// The ID of the thread that owns this pool. The owner is the thread + /// that makes the first call to 'get'. When the owner calls 'get', it + /// gets 'owner_val' directly instead of returning a T from 'stack'. + /// See comments elsewhere for details, but this is intended to be an + /// optimization for the common case that makes getting a T faster. + /// + /// It is initialized to a value of zero (an impossible thread ID) as a + /// sentinel to indicate that it is unowned. + owner: AtomicUsize, + /// A value to return when the caller is in the same thread that created + /// the Pool. + owner_val: T, +} + +// SAFETY: Since we want to use a Pool from multiple threads simultaneously +// behind an Arc, we need for it to be Sync. In cases where T is sync, Pool +// would be Sync. However, since we use a Pool to store mutable scratch space, +// we wind up using a T that has interior mutability and is thus itself not +// Sync. So what we *really* want is for our Pool to by Sync even when T is +// not Sync (but is at least Send). +// +// The only non-sync aspect of a Pool is its 'owner_val' field, which is used +// to implement faster access to a pool value in the common case of a pool +// being accessed in the same thread in which it was created. The 'stack' field +// is also shared, but a Mutex where T: Send is already Sync. So we only +// need to worry about 'owner_val'. +// +// The key is to guarantee that 'owner_val' can only ever be accessed from one +// thread. In our implementation below, we guarantee this by only returning the +// 'owner_val' when the ID of the current thread matches the ID of the thread +// that created the Pool. Since this can only ever be one thread, it follows +// that only one thread can access 'owner_val' at any point in time. Thus, it +// is safe to declare that Pool is Sync when T is Send. +// +// NOTE: It would also be possible to make the owning thread be the *first* +// thread that tries to get a value out of a Pool. However, the current +// implementation is a little simpler and it's not clear if making the first +// thread (rather than the creating thread) is meaningfully better. +// +// If there is a way to achieve our performance goals using safe code, then +// I would very much welcome a patch. As it stands, the implementation below +// tries to balance safety with performance. The case where a Regex is used +// from multiple threads simultaneously will suffer a bit since getting a cache +// will require unlocking a mutex. +unsafe impl Sync for Pool {} + +impl ::std::fmt::Debug for Pool { + fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + f.debug_struct("Pool") + .field("stack", &self.stack) + .field("owner", &self.owner) + .field("owner_val", &self.owner_val) + .finish() + } +} + +/// A guard that is returned when a caller requests a value from the pool. +/// +/// The purpose of the guard is to use RAII to automatically put the value back +/// in the pool once it's dropped. +#[derive(Debug)] +pub struct PoolGuard<'a, T: Send> { + /// The pool that this guard is attached to. + pool: &'a Pool, + /// This is None when the guard represents the special "owned" value. In + /// which case, the value is retrieved from 'pool.owner_val'. + value: Option>, +} + +impl Pool { + /// Create a new pool. The given closure is used to create values in the + /// pool when necessary. + pub fn new(create: CreateFn) -> Pool { + let owner = AtomicUsize::new(0); + let owner_val = create(); + Pool { stack: Mutex::new(vec![]), create, owner, owner_val } + } + + /// Get a value from the pool. The caller is guaranteed to have exclusive + /// access to the given value. + /// + /// Note that there is no guarantee provided about which value in the + /// pool is returned. That is, calling get, dropping the guard (causing + /// the value to go back into the pool) and then calling get again is NOT + /// guaranteed to return the same value received in the first get call. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn get(&self) -> PoolGuard<'_, T> { + // Our fast path checks if the caller is the thread that "owns" this + // pool. Or stated differently, whether it is the first thread that + // tried to extract a value from the pool. If it is, then we can return + // a T to the caller without going through a mutex. + // + // SAFETY: We must guarantee that only one thread gets access to this + // value. Since a thread is uniquely identified by the THREAD_ID thread + // local, it follows that is the caller's thread ID is equal to the + // owner, then only one thread may receive this value. + let caller = THREAD_ID.with(|id| *id); + let owner = self.owner.load(Ordering::Relaxed); + if caller == owner { + return self.guard_owned(); + } + self.get_slow(caller, owner) + } + + /// This is the "slow" version that goes through a mutex to pop an + /// allocated value off a stack to return to the caller. (Or, if the stack + /// is empty, a new value is created.) + /// + /// If the pool has no owner, then this will set the owner. + #[cold] + fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> { + use std::sync::atomic::Ordering::Relaxed; + + if owner == 0 { + // The sentinel 0 value means this pool is not yet owned. We + // try to atomically set the owner. If we do, then this thread + // becomes the owner and we can return a guard that represents + // the special T for the owner. + let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed); + if res.is_ok() { + return self.guard_owned(); + } + } + let mut stack = self.stack.lock().unwrap(); + let value = match stack.pop() { + None => Box::new((self.create)()), + Some(value) => value, + }; + self.guard_stack(value) + } + + /// Puts a value back into the pool. Callers don't need to call this. Once + /// the guard that's returned by 'get' is dropped, it is put back into the + /// pool automatically. + fn put(&self, value: Box) { + let mut stack = self.stack.lock().unwrap(); + stack.push(value); + } + + /// Create a guard that represents the special owned T. + fn guard_owned(&self) -> PoolGuard<'_, T> { + PoolGuard { pool: self, value: None } + } + + /// Create a guard that contains a value from the pool's stack. + fn guard_stack(&self, value: Box) -> PoolGuard<'_, T> { + PoolGuard { pool: self, value: Some(value) } + } +} + +impl<'a, T: Send> PoolGuard<'a, T> { + /// Return the underlying value. + pub fn value(&self) -> &T { + match self.value { + None => &self.pool.owner_val, + Some(ref v) => &**v, + } + } +} + +impl<'a, T: Send> Drop for PoolGuard<'a, T> { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn drop(&mut self) { + if let Some(value) = self.value.take() { + self.pool.put(value); + } + } +} + +#[cfg(test)] +mod tests { + use std::panic::{RefUnwindSafe, UnwindSafe}; + + use super::*; + + #[test] + fn oibits() { + use crate::exec::ProgramCache; + + fn has_oibits() {} + has_oibits::>(); + } + + // Tests that Pool implements the "single owner" optimization. That is, the + // thread that first accesses the pool gets its own copy, while all other + // threads get distinct copies. + #[test] + fn thread_owner_optimization() { + use std::cell::RefCell; + use std::sync::Arc; + + let pool: Arc>>> = + Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a'])))); + pool.get().value().borrow_mut().push('x'); + + let pool1 = pool.clone(); + let t1 = std::thread::spawn(move || { + let guard = pool1.get(); + let v = guard.value(); + v.borrow_mut().push('y'); + }); + + let pool2 = pool.clone(); + let t2 = std::thread::spawn(move || { + let guard = pool2.get(); + let v = guard.value(); + v.borrow_mut().push('z'); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + + // If we didn't implement the single owner optimization, then one of + // the threads above is likely to have mutated the [a, x] vec that + // we stuffed in the pool before spawning the threads. But since + // neither thread was first to access the pool, and because of the + // optimization, we should be guaranteed that neither thread mutates + // the special owned pool value. + // + // (Technically this is an implementation detail and not a contract of + // Pool's API.) + assert_eq!(vec!['a', 'x'], *pool.get().value().borrow()); + } +} diff --git a/vendor/regex/src/prog.rs b/vendor/regex/src/prog.rs new file mode 100644 index 0000000000..c211f71d8a --- /dev/null +++ b/vendor/regex/src/prog.rs @@ -0,0 +1,447 @@ +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fmt; +use std::mem; +use std::ops::Deref; +use std::slice; +use std::sync::Arc; + +use crate::input::Char; +use crate::literal::LiteralSearcher; + +/// `InstPtr` represents the index of an instruction in a regex program. +pub type InstPtr = usize; + +/// Program is a sequence of instructions and various facts about thos +/// instructions. +#[derive(Clone)] +pub struct Program { + /// A sequence of instructions that represents an NFA. + pub insts: Vec, + /// Pointers to each Match instruction in the sequence. + /// + /// This is always length 1 unless this program represents a regex set. + pub matches: Vec, + /// The ordered sequence of all capture groups extracted from the AST. + /// Unnamed groups are `None`. + pub captures: Vec>, + /// Pointers to all named capture groups into `captures`. + pub capture_name_idx: Arc>, + /// A pointer to the start instruction. This can vary depending on how + /// the program was compiled. For example, programs for use with the DFA + /// engine have a `.*?` inserted at the beginning of unanchored regular + /// expressions. The actual starting point of the program is after the + /// `.*?`. + pub start: InstPtr, + /// A set of equivalence classes for discriminating bytes in the compiled + /// program. + pub byte_classes: Vec, + /// When true, this program can only match valid UTF-8. + pub only_utf8: bool, + /// When true, this program uses byte range instructions instead of Unicode + /// range instructions. + pub is_bytes: bool, + /// When true, the program is compiled for DFA matching. For example, this + /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored + /// regexes. + pub is_dfa: bool, + /// When true, the program matches text in reverse (for use only in the + /// DFA). + pub is_reverse: bool, + /// Whether the regex must match from the start of the input. + pub is_anchored_start: bool, + /// Whether the regex must match at the end of the input. + pub is_anchored_end: bool, + /// Whether this program contains a Unicode word boundary instruction. + pub has_unicode_word_boundary: bool, + /// A possibly empty machine for very quickly matching prefix literals. + pub prefixes: LiteralSearcher, + /// A limit on the size of the cache that the DFA is allowed to use while + /// matching. + /// + /// The cache limit specifies approximately how much space we're willing to + /// give to the state cache. Once the state cache exceeds the size, it is + /// wiped and all states must be re-computed. + /// + /// Note that this value does not impact correctness. It can be set to 0 + /// and the DFA will run just fine. (It will only ever store exactly one + /// state in the cache, and will likely run very slowly, but it will work.) + /// + /// Also note that this limit is *per thread of execution*. That is, + /// if the same regex is used to search text across multiple threads + /// simultaneously, then the DFA cache is not shared. Instead, copies are + /// made. + pub dfa_size_limit: usize, +} + +impl Program { + /// Creates an empty instruction sequence. Fields are given default + /// values. + pub fn new() -> Self { + Program { + insts: vec![], + matches: vec![], + captures: vec![], + capture_name_idx: Arc::new(HashMap::new()), + start: 0, + byte_classes: vec![0; 256], + only_utf8: true, + is_bytes: false, + is_dfa: false, + is_reverse: false, + is_anchored_start: false, + is_anchored_end: false, + has_unicode_word_boundary: false, + prefixes: LiteralSearcher::empty(), + dfa_size_limit: 2 * (1 << 20), + } + } + + /// If pc is an index to a no-op instruction (like Save), then return the + /// next pc that is not a no-op instruction. + pub fn skip(&self, mut pc: usize) -> usize { + loop { + match self[pc] { + Inst::Save(ref i) => pc = i.goto, + _ => return pc, + } + } + } + + /// Return true if and only if an execution engine at instruction `pc` will + /// always lead to a match. + pub fn leads_to_match(&self, pc: usize) -> bool { + if self.matches.len() > 1 { + // If we have a regex set, then we have more than one ending + // state, so leading to one of those states is generally + // meaningless. + return false; + } + match self[self.skip(pc)] { + Inst::Match(_) => true, + _ => false, + } + } + + /// Returns true if the current configuration demands that an implicit + /// `.*?` be prepended to the instruction sequence. + pub fn needs_dotstar(&self) -> bool { + self.is_dfa && !self.is_reverse && !self.is_anchored_start + } + + /// Returns true if this program uses Byte instructions instead of + /// Char/Range instructions. + pub fn uses_bytes(&self) -> bool { + self.is_bytes || self.is_dfa + } + + /// Returns true if this program exclusively matches valid UTF-8 bytes. + /// + /// That is, if an invalid UTF-8 byte is seen, then no match is possible. + pub fn only_utf8(&self) -> bool { + self.only_utf8 + } + + /// Return the approximate heap usage of this instruction sequence in + /// bytes. + pub fn approximate_size(&self) -> usize { + // The only instruction that uses heap space is Ranges (for + // Unicode codepoint programs) to store non-overlapping codepoint + // ranges. To keep this operation constant time, we ignore them. + (self.len() * mem::size_of::()) + + (self.matches.len() * mem::size_of::()) + + (self.captures.len() * mem::size_of::>()) + + (self.capture_name_idx.len() + * (mem::size_of::() + mem::size_of::())) + + (self.byte_classes.len() * mem::size_of::()) + + self.prefixes.approximate_size() + } +} + +impl Deref for Program { + type Target = [Inst]; + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn deref(&self) -> &Self::Target { + &*self.insts + } +} + +impl fmt::Debug for Program { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::Inst::*; + + fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { + if goto == cur + 1 { + fmtd + } else { + format!("{} (goto: {})", fmtd, goto) + } + } + + fn visible_byte(b: u8) -> String { + use std::ascii::escape_default; + let escaped = escape_default(b).collect::>(); + String::from_utf8_lossy(&escaped).into_owned() + } + + for (pc, inst) in self.iter().enumerate() { + match *inst { + Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, + Save(ref inst) => { + let s = format!("{:04} Save({})", pc, inst.slot); + write!(f, "{}", with_goto(pc, inst.goto, s))?; + } + Split(ref inst) => { + write!( + f, + "{:04} Split({}, {})", + pc, inst.goto1, inst.goto2 + )?; + } + EmptyLook(ref inst) => { + let s = format!("{:?}", inst.look); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Char(ref inst) => { + let s = format!("{:?}", inst.c); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + Ranges(ref inst) => { + let ranges = inst + .ranges + .iter() + .map(|r| format!("{:?}-{:?}", r.0, r.1)) + .collect::>() + .join(", "); + write!( + f, + "{:04} {}", + pc, + with_goto(pc, inst.goto, ranges) + )?; + } + Bytes(ref inst) => { + let s = format!( + "Bytes({}, {})", + visible_byte(inst.start), + visible_byte(inst.end) + ); + write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + } + } + if pc == self.start { + write!(f, " (start)")?; + } + writeln!(f)?; + } + Ok(()) + } +} + +impl<'a> IntoIterator for &'a Program { + type Item = &'a Inst; + type IntoIter = slice::Iter<'a, Inst>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// Inst is an instruction code in a Regex program. +/// +/// Regrettably, a regex program either contains Unicode codepoint +/// instructions (Char and Ranges) or it contains byte instructions (Bytes). +/// A regex program can never contain both. +/// +/// It would be worth investigating splitting this into two distinct types and +/// then figuring out how to make the matching engines polymorphic over those +/// types without sacrificing performance. +/// +/// Other than the benefit of moving invariants into the type system, another +/// benefit is the decreased size. If we remove the `Char` and `Ranges` +/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to +/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` +/// variant.) Given that byte based machines are typically much bigger than +/// their Unicode analogues (because they can decode UTF-8 directly), this ends +/// up being a pretty significant savings. +#[derive(Clone, Debug)] +pub enum Inst { + /// Match indicates that the program has reached a match state. + /// + /// The number in the match corresponds to the Nth logical regular + /// expression in this program. This index is always 0 for normal regex + /// programs. Values greater than 0 appear when compiling regex sets, and + /// each match instruction gets its own unique value. The value corresponds + /// to the Nth regex in the set. + Match(usize), + /// Save causes the program to save the current location of the input in + /// the slot indicated by InstSave. + Save(InstSave), + /// Split causes the program to diverge to one of two paths in the + /// program, preferring goto1 in InstSplit. + Split(InstSplit), + /// EmptyLook represents a zero-width assertion in a regex program. A + /// zero-width assertion does not consume any of the input text. + EmptyLook(InstEmptyLook), + /// Char requires the regex program to match the character in InstChar at + /// the current position in the input. + Char(InstChar), + /// Ranges requires the regex program to match the character at the current + /// position in the input with one of the ranges specified in InstRanges. + Ranges(InstRanges), + /// Bytes is like Ranges, except it expresses a single byte range. It is + /// used in conjunction with Split instructions to implement multi-byte + /// character classes. + Bytes(InstBytes), +} + +impl Inst { + /// Returns true if and only if this is a match instruction. + pub fn is_match(&self) -> bool { + match *self { + Inst::Match(_) => true, + _ => false, + } + } +} + +/// Representation of the Save instruction. +#[derive(Clone, Debug)] +pub struct InstSave { + /// The next location to execute in the program. + pub goto: InstPtr, + /// The capture slot (there are two slots for every capture in a regex, + /// including the zeroth capture for the entire match). + pub slot: usize, +} + +/// Representation of the Split instruction. +#[derive(Clone, Debug)] +pub struct InstSplit { + /// The first instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto1: InstPtr, + /// The second instruction to try. A match resulting from following goto1 + /// has precedence over a match resulting from following goto2. + pub goto2: InstPtr, +} + +/// Representation of the `EmptyLook` instruction. +#[derive(Clone, Debug)] +pub struct InstEmptyLook { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The type of zero-width assertion to check. + pub look: EmptyLook, +} + +/// The set of zero-width match instructions. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum EmptyLook { + /// Start of line or input. + StartLine, + /// End of line or input. + EndLine, + /// Start of input. + StartText, + /// End of input. + EndText, + /// Word character on one side and non-word character on other. + WordBoundary, + /// Word character on both sides or non-word character on both sides. + NotWordBoundary, + /// ASCII word boundary. + WordBoundaryAscii, + /// Not ASCII word boundary. + NotWordBoundaryAscii, +} + +/// Representation of the Char instruction. +#[derive(Clone, Debug)] +pub struct InstChar { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The character to test. + pub c: char, +} + +/// Representation of the Ranges instruction. +#[derive(Clone, Debug)] +pub struct InstRanges { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The set of Unicode scalar value ranges to test. + pub ranges: Box<[(char, char)]>, +} + +impl InstRanges { + /// Tests whether the given input character matches this instruction. + pub fn matches(&self, c: Char) -> bool { + // This speeds up the `match_class_unicode` benchmark by checking + // some common cases quickly without binary search. e.g., Matching + // a Unicode class on predominantly ASCII text. + for r in self.ranges.iter().take(4) { + if c < r.0 { + return false; + } + if c <= r.1 { + return true; + } + } + self.ranges + .binary_search_by(|r| { + if r.1 < c { + Ordering::Less + } else if r.0 > c { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .is_ok() + } + + /// Return the number of distinct characters represented by all of the + /// ranges. + pub fn num_chars(&self) -> usize { + self.ranges + .iter() + .map(|&(s, e)| 1 + (e as u32) - (s as u32)) + .sum::() as usize + } +} + +/// Representation of the Bytes instruction. +#[derive(Clone, Debug)] +pub struct InstBytes { + /// The next location to execute in the program if this instruction + /// succeeds. + pub goto: InstPtr, + /// The start (inclusive) of this byte range. + pub start: u8, + /// The end (inclusive) of this byte range. + pub end: u8, +} + +impl InstBytes { + /// Returns true if and only if the given byte is in this range. + pub fn matches(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +#[cfg(test)] +mod test { + #[test] + #[cfg(target_pointer_width = "64")] + fn test_size_of_inst() { + use std::mem::size_of; + + use super::Inst; + + assert_eq!(32, size_of::()); + } +} diff --git a/vendor/regex/src/re_builder.rs b/vendor/regex/src/re_builder.rs new file mode 100644 index 0000000000..ee6383690d --- /dev/null +++ b/vendor/regex/src/re_builder.rs @@ -0,0 +1,421 @@ +/// The set of user configurable options for compiling zero or more regexes. +#[derive(Clone, Debug)] +#[allow(missing_docs)] +pub struct RegexOptions { + pub pats: Vec, + pub size_limit: usize, + pub dfa_size_limit: usize, + pub nest_limit: u32, + pub case_insensitive: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub octal: bool, +} + +impl Default for RegexOptions { + fn default() -> Self { + RegexOptions { + pats: vec![], + size_limit: 10 * (1 << 20), + dfa_size_limit: 2 * (1 << 20), + nest_limit: 250, + case_insensitive: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + octal: false, + } + } +} + +macro_rules! define_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use super::RegexOptions; + use crate::error::Error; + use crate::exec::ExecBuilder; + + use crate::$regex_mod::Regex; + + /// A configurable builder for a regular expression. + /// + /// A builder can be used to configure how the regex is built, for example, by + /// setting the default flags (which can be overridden in the expression + /// itself) or setting various limits. + #[derive(Debug)] + pub struct RegexBuilder(RegexOptions); + + impl RegexBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new(pattern: &str) -> RegexBuilder { + let mut builder = RegexBuilder(RegexOptions::default()); + builder.0.pats.push(pattern.to_owned()); + builder + } + + /// Consume the builder and compile the regular expression. + /// + /// Note that calling `as_str` on the resulting `Regex` will produce the + /// pattern given to `new` verbatim. Notably, it will not incorporate any + /// of the flags set on this builder. + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(Regex::from) + } + + /// Set the value for the case insensitive (`i`) flag. + /// + /// When enabled, letters in the pattern will match both upper case and + /// lower case variants. + pub fn case_insensitive( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + /// + /// When enabled, `^` matches the beginning of lines and `$` matches the + /// end of lines. + /// + /// By default, they match beginning/end of the input. + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" when Unicode is disabled and + /// means "any valid UTF-8 encoding of any Unicode scalar value" when + /// Unicode is enabled. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + /// + /// When enabled, a pattern like `a*` is lazy (tries to find shortest + /// match) and `a*?` is greedy (tries to find longest match). + /// + /// By default, `a*` is greedy and `a*?` is lazy. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + /// + /// When enabled, whitespace such as new lines and spaces will be ignored + /// between expressions of the pattern, and `#` can be used to start a + /// comment until the next new line. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + /// + /// Enabled by default. When disabled, character classes such as `\w` only + /// match ASCII word characters instead of all Unicode word characters. + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit( + &mut self, + limit: usize, + ) -> &mut RegexBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + limit: usize, + ) -> &mut RegexBuilder { + self.0.dfa_size_limit = limit; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.0.nest_limit = limit; + self + } + } + } + }; +} + +define_builder!(bytes, re_bytes, false); +define_builder!(unicode, re_unicode, true); + +macro_rules! define_set_builder { + ($name:ident, $regex_mod:ident, $only_utf8:expr) => { + pub mod $name { + use super::RegexOptions; + use crate::error::Error; + use crate::exec::ExecBuilder; + + use crate::re_set::$regex_mod::RegexSet; + + /// A configurable builder for a set of regular expressions. + /// + /// A builder can be used to configure how the regexes are built, for example, + /// by setting the default flags (which can be overridden in the expression + /// itself) or setting various limits. + #[derive(Debug)] + pub struct RegexSetBuilder(RegexOptions); + + impl RegexSetBuilder { + /// Create a new regular expression builder with the given pattern. + /// + /// If the pattern is invalid, then an error will be returned when + /// `build` is called. + pub fn new(patterns: I) -> RegexSetBuilder + where + S: AsRef, + I: IntoIterator, + { + let mut builder = RegexSetBuilder(RegexOptions::default()); + for pat in patterns { + builder.0.pats.push(pat.as_ref().to_owned()); + } + builder + } + + /// Consume the builder and compile the regular expressions into a set. + pub fn build(&self) -> Result { + ExecBuilder::new_options(self.0.clone()) + .only_utf8($only_utf8) + .build() + .map(RegexSet::from) + } + + /// Set the value for the case insensitive (`i`) flag. + pub fn case_insensitive( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.case_insensitive = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + pub fn multi_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet` + /// expressions and means "any Unicode scalar value" for `regex::RegexSet` + /// expressions. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + pub fn swap_greed( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.0.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit( + &mut self, + limit: usize, + ) -> &mut RegexSetBuilder { + self.0.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + limit: usize, + ) -> &mut RegexSetBuilder { + self.0.dfa_size_limit = limit; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// length of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit( + &mut self, + limit: u32, + ) -> &mut RegexSetBuilder { + self.0.nest_limit = limit; + self + } + } + } + }; +} + +define_set_builder!(set_bytes, bytes, false); +define_set_builder!(set_unicode, unicode, true); diff --git a/vendor/regex/src/re_bytes.rs b/vendor/regex/src/re_bytes.rs new file mode 100644 index 0000000000..07e9f98acc --- /dev/null +++ b/vendor/regex/src/re_bytes.rs @@ -0,0 +1,1260 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt; +use std::iter::FusedIterator; +use std::ops::{Index, Range}; +use std::str::FromStr; +use std::sync::Arc; + +use crate::find_byte::find_byte; + +use crate::error::Error; +use crate::exec::{Exec, ExecNoSync}; +use crate::expand::expand_bytes; +use crate::re_builder::bytes::RegexBuilder; +use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t [u8], + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range { + self.start..self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_bytes(&self) -> &'t [u8] { + &self.text[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { + Match { text: haystack, start, end } + } +} + +impl<'t> From> for Range { + fn from(m: Match<'t>) -> Range { + m.range() + } +} + +/// A compiled regular expression for matching arbitrary bytes. +/// +/// It can be used to search, split or replace text. All searching is done with +/// an implicit `.*?` at the beginning and end of an expression. To force an +/// expression to match the whole string (or a prefix or a suffix), you must +/// use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// Like the `Regex` type in the parent module, matches with this regex return +/// byte offsets into the search text. **Unlike** the parent `Regex` type, +/// these byte offsets may not correspond to UTF-8 sequence boundaries since +/// the regexes in this module can match arbitrary bytes. +#[derive(Clone)] +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// A constructor for Regex from an Exec. +/// +/// This is hidden because Exec isn't actually part of the public API. +#[doc(hidden)] +impl From for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex in the + /// string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 ASCII word + /// bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &[u8]) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// ASCII word bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); + /// # } + /// ``` + pub fn find<'t>(&self, text: &'t [u8]) -> Option> { + self.find_at(text, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 ASCII + /// word bytes: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"Retroactively relinquishing remunerations is reprehensible."; + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); + /// } + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { + Matches(self.0.searcher().find_iter(text)) + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]); + /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], b"Citizen Kane"); + /// assert_eq!(&caps[2], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941"); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. + /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # use std::str; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// let title = str::from_utf8(&caps["title"]).unwrap(); + /// let year = str::from_utf8(&caps["year"]).unwrap(); + /// println!("Movie: {:?}, Released: {:?}", title, year); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t [u8], + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher().captures_iter(text)) + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect(); + /// assert_eq!(fields, vec![ + /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], + /// ]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t [u8], + limit: usize, + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit } + } + + /// Replaces the leftmost-first match with the replacement provided. The + /// replacement can be a regular byte string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced byte string. + /// + /// If no match is found, then a copy of the byte string is returned + /// unchanged. + /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal byte string: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, a + /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the + /// captures corresponding to a match. This allows one to access capturing + /// group matches easily: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # use regex::bytes::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { + /// let mut replacement = caps[2].to_owned(); + /// replacement.push(b' '); + /// replacement.extend(&caps[1]); + /// replacement + /// }); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]); + /// assert_eq!(result, &b"deep_fried"[..]); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// use regex::bytes::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); + /// assert_eq!(result, &b"$2 $last"[..]); + /// # } + /// ``` + pub fn replace<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t [u8], + rep: R, + ) -> Cow<'t, [u8]> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement text. + pub fn replacen<'t, R: Replacer>( + &self, + text: &'t [u8], + limit: usize, + mut rep: R, + ) -> Cow<'t, [u8]> { + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + new.extend_from_slice(&text[last_match..m.start()]); + new.extend_from_slice(&rep); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.extend_from_slice(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = Vec::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.extend_from_slice(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.extend_from_slice(&text[last_match..]); + Cow::Owned(new) + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # use regex::bytes::Regex; + /// # fn main() { + /// let text = b"aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &[u8]) -> Option<usize> { + self.shortest_match_at(text, 0) + } + + /// Returns the same as shortest_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn shortest_match_at( + &self, + text: &[u8], + start: usize, + ) -> Option<usize> { + self.0.searcher().shortest_match_at(text, start) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { + self.0.searcher().is_match_at(text, start) + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher() + .find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalence to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as `captures_read`, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t [u8], + start: usize, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.0.capture_names().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.capture_names().len() + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher().locations()) + } +} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a tuple of integers corresponding to the start and end +/// of the match. The indices are byte offsets. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. +#[derive(Debug)] +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>); + +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; + + fn next(&mut self) -> Option<Match<'t>> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) + } +} + +impl<'r, 't> FusedIterator for Matches<'r, 't> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched byte string. +#[derive(Debug)] +pub struct CaptureMatches<'r, 't>( + re_trait::CaptureMatches<'t, ExecNoSync<'r>>, +); + +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) + } +} + +impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +#[derive(Debug)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the byte string being split. +#[derive(Debug)] +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitN<'r, 't> { + type Item = &'t [u8]; + + fn next(&mut self) -> Option<&'t [u8]> { + if self.n == 0 { + return None; + } + + self.n -= 1; + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.n)) + } +} + +impl<'r, 't> FusedIterator for SplitN<'r, 't> {} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> FusedIterator for CaptureNames<'r> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `read_captures`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// Captures represents a group of captured byte strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched byte string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t [u8], + locs: re_trait::Locations, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures(b"abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); + /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(text1, &b"123"[..]); + /// assert_eq!(text2, &b""[..]); + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'t>> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option<Match<'t>> { + self.named_groups.get(name).and_then(|&i| self.get(i)) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, it: self.locs.iter() } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of valid UTF-8 bytes is permitted. If the + /// sequence does not refer to a capture group name in the corresponding + /// regex, then it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { + expand_bytes(self, replacement, dst) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. + #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'t> fmt::Debug for Captures<'t> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 't>(&'c Captures<'t>); + +impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn escape_bytes(bytes: &[u8]) -> String { + let mut s = String::new(); + for &b in bytes { + s.push_str(&escape_byte(b)); + } + s + } + + fn escape_byte(byte: u8) -> String { + use std::ascii::escape_default; + + let escaped: Vec<u8> = escape_default(byte).collect(); + String::from_utf8_lossy(&escaped).into_owned() + } + + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. + let slot_to_name: HashMap<&usize, &String> = + self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for (slot, m) in self.0.locs.iter().enumerate() { + let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = [u8]; + + fn index(&self, i: usize) -> &[u8] { + self.get(i) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = [u8]; + + fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { + self.name(name) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 't> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, +} + +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option<Match<'t>>; + + fn next(&mut self) -> Option<Option<Match<'t>>> { + self.it + .next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) + } +} + +impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} + +/// Replacer describes types that can be used to replace matches in a byte +/// string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&[u8]` along with other +/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend(&caps[0])`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>); + + /// Return a fixed unchanging replacement byte string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &[u8], + /// mut rep: R, + /// ) -> Vec<u8> { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + self.0.replace_append(caps, dst) + } + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a [u8] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl Replacer for Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { + let s = t.as_ref(); + match find_byte(b'$', s) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<[u8]>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal byte string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal byte string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t [u8]); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/vendor/regex/src/re_set.rs b/vendor/regex/src/re_set.rs new file mode 100644 index 0000000000..a6d886d761 --- /dev/null +++ b/vendor/regex/src/re_set.rs @@ -0,0 +1,507 @@ +macro_rules! define_set { + ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, + $(#[$doc_regexset_example:meta])* ) => { + pub mod $name { + use std::fmt; + use std::iter; + use std::slice; + use std::vec; + + use crate::error::Error; + use crate::exec::Exec; + use crate::re_builder::$builder_mod::RegexSetBuilder; + use crate::re_trait::RegularExpression; + +/// Match multiple (possibly overlapping) regular expressions in a single scan. +/// +/// A regex set corresponds to the union of two or more regular expressions. +/// That is, a regex set will match text where at least one of its +/// constituent regular expressions matches. A regex set as its formulated here +/// provides a touch more power: it will also report *which* regular +/// expressions in the set match. Indeed, this is the key difference between +/// regex sets and a single `Regex` with many alternates, since only one +/// alternate can match at a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a +/// regex set is constructed from those regexes, then searching the text +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the text. The key advantage of using a regex set is that it +/// will report the matching regexes using a *single pass through the text*. +/// If one has hundreds or thousands of regexes to match repeatedly (like a URL +/// router for a complex web application or a user agent matcher), then a regex +/// set can realize huge performance gains. +/// +/// # Example +/// +/// This shows how the above two regexes (for matching email addresses and +/// domains) might work: +/// +$(#[$doc_regexset_example])* +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```text +/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +/// +/// # Limitations +/// +/// Regex sets are limited to answering the following two questions: +/// +/// 1. Does any regex in the set match? +/// 2. If so, which regexes in the set match? +/// +/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) +/// instead of (2) since the matching engines can stop after the first match +/// is found. +/// +/// You cannot directly extract [`Match`][crate::Match] or +/// [`Captures`][crate::Captures] objects from a regex set. If you need these +/// operations, the recommended approach is to compile each pattern in the set +/// independently and scan the exact same input a second time with those +/// independently compiled patterns: +/// +/// ```rust +/// use regex::{Regex, RegexSet}; +/// +/// let patterns = ["foo", "bar"]; +/// // Both patterns will match different ranges of this string. +/// let text = "barfoo"; +/// +/// // Compile a set matching any of our patterns. +/// let set = RegexSet::new(&patterns).unwrap(); +/// // Compile each pattern independently. +/// let regexes: Vec<_> = set.patterns().iter() +/// .map(|pat| Regex::new(pat).unwrap()) +/// .collect(); +/// +/// // Match against the whole set first and identify the individual +/// // matching patterns. +/// let matches: Vec<&str> = set.matches(text).into_iter() +/// // Dereference the match index to get the corresponding +/// // compiled pattern. +/// .map(|match_idx| ®exes[match_idx]) +/// // To get match locations or any other info, we then have to search +/// // the exact same text again, using our separately-compiled pattern. +/// .map(|pat| pat.find(text).unwrap().as_str()) +/// .collect(); +/// +/// // Matches arrive in the order the constituent patterns were declared, +/// // not the order they appear in the input. +/// assert_eq!(vec!["foo", "bar"], matches); +/// ``` +/// +/// # Performance +/// +/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, +/// search takes `O(mn)` time, where `m` is proportional to the size of the +/// regex set and `n` is proportional to the length of the search text. +#[derive(Clone)] +pub struct RegexSet(Exec); + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// ``` + pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + where S: AsRef<str>, I: IntoIterator<Item=S> { + RegexSetBuilder::new(exprs).build() + } + + /// Create a new empty regex set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::empty(); + /// assert!(set.is_empty()); + /// ``` + pub fn empty() -> RegexSet { + RegexSetBuilder::new(&[""; 0]).build().unwrap() + } + + /// Returns true if and only if one of the regexes in this set matches + /// the text given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests whether a set matches some text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// assert!(!set.is_match("☃")); + /// ``` + pub fn is_match(&self, text: $text_ty) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + #[doc(hidden)] + pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { + self.0.searcher().is_match_at($as_bytes(text), start) + } + + /// Returns the set of regular expressions that match in the given text. + /// + /// The set returned contains the index of each regular expression that + /// matches in the given text. The index is in correspondence with the + /// order of regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. + /// + /// Note that as with searches using `Regex`, the expression is unanchored + /// by default. That is, if the regex does not start with `^` or `\A`, or + /// end with `$` or `\z`, then it is permitted to match anywhere in the + /// text. + /// + /// # Example + /// + /// Tests which regular expressions match the given text: + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches("foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + pub fn matches(&self, text: $text_ty) -> SetMatches { + let mut matches = vec![false; self.0.regex_strings().len()]; + let any = self.read_matches_at(&mut matches, text, 0); + SetMatches { + matched_any: any, + matches: matches, + } + } + + /// Returns the same as matches, but starts the search at the given + /// offset and stores the matches into the slice given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// `matches` must have a length that is at least the number of regexes + /// in this set. + /// + /// This method returns true if and only if at least one member of + /// `matches` is true after executing the set against `text`. + #[doc(hidden)] + pub fn read_matches_at( + &self, + matches: &mut [bool], + text: $text_ty, + start: usize, + ) -> bool { + self.0.searcher().many_matches_at(matches, $as_bytes(text), start) + } + + /// Returns the total number of regular expressions in this set. + pub fn len(&self) -> usize { + self.0.regex_strings().len() + } + + /// Returns `true` if this set contains no regular expressions. + pub fn is_empty(&self) -> bool { + self.0.regex_strings().is_empty() + } + + /// Returns the patterns that this set will match on. + /// + /// This function can be used to determine the pattern for a match. The + /// slice returned has exactly as many patterns givens to this regex set, + /// and the order of the slice is the same as the order of the patterns + /// provided to the set. + /// + /// # Example + /// + /// ```rust + /// # use regex::RegexSet; + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set + /// .matches("foobar") + /// .into_iter() + /// .map(|match_idx| &set.patterns()[match_idx]) + /// .collect(); + /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); + /// ``` + pub fn patterns(&self) -> &[String] { + self.0.regex_strings() + } +} + +/// A set of matches returned by a regex set. +#[derive(Clone, Debug)] +pub struct SetMatches { + matched_any: bool, + matches: Vec<bool>, +} + +impl SetMatches { + /// Whether this set contains any matches. + pub fn matched_any(&self) -> bool { + self.matched_any + } + + /// Whether the regex at the given index matched. + /// + /// The index for a regex is determined by its insertion order upon the + /// initial construction of a `RegexSet`, starting at `0`. + /// + /// # Panics + /// + /// If `regex_index` is greater than or equal to `self.len()`. + pub fn matched(&self, regex_index: usize) -> bool { + self.matches[regex_index] + } + + /// The total number of regexes in the set that created these matches. + pub fn len(&self) -> usize { + self.matches.len() + } + + /// Returns an iterator over indexes in the regex that matched. + /// + /// This will always produces matches in ascending order of index, where + /// the index corresponds to the index of the regex that matched with + /// respect to its position when initially building the set. + pub fn iter(&self) -> SetMatchesIter<'_> { + SetMatchesIter((&*self.matches).into_iter().enumerate()) + } +} + +impl IntoIterator for SetMatches { + type IntoIter = SetMatchesIntoIter; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + SetMatchesIntoIter(self.matches.into_iter().enumerate()) + } +} + +impl<'a> IntoIterator for &'a SetMatches { + type IntoIter = SetMatchesIter<'a>; + type Item = usize; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// An owned iterator over the set of matches from a regex set. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Debug)] +pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); + +impl Iterator for SetMatchesIntoIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl DoubleEndedIterator for SetMatchesIntoIter { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, false)) => {} + Some((i, true)) => return Some(i), + } + } + } +} + +impl iter::FusedIterator for SetMatchesIntoIter {} + +/// A borrowed iterator over the set of matches from a regex set. +/// +/// The lifetime `'a` refers to the lifetime of a `SetMatches` value. +/// +/// This will always produces matches in ascending order of index, where the +/// index corresponds to the index of the regex that matched with respect to +/// its position when initially building the set. +#[derive(Clone, Debug)] +pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); + +impl<'a> Iterator for SetMatchesIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + match self.0.next() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { + fn next_back(&mut self) -> Option<usize> { + loop { + match self.0.next_back() { + None => return None, + Some((_, &false)) => {} + Some((i, &true)) => return Some(i), + } + } + } +} + +impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} + +#[doc(hidden)] +impl From<Exec> for RegexSet { + fn from(exec: Exec) -> Self { + RegexSet(exec) + } +} + +impl fmt::Debug for RegexSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "RegexSet({:?})", self.0.regex_strings()) + } +} + +#[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } +#[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } + } + } +} + +define_set! { + unicode, + set_unicode, + &str, + as_bytes_str, +/// ```rust +/// # use regex::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match("foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches("example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} + +define_set! { + bytes, + set_bytes, + &[u8], + as_bytes_bytes, +/// ```rust +/// # use regex::bytes::RegexSet; +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match(b"foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with text that only matches one of the regexes. +/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with text that doesn't match any regex in the set. +/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +} diff --git a/vendor/regex/src/re_trait.rs b/vendor/regex/src/re_trait.rs new file mode 100644 index 0000000000..d0c717df5a --- /dev/null +++ b/vendor/regex/src/re_trait.rs @@ -0,0 +1,294 @@ +use std::fmt; +use std::iter::FusedIterator; + +/// Slot is a single saved capture location. Note that there are two slots for +/// every capture in a regular expression (one slot each for the start and end +/// of the capture). +pub type Slot = Option<usize>; + +/// Locations represents the offsets of each capturing group in a regex for +/// a single match. +/// +/// Unlike `Captures`, a `Locations` value only stores offsets. +#[doc(hidden)] +#[derive(Clone, Debug)] +pub struct Locations(Vec<Slot>); + +impl Locations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + let (s, e) = (i * 2, i * 2 + 1); + match (self.0.get(s), self.0.get(e)) { + (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), + _ => None, + } + } + + /// Creates an iterator of all the capture group positions in order of + /// appearance in the regular expression. Positions are byte indices + /// in terms of the original string matched. + pub fn iter(&self) -> SubCapturesPosIter<'_> { + SubCapturesPosIter { idx: 0, locs: self } + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + pub fn len(&self) -> usize { + self.0.len() / 2 + } + + /// Return the individual slots as a slice. + pub(crate) fn as_slots(&mut self) -> &mut [Slot] { + &mut self.0 + } +} + +/// An iterator over capture group positions for a particular match of a +/// regular expression. +/// +/// Positions are byte indices in terms of the original string matched. +/// +/// `'c` is the lifetime of the captures. +#[derive(Clone, Debug)] +pub struct SubCapturesPosIter<'c> { + idx: usize, + locs: &'c Locations, +} + +impl<'c> Iterator for SubCapturesPosIter<'c> { + type Item = Option<(usize, usize)>; + + fn next(&mut self) -> Option<Option<(usize, usize)>> { + if self.idx >= self.locs.len() { + return None; + } + let x = match self.locs.pos(self.idx) { + None => Some(None), + Some((s, e)) => Some(Some((s, e))), + }; + self.idx += 1; + x + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let len = self.locs.len() - self.idx; + (len, Some(len)) + } + + fn count(self) -> usize { + self.len() + } +} + +impl<'c> ExactSizeIterator for SubCapturesPosIter<'c> {} + +impl<'c> FusedIterator for SubCapturesPosIter<'c> {} + +/// `RegularExpression` describes types that can implement regex searching. +/// +/// This trait is my attempt at reducing code duplication and to standardize +/// the internal API. Specific duplication that is avoided are the `find` +/// and `capture` iterators, which are slightly tricky. +/// +/// It's not clear whether this trait is worth it, and it also isn't +/// clear whether it's useful as a public trait or not. Methods like +/// `next_after_empty` reak of bad design, but the rest of the methods seem +/// somewhat reasonable. One particular thing this trait would expose would be +/// the ability to start the search of a regex anywhere in a haystack, which +/// isn't possible in the current public API. +pub trait RegularExpression: Sized + fmt::Debug { + /// The type of the haystack. + type Text: ?Sized + fmt::Debug; + + /// The number of capture slots in the compiled regular expression. This is + /// always two times the number of capture groups (two slots per group). + fn slots_len(&self) -> usize; + + /// Allocates fresh space for all capturing groups in this regex. + fn locations(&self) -> Locations { + Locations(vec![None; self.slots_len()]) + } + + /// Returns the position of the next character after `i`. + /// + /// For example, a haystack with type `&[u8]` probably returns `i+1`, + /// whereas a haystack with type `&str` probably returns `i` plus the + /// length of the next UTF-8 sequence. + fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; + + /// Returns the location of the shortest match. + fn shortest_match_at( + &self, + text: &Self::Text, + start: usize, + ) -> Option<usize>; + + /// Returns whether the regex matches the text given. + fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; + + /// Returns the leftmost-first match location if one exists. + fn find_at( + &self, + text: &Self::Text, + start: usize, + ) -> Option<(usize, usize)>; + + /// Returns the leftmost-first match location if one exists, and also + /// fills in any matching capture slot locations. + fn captures_read_at( + &self, + locs: &mut Locations, + text: &Self::Text, + start: usize, + ) -> Option<(usize, usize)>; + + /// Returns an iterator over all non-overlapping successive leftmost-first + /// matches. + fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { + Matches { re: self, text, last_end: 0, last_match: None } + } + + /// Returns an iterator over all non-overlapping successive leftmost-first + /// matches with captures. + fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { + CaptureMatches(self.find_iter(text)) + } +} + +/// An iterator over all non-overlapping successive leftmost-first matches. +#[derive(Debug)] +pub struct Matches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + re: R, + text: &'t R::Text, + last_end: usize, + last_match: Option<usize>, +} + +impl<'t, R> Matches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + /// Return the text being searched. + pub fn text(&self) -> &'t R::Text { + self.text + } + + /// Return the underlying regex. + pub fn regex(&self) -> &R { + &self.re + } +} + +impl<'t, R> Iterator for Matches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ + type Item = (usize, usize); + + fn next(&mut self) -> Option<(usize, usize)> { + if self.last_end > self.text.as_ref().len() { + return None; + } + let (s, e) = match self.re.find_at(self.text, self.last_end) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = self.re.next_after_empty(self.text, e); + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(e) == self.last_match { + return self.next(); + } + } else { + self.last_end = e; + } + self.last_match = Some(e); + Some((s, e)) + } +} + +impl<'t, R> FusedIterator for Matches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ +} + +/// An iterator over all non-overlapping successive leftmost-first matches with +/// captures. +#[derive(Debug)] +pub struct CaptureMatches<'t, R>(Matches<'t, R>) +where + R: RegularExpression, + R::Text: 't; + +impl<'t, R> CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't, +{ + /// Return the text being searched. + pub fn text(&self) -> &'t R::Text { + self.0.text() + } + + /// Return the underlying regex. + pub fn regex(&self) -> &R { + self.0.regex() + } +} + +impl<'t, R> Iterator for CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ + type Item = Locations; + + fn next(&mut self) -> Option<Locations> { + if self.0.last_end > self.0.text.as_ref().len() { + return None; + } + let mut locs = self.0.re.locations(); + let (s, e) = match self.0.re.captures_read_at( + &mut locs, + self.0.text, + self.0.last_end, + ) { + None => return None, + Some((s, e)) => (s, e), + }; + if s == e { + self.0.last_end = self.0.re.next_after_empty(self.0.text, e); + if Some(e) == self.0.last_match { + return self.next(); + } + } else { + self.0.last_end = e; + } + self.0.last_match = Some(e); + Some(locs) + } +} + +impl<'t, R> FusedIterator for CaptureMatches<'t, R> +where + R: RegularExpression, + R::Text: 't + AsRef<[u8]>, +{ +} diff --git a/vendor/regex/src/re_unicode.rs b/vendor/regex/src/re_unicode.rs new file mode 100644 index 0000000000..197510ea0d --- /dev/null +++ b/vendor/regex/src/re_unicode.rs @@ -0,0 +1,1311 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt; +use std::iter::FusedIterator; +use std::ops::{Index, Range}; +use std::str::FromStr; +use std::sync::Arc; + +use crate::find_byte::find_byte; + +use crate::error::Error; +use crate::exec::{Exec, ExecNoSyncStr}; +use crate::expand::expand_str; +use crate::re_builder::unicode::RegexBuilder; +use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; + +/// Escapes all regular expression meta characters in `text`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. +pub fn escape(text: &str) -> String { + regex_syntax::escape(text) +} + +/// Match represents a single match of a regex in a haystack. +/// +/// The lifetime parameter `'t` refers to the lifetime of the matched text. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Match<'t> { + text: &'t str, + start: usize, + end: usize, +} + +impl<'t> Match<'t> { + /// Returns the starting byte offset of the match in the haystack. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the haystack. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> Range<usize> { + self.start..self.end + } + + /// Returns the matched text. + #[inline] + pub fn as_str(&self) -> &'t str { + &self.text[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { + Match { text: haystack, start, end } + } +} + +impl<'t> From<Match<'t>> for &'t str { + fn from(m: Match<'t>) -> &'t str { + m.as_str() + } +} + +impl<'t> From<Match<'t>> for Range<usize> { + fn from(m: Match<'t>) -> Range<usize> { + m.range() + } +} + +/// A compiled regular expression for matching Unicode strings. +/// +/// It is represented as either a sequence of bytecode instructions (dynamic) +/// or as a specialized Rust function (native). It can be used to search, split +/// or replace text. All searching is done with an implicit `.*?` at the +/// beginning and end of an expression. To force an expression to match the +/// whole string (or a prefix or a suffix), you must use an anchor like `^` or +/// `$` (or `\A` and `\z`). +/// +/// While this crate will handle Unicode strings (whether in the regular +/// expression or in the search text), all positions returned are **byte +/// indices**. Every byte index is guaranteed to be at a Unicode code point +/// boundary. +/// +/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a +/// compiled regular expression and text to search, respectively. +/// +/// The only methods that allocate new strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// pointers into the string given. +/// +/// # Examples +/// +/// Find the location of a US phone number: +/// +/// ```rust +/// # use regex::Regex; +/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let mat = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!((mat.start(), mat.end()), (7, 19)); +/// ``` +/// +/// # Using the `std::str::pattern` methods with `Regex` +/// +/// > **Note**: This section requires that this crate is compiled with the +/// > `pattern` Cargo feature enabled, which **requires nightly Rust**. +/// +/// Since `Regex` implements `Pattern`, you can use regexes with methods +/// defined on `&str`. For example, `is_match`, `find`, `find_iter` +/// and `split` can be replaced with `str::contains`, `str::find`, +/// `str::match_indices` and `str::split`. +/// +/// Here are some examples: +/// +/// ```rust,ignore +/// # use regex::Regex; +/// let re = Regex::new(r"\d+").unwrap(); +/// let haystack = "a111b222c"; +/// +/// assert!(haystack.contains(&re)); +/// assert_eq!(haystack.find(&re), Some(1)); +/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(), +/// vec![(1, "111"), (5, "222")]); +/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); +/// ``` +#[derive(Clone)] +pub struct Regex(Exec); + +impl fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +#[doc(hidden)] +impl From<Exec> for Regex { + fn from(exec: Exec) -> Regex { + Regex(exec) + } +} + +impl FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result<Regex, Error> { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace text in a string. + /// + /// If an invalid expression is given, then an error is returned. + pub fn new(re: &str) -> Result<Regex, Error> { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex in the + /// string given. + /// + /// It is recommended to use this method if all you need to do is test + /// a match, since the underlying matching engine may be able to do less + /// work. + /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 + /// Unicode word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); + /// # } + /// ``` + pub fn is_match(&self, text: &str) -> bool { + self.is_match_at(text, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `text`. If no match exists, then `None` is returned. + /// + /// Note that this should only be used if you want to discover the position + /// of the match. Testing the existence of a match is faster if you use + /// `is_match`. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// Unicode word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); + /// assert_eq!(mat.start(), 2); + /// assert_eq!(mat.end(), 15); + /// # } + /// ``` + pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> { + self.find_at(text, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `text`, returning the start and end byte indices with respect to + /// `text`. + /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 Unicode + /// word characters: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "Retroactively relinquishing remunerations is reprehensible."; + /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { + /// println!("{:?}", mat); + /// } + /// # } + /// ``` + pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { + Matches(self.0.searcher_str().find_iter(text)) + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `text`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// You should only use `captures` if you need access to the location of + /// capturing group matches. Otherwise, `find` is faster for discovering + /// the location of the overall match. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], "Citizen Kane"); + /// assert_eq!(&caps[2], "1941"); + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// # } + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text).unwrap(); + /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// + /// # } + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `text`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. + /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") + /// .unwrap(); + /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for caps in re.captures_iter(text) { + /// println!("Movie: {:?}, Released: {:?}", + /// &caps["title"], &caps["year"]); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # } + /// ``` + pub fn captures_iter<'r, 't>( + &'r self, + text: &'t str, + ) -> CaptureMatches<'r, 't> { + CaptureMatches(self.0.searcher_str().captures_iter(text)) + } + + /// Returns an iterator of substrings of `text` delimited by a match of the + /// regular expression. Namely, each element of the iterator corresponds to + /// text that *isn't* matched by the regular expression. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); + /// # } + /// ``` + pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { + Split { finder: self.find_iter(text), last: 0 } + } + + /// Returns an iterator of at most `limit` substrings of `text` delimited + /// by a match of the regular expression. (A `limit` of `0` will return no + /// substrings.) Namely, each element of the iterator corresponds to text + /// that *isn't* matched by the regular expression. The remainder of the + /// string that is not split will be the last element in the iterator. + /// + /// This method will *not* copy the text given. + /// + /// # Example + /// + /// Get the first two words in some text: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"\W+").unwrap(); + /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); + /// assert_eq!(fields, vec!("Hey", "How", "are you?")); + /// # } + /// ``` + pub fn splitn<'r, 't>( + &'r self, + text: &'t str, + limit: usize, + ) -> SplitN<'r, 't> { + SplitN { splits: self.split(text), n: limit } + } + + /// Replaces the leftmost-first match with the replacement provided. + /// The replacement can be a regular string (where `$N` and `$name` are + /// expanded to match capture groups) or a function that takes the matches' + /// `Captures` and returns the replaced string. + /// + /// If no match is found, then a copy of the string is returned unchanged. + /// + /// # Replacement string syntax + /// + /// All instances of `$name` in the replacement text is replaced with the + /// corresponding capture group `name`. + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert more + /// precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Examples + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal string: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new("[^01]+").unwrap(); + /// assert_eq!(re.replace("1078910", ""), "1010"); + /// # } + /// ``` + /// + /// But anything satisfying the `Replacer` trait will work. For example, + /// a closure of type `|&Captures| -> String` provides direct access to the + /// captures corresponding to a match. This allows one to access + /// capturing group matches easily: + /// + /// ```rust + /// # use regex::Regex; + /// # use regex::Captures; fn main() { + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { + /// format!("{} {}", &caps[2], &caps[1]) + /// }); + /// assert_eq!(result, "Bruce Springsteen"); + /// # } + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported that expands `$name` into the corresponding capture + /// group. Here's the last example, but using this expansion technique + /// with named capture groups: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", "$first $last"); + /// assert_eq!(result, "Bruce Springsteen"); + /// # } + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace("deep fried", "${first}_$second"); + /// assert_eq!(result, "deep_fried"); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. This can be done by wrapping a + /// byte string with `NoExpand`: + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// use regex::NoExpand; + /// + /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); + /// assert_eq!(result, "$2 $last"); + /// # } + /// ``` + pub fn replace<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { + self.replacen(text, 1, rep) + } + + /// Replaces all non-overlapping matches in `text` with the replacement + /// provided. This is the same as calling `replacen` with `limit` set to + /// `0`. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement string. + pub fn replace_all<'t, R: Replacer>( + &self, + text: &'t str, + rep: R, + ) -> Cow<'t, str> { + self.replacen(text, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in `text` with the + /// replacement provided. If `limit` is 0, then all non-overlapping matches + /// are replaced. + /// + /// See the documentation for `replace` for details on how to access + /// capturing group matches in the replacement string. + pub fn replacen<'t, R: Replacer>( + &self, + text: &'t str, + limit: usize, + mut rep: R, + ) -> Cow<'t, str> { + // If we know that the replacement doesn't have any capture expansions, + // then we can use the fast path. The fast path can make a tremendous + // difference: + // + // 1) We use `find_iter` instead of `captures_iter`. Not asking for + // captures generally makes the regex engines faster. + // 2) We don't need to look up all of the capture groups and do + // replacements inside the replacement string. We just push it + // at each match and be done with it. + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for (i, m) in it { + new.push_str(&text[last_match..m.start()]); + new.push_str(&rep); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.push_str(&text[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. + let mut it = self.captures_iter(text).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(text); + } + let mut new = String::with_capacity(text.len()); + let mut last_match = 0; + for (i, cap) in it { + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.push_str(&text[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.push_str(&text[last_match..]); + Cow::Owned(new) + } +} + +/// Advanced or "lower level" search methods. +impl Regex { + /// Returns the end location of a match in the text given. + /// + /// This method may have the same performance characteristics as + /// `is_match`, except it provides an end location for a match. In + /// particular, the location returned *may be shorter* than the proper end + /// of the leftmost-first match. + /// + /// # Example + /// + /// Typically, `a+` would match the entire first sequence of `a` in some + /// text, but `shortest_match` can give up as soon as it sees the first + /// `a`. + /// + /// ```rust + /// # use regex::Regex; + /// # fn main() { + /// let text = "aaaaa"; + /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); + /// assert_eq!(pos, Some(1)); + /// # } + /// ``` + pub fn shortest_match(&self, text: &str) -> Option<usize> { + self.shortest_match_at(text, 0) + } + + /// Returns the same as shortest_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn shortest_match_at( + &self, + text: &str, + start: usize, + ) -> Option<usize> { + self.0.searcher_str().shortest_match_at(text, start) + } + + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.searcher_str().is_match_at(text, start) + } + + /// Returns the same as find, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn find_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher_str() + .find_at(text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// This is like `captures`, but uses + /// [`CaptureLocations`](struct.CaptureLocations.html) + /// instead of + /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// + /// To create a `CaptureLocations` value, use the + /// `Regex::capture_locations` method. + /// + /// This returns the overall match if this was successful, which is always + /// equivalence to the `0`th capture group. + pub fn captures_read<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, 0) + } + + /// Returns the same as captures, but starts the search at the given + /// offset and populates the capture locations given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_read_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.0 + .searcher_str() + .captures_read_at(&mut locs.0, text, start) + .map(|(s, e)| Match::new(text, s, e)) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + pub fn read_captures_at<'t>( + &self, + locs: &mut CaptureLocations, + text: &'t str, + start: usize, + ) -> Option<Match<'t>> { + self.captures_read_at(locs, text, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + pub fn as_str(&self) -> &str { + &self.0.regex_strings()[0] + } + + /// Returns an iterator over the capture names. + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.0.capture_names().iter()) + } + + /// Returns the number of captures. + pub fn captures_len(&self) -> usize { + self.0.capture_names().len() + } + + /// Returns an empty set of capture locations that can be reused in + /// multiple calls to `captures_read` or `captures_read_at`. + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + pub fn locations(&self) -> CaptureLocations { + CaptureLocations(self.0.searcher_str().locations()) + } +} + +/// An iterator over the names of all possible captures. +/// +/// `None` indicates an unnamed capture; the first element (capture 0, the +/// whole matched region) is always unnamed. +/// +/// `'r` is the lifetime of the compiled regular expression. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + fn next(&mut self) -> Option<Option<&'r str>> { + self.0 + .next() + .as_ref() + .map(|slot| slot.as_ref().map(|name| name.as_ref())) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> FusedIterator for CaptureNames<'r> {} + +/// Yields all substrings delimited by a regular expression match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. +#[derive(Debug)] +pub struct Split<'r, 't> { + finder: Matches<'r, 't>, + last: usize, +} + +impl<'r, 't> Iterator for Split<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + let text = self.finder.0.text(); + match self.finder.next() { + None => { + if self.last > text.len() { + None + } else { + let s = &text[self.last..]; + self.last = text.len() + 1; // Next call will return None + Some(s) + } + } + Some(m) => { + let matched = &text[self.last..m.start()]; + self.last = m.end(); + Some(matched) + } + } + } +} + +impl<'r, 't> FusedIterator for Split<'r, 't> {} + +/// Yields at most `N` substrings delimited by a regular expression match. +/// +/// The last substring will be whatever remains after splitting. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the string being split. +#[derive(Debug)] +pub struct SplitN<'r, 't> { + splits: Split<'r, 't>, + n: usize, +} + +impl<'r, 't> Iterator for SplitN<'r, 't> { + type Item = &'t str; + + fn next(&mut self) -> Option<&'t str> { + if self.n == 0 { + return None; + } + + self.n -= 1; + if self.n > 0 { + return self.splits.next(); + } + + let text = self.splits.finder.0.text(); + if self.splits.last > text.len() { + // We've already returned all substrings. + None + } else { + // self.n == 0, so future calls will return None immediately + Some(&text[self.splits.last..]) + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + (0, Some(self.n)) + } +} + +impl<'r, 't> FusedIterator for SplitN<'r, 't> {} + +/// CaptureLocations is a low level representation of the raw offsets of each +/// submatch. +/// +/// You can think of this as a lower level +/// [`Captures`](struct.Captures.html), where this type does not support +/// named capturing groups directly and it does not borrow the text that these +/// offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs +/// such as `read_captures`, which permits amortizing the allocation in which +/// capture match locations are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// `capture_locations` method on the `Regex` being used to execute the search. +/// The value returned can then be reused in subsequent searches. +#[derive(Clone, Debug)] +pub struct CaptureLocations(re_trait::Locations); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end positions of the Nth capture group. Returns + /// `None` if `i` is not a valid capture group or if the capture group did + /// not match anything. The positions returned are *always* byte indices + /// with respect to the original string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.pos(i) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// Captures represents a group of captured strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched string is *also* available via the `name` +/// method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'t` is the lifetime of the matched text. +pub struct Captures<'t> { + text: &'t str, + locs: re_trait::Locations, + named_groups: Arc<HashMap<String, usize>>, +} + +impl<'t> Captures<'t> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or("", |m| m.as_str()); + /// let text2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(text1, "123"); + /// assert_eq!(text2, ""); + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'t>> { + self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option<Match<'t>> { + self.named_groups.get(name).and_then(|&i| self.get(i)) + } + + /// An iterator that yields all capturing matches in the order in which + /// they appear in the regex. If a particular capture group didn't + /// participate in the match, then `None` is yielded for that capture. + /// + /// The first match always corresponds to the overall match of the regex. + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { + SubCaptureMatches { caps: self, it: self.locs.iter() } + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// `name` may be an integer corresponding to the index of the capture + /// group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// If `name` isn't a valid capture group (whether the name doesn't exist + /// or isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name consisting of the characters `[_0-9A-Za-z]` + /// is used. e.g., `$1a` looks up the capture group named `1a` and not the + /// capture group at index `1`. To exert more precise control over the + /// name, or to refer to a capture group name that uses characters outside + /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When + /// using braces, any sequence of characters is permitted. If the sequence + /// does not refer to a capture group name in the corresponding regex, then + /// it is replaced with an empty string. + /// + /// To write a literal `$` use `$$`. + pub fn expand(&self, replacement: &str, dst: &mut String) { + expand_str(self, replacement, dst) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. + #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'t> fmt::Debug for Captures<'t> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 't>(&'c Captures<'t>); + +impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. + let slot_to_name: HashMap<&usize, &String> = + self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for (slot, m) in self.0.locs.iter().enumerate() { + let m = m.map(|(s, e)| &self.0.text[s..e]); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'t` is the lifetime of the matched text. +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'t> Index<usize> for Captures<'t> { + type Output = str; + + fn index(&self, i: usize) -> &str { + self.get(i) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'t` is the lifetime of the matched text and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. +/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'t, 'i> Index<&'i str> for Captures<'t> { + type Output = str; + + fn index<'a>(&'a self, name: &'i str) -> &'a str { + self.name(name) + .map(|m| m.as_str()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator that yields all capturing matches in the order in which they +/// appear in the regex. +/// +/// If a particular capture group didn't participate in the match, then `None` +/// is yielded for that capture. The first match always corresponds to the +/// overall match of the regex. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and +/// the lifetime `'t` corresponds to the originally matched text. +#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 't> { + caps: &'c Captures<'t>, + it: SubCapturesPosIter<'c>, +} + +impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { + type Item = Option<Match<'t>>; + + fn next(&mut self) -> Option<Option<Match<'t>>> { + self.it + .next() + .map(|cap| cap.map(|(s, e)| Match::new(self.caps.text, s, e))) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {} + +impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct CaptureMatches<'r, 't>( + re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>, +); + +impl<'r, 't> Iterator for CaptureMatches<'r, 't> { + type Item = Captures<'t>; + + fn next(&mut self) -> Option<Captures<'t>> { + self.0.next().map(|locs| Captures { + text: self.0.text(), + locs, + named_groups: self.0.regex().capture_name_idx().clone(), + }) + } +} + +impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} + +/// An iterator over all non-overlapping matches for a particular string. +/// +/// The iterator yields a `Match` value. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'t` is the +/// lifetime of the matched string. +#[derive(Debug)] +pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>); + +impl<'r, 't> Iterator for Matches<'r, 't> { + type Item = Match<'t>; + + fn next(&mut self) -> Option<Match<'t>> { + let text = self.0.text(); + self.0.next().map(|(s, e)| Match::new(text, s, e)) + } +} + +impl<'r, 't> FusedIterator for Matches<'r, 't> {} + +/// Replacer describes types that can be used to replace matches in a string. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` along with other +/// variants of string types and `FnMut(&Captures) -> String` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases. +pub trait Replacer { + /// Appends text to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.push_str(caps.get(0).unwrap().as_str())`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); + + /// Return a fixed unchanging replacement string. + /// + /// When doing replacements, if access to `Captures` is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to `replacen`. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { + None + } + + /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. + /// + /// # Example + /// + /// ``` + /// use regex::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &str, + /// mut rep: R, + /// ) -> String { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +/// By-reference adaptor for a `Replacer` +/// +/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.0.replace_append(caps, dst) + } + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + self.0.no_expansion() + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl Replacer for String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { + let s = t.as_ref(); + match find_byte(b'$', s.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<str>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + dst.push_str((*self)(caps).as_ref()); + } +} + +/// `NoExpand` indicates literal string replacement. +/// +/// It can be used with `replace` and `replace_all` to do a literal string +/// replacement without expanding `$name` to their corresponding capture +/// groups. This can be both convenient (to avoid escaping `$`, for example) +/// and performant (since capture groups don't need to be found). +/// +/// `'t` is the lifetime of the literal text. +#[derive(Clone, Debug)] +pub struct NoExpand<'t>(pub &'t str); + +impl<'t> Replacer for NoExpand<'t> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + Some(Cow::Borrowed(self.0)) + } +} diff --git a/vendor/regex/src/sparse.rs b/vendor/regex/src/sparse.rs new file mode 100644 index 0000000000..98b726613d --- /dev/null +++ b/vendor/regex/src/sparse.rs @@ -0,0 +1,84 @@ +use std::fmt; +use std::ops::Deref; +use std::slice; + +/// A sparse set used for representing ordered NFA states. +/// +/// This supports constant time addition and membership testing. Clearing an +/// entire set can also be done in constant time. Iteration yields elements +/// in the order in which they were inserted. +/// +/// The data structure is based on: https://research.swtch.com/sparse +/// Note though that we don't actually use uninitialized memory. We generally +/// reuse allocations, so the initial allocation cost is bareable. However, +/// its other properties listed above are extremely useful. +#[derive(Clone)] +pub struct SparseSet { + /// Dense contains the instruction pointers in the order in which they + /// were inserted. + dense: Vec<usize>, + /// Sparse maps instruction pointers to their location in dense. + /// + /// An instruction pointer is in the set if and only if + /// sparse[ip] < dense.len() && ip == dense[sparse[ip]]. + sparse: Box<[usize]>, +} + +impl SparseSet { + pub fn new(size: usize) -> SparseSet { + SparseSet { + dense: Vec::with_capacity(size), + sparse: vec![0; size].into_boxed_slice(), + } + } + + pub fn len(&self) -> usize { + self.dense.len() + } + + pub fn is_empty(&self) -> bool { + self.dense.is_empty() + } + + pub fn capacity(&self) -> usize { + self.dense.capacity() + } + + pub fn insert(&mut self, value: usize) { + let i = self.len(); + assert!(i < self.capacity()); + self.dense.push(value); + self.sparse[value] = i; + } + + pub fn contains(&self, value: usize) -> bool { + let i = self.sparse[value]; + self.dense.get(i) == Some(&value) + } + + pub fn clear(&mut self) { + self.dense.clear(); + } +} + +impl fmt::Debug for SparseSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "SparseSet({:?})", self.dense) + } +} + +impl Deref for SparseSet { + type Target = [usize]; + + fn deref(&self) -> &Self::Target { + &self.dense + } +} + +impl<'a> IntoIterator for &'a SparseSet { + type Item = &'a usize; + type IntoIter = slice::Iter<'a, usize>; + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} diff --git a/vendor/regex/src/testdata/LICENSE b/vendor/regex/src/testdata/LICENSE new file mode 100644 index 0000000000..f47dbf4c44 --- /dev/null +++ b/vendor/regex/src/testdata/LICENSE @@ -0,0 +1,19 @@ +The following license covers testregex.c and all associated test data. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do +so, subject to the following disclaimer: + +THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/regex/src/testdata/README b/vendor/regex/src/testdata/README new file mode 100644 index 0000000000..6efc2dad33 --- /dev/null +++ b/vendor/regex/src/testdata/README @@ -0,0 +1,17 @@ +Test data was taken from the Go distribution, which was in turn taken from the +testregex test suite: + + http://www2.research.att.com/~astopen/testregex/testregex.html + +The LICENSE in this directory corresponds to the LICENSE that the data was +released under. + +The tests themselves were modified for RE2/Go. A couple were modified further +by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. +(Yes, it seems like RE2/Go includes failing test cases.) This may or may not +have been a bad idea, but I think being consistent with an established Regex +library is worth something. + +Note that these files are read by 'scripts/regex-match-tests.py' and turned +into Rust tests found in 'regex_macros/tests/matches.rs'. + diff --git a/vendor/regex/src/testdata/basic.dat b/vendor/regex/src/testdata/basic.dat new file mode 100644 index 0000000000..632e1bb416 --- /dev/null +++ b/vendor/regex/src/testdata/basic.dat @@ -0,0 +1,221 @@ +NOTE all standard compliant implementations should pass these : 2002-05-31 + +BE abracadabra$ abracadabracadabra (7,18) +BE a...b abababbb (2,7) +BE XXXXXX ..XXXXXX (2,8) +E \) () (1,2) +BE a] a]a (0,2) +B } } (0,1) +E \} } (0,1) +BE \] ] (0,1) +B ] ] (0,1) +E ] ] (0,1) +B { { (0,1) +B } } (0,1) +BE ^a ax (0,1) +BE \^a a^a (1,3) +BE a\^ a^ (0,2) +BE a$ aa (1,2) +BE a\$ a$ (0,2) +BE ^$ NULL (0,0) +E $^ NULL (0,0) +E a($) aa (1,2)(2,2) +E a*(^a) aa (0,1)(0,1) +E (..)*(...)* a (0,0) +E (..)*(...)* abcd (0,4)(2,4) +E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) +E (ab)c|abc abc (0,3)(0,2) +E a{0}b ab (1,2) +E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) +E a{9876543210} NULL BADBR +E ((a|a)|a) a (0,1)(0,1)(0,1) +E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) +E a*(a.|aa) aaaa (0,4)(2,4) +E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) +E (a|b)?.* b (0,1)(0,1) +E (a|b)c|a(b|c) ac (0,2)(0,1) +E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) +E (a|b)*c|(a|ab)*c abc (0,3)(1,2) +E (a|b)*c|(a|ab)*c xc (1,2) +E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) +E a?(ab|ba)ab abab (0,4)(0,2) +E a?(ac{0}b|ba)ab abab (0,4)(0,2) +E ab|abab abbabab (0,2) +E aba|bab|bba baaabbbaba (5,8) +E aba|bab baaabbbaba (6,9) +E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) +E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) +E ab|a xabc (1,3) +E ab|a xxabc (2,4) +Ei (?-u)(Ab|cD)* aBcD (0,4)(2,4) +BE [^-] --a (2,3) +BE [a-]* --a (0,3) +BE [a-m-]* --amoma-- (0,4) +E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) +E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) +{E [[:upper:]] A (0,1) [[<element>]] not supported +E [[:lower:]]+ `az{ (1,3) +E [[:upper:]]+ @AZ[ (1,3) +# No collation in Go +#BE [[-]] [[-]] (2,4) +#BE [[.NIL.]] NULL ECOLLATE +#BE [[=aleph=]] NULL ECOLLATE +} +BE$ \n \n (0,1) +BEn$ \n \n (0,1) +BE$ [^a] \n (0,1) +BE$ \na \na (0,2) +E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) +BE xxx xxx (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) +E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) +E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) +E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) +E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) +E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) +E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) +BE$ .* \x01\x7f (0,2) +E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) +L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH +E a*a*a*a*a*b aaaaaaaaab (0,10) +BE ^ NULL (0,0) +BE $ NULL (0,0) +BE ^$ NULL (0,0) +BE ^a$ a (0,1) +BE abc abc (0,3) +BE abc xabcy (1,4) +BE abc ababc (2,5) +BE ab*c abc (0,3) +BE ab*bc abc (0,3) +BE ab*bc abbc (0,4) +BE ab*bc abbbbc (0,6) +E ab+bc abbc (0,4) +E ab+bc abbbbc (0,6) +E ab?bc abbc (0,4) +E ab?bc abc (0,3) +E ab?c abc (0,3) +BE ^abc$ abc (0,3) +BE ^abc abcc (0,3) +BE abc$ aabc (1,4) +BE ^ abc (0,0) +BE $ abc (3,3) +BE a.c abc (0,3) +BE a.c axc (0,3) +BE a.*c axyzc (0,5) +BE a[bc]d abd (0,3) +BE a[b-d]e ace (0,3) +BE a[b-d] aac (1,3) +BE a[-b] a- (0,2) +BE a[b-] a- (0,2) +BE a] a] (0,2) +BE a[]]b a]b (0,3) +BE a[^bc]d aed (0,3) +BE a[^-b]c adc (0,3) +BE a[^]b]c adc (0,3) +E ab|cd abc (0,2) +E ab|cd abcd (0,2) +E a\(b a(b (0,3) +E a\(*b ab (0,2) +E a\(*b a((b (0,4) +E ((a)) abc (0,1)(0,1)(0,1) +E (a)b(c) abc (0,3)(0,1)(2,3) +E a+b+c aabbabc (4,7) +E a* aaa (0,3) +#E (a*)* - (0,0)(0,0) +E (a*)* - (0,0)(?,?) RE2/Go +E (a*)+ - (0,0)(0,0) +#E (a*|b)* - (0,0)(0,0) +E (a*|b)* - (0,0)(?,?) RE2/Go +E (a+|b)* ab (0,2)(1,2) +E (a+|b)+ ab (0,2)(1,2) +E (a+|b)? ab (0,1)(0,1) +BE [^ab]* cde (0,3) +#E (^)* - (0,0)(0,0) +E (^)* - (0,0)(?,?) RE2/Go +BE a* NULL (0,0) +E ([abc])*d abbbcd (0,6)(4,5) +E ([abc])*bcd abcd (0,4)(0,1) +E a|b|c|d|e e (0,1) +E (a|b|c|d|e)f ef (0,2)(0,1) +#E ((a*|b))* - (0,0)(0,0)(0,0) +E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go +BE abcd*efg abcdefg (0,7) +BE ab* xabyabbbz (1,3) +BE ab* xayabbbz (1,2) +E (ab|cd)e abcde (2,5)(2,4) +BE [abhgefdc]ij hij (0,3) +E (a|b)c*d abcd (1,4)(1,2) +E (ab|ab*)bc abc (0,3)(0,1) +E a([bc]*)c* abc (0,3)(1,3) +E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) +E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) +E a[bcd]*dcdcde adcdcde (0,7) +E (ab|a)b*c abc (0,3)(0,2) +E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) +BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) +E ^a(bc+|b[eh])g|.h$ abh (1,3) +E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) +E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) +E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) +E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) +BE multiple words multiple words yeah (0,14) +E (.*)c(.*) abcde (0,5)(0,2)(3,5) +BE abcd abcd (0,4) +E a(bc)d abcd (0,4)(1,3) +E a[-]?c ac (0,3) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) +E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) +E a+(b|c)*d+ aabcdd (0,6)(3,4) +E ^.+$ vivi (0,4) +E ^(.+)$ vivi (0,4)(0,4) +E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) +E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) +E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) +E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) +E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) +E ((foo)|bar)!bas bar!bas (0,7)(0,3) +E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) +E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) +E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) +E (foo|(bar))!bas foo!bas (0,7)(0,3) +E (foo|bar)!bas bar!bas (0,7)(0,3) +E (foo|bar)!bas foo!bar!bas (4,11)(4,7) +E (foo|bar)!bas foo!bas (0,7)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) +E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) +E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) +E .*(/XXX).* /XXX (0,4)(0,4) +E .*(\\XXX).* \XXX (0,4)(0,4) +E \\XXX \XXX (0,4) +E .*(/000).* /000 (0,4)(0,4) +E .*(\\000).* \000 (0,4)(0,4) +E \\000 \000 (0,4) diff --git a/vendor/regex/src/testdata/nullsubexpr.dat b/vendor/regex/src/testdata/nullsubexpr.dat new file mode 100644 index 0000000000..2e18fbb917 --- /dev/null +++ b/vendor/regex/src/testdata/nullsubexpr.dat @@ -0,0 +1,79 @@ +NOTE null subexpression matches : 2002-06-06 + +E (a*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)* a (0,1)(0,1) +E SAME x (0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E (a+)+ a (0,1)(0,1) +E SAME x NOMATCH +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) + +E ([a]*)* a (0,1)(0,1) +#E SAME x (0,0)(0,0) +E SAME x (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([a]*)+ a (0,1)(0,1) +E SAME x (0,0)(0,0) +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaax (0,6)(0,6) +E ([^b]*)* a (0,1)(0,1) +#E SAME b (0,0)(0,0) +E SAME b (0,0)(?,?) RE2/Go +E SAME aaaaaa (0,6)(0,6) +E SAME aaaaaab (0,6)(0,6) +E ([ab]*)* a (0,1)(0,1) +E SAME aaaaaa (0,6)(0,6) +E SAME ababab (0,6)(0,6) +E SAME bababa (0,6)(0,6) +E SAME b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +E SAME aaaabcde (0,5)(0,5) +E ([^a]*)* b (0,1)(0,1) +E SAME bbbbbb (0,6)(0,6) +#E SAME aaaaaa (0,0)(0,0) +E SAME aaaaaa (0,0)(?,?) RE2/Go +E ([^ab]*)* ccccxx (0,6)(0,6) +#E SAME ababab (0,0)(0,0) +E SAME ababab (0,0)(?,?) RE2/Go + +E ((z)+|a)* zabcde (0,2)(1,2) + +#{E a+? aaaaaa (0,1) no *? +? mimimal match ops +#E (a) aaa (0,1)(0,1) +#E (a*?) aaa (0,0)(0,0) +#E (a)*? aaa (0,0) +#E (a*?)*? aaa (0,0) +#} + +B \(a*\)*\(x\) x (0,1)(0,0)(0,1) +B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) +B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) +B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) +B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) +B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) +B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) +B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) + +#E (a*)*(x) x (0,1)(0,0)(0,1) +E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go +E (a*)*(x) ax (0,2)(0,1)(1,2) +E (a*)*(x) axa (0,2)(0,1)(1,2) + +E (a*)+(x) x (0,1)(0,0)(0,1) +E (a*)+(x) ax (0,2)(0,1)(1,2) +E (a*)+(x) axa (0,2)(0,1)(1,2) + +E (a*){2}(x) x (0,1)(0,0)(0,1) +E (a*){2}(x) ax (0,2)(1,1)(1,2) +E (a*){2}(x) axa (0,2)(1,1)(1,2) diff --git a/vendor/regex/src/testdata/repetition.dat b/vendor/regex/src/testdata/repetition.dat new file mode 100644 index 0000000000..3bb2121180 --- /dev/null +++ b/vendor/regex/src/testdata/repetition.dat @@ -0,0 +1,163 @@ +NOTE implicit vs. explicit repetitions : 2009-02-02 + +# Glenn Fowler <gsf@research.att.com> +# conforming matches (column 4) must match one of the following BREs +# NOMATCH +# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* +# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* +# i.e., each 3-tuple has two identical elements and one (?,?) + +E ((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.)) NULL NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH + +E ((..)|(.)){1} NULL NOMATCH +E ((..)|(.)){2} NULL NOMATCH +E ((..)|(.)){3} NULL NOMATCH + +E ((..)|(.))* NULL (0,0) + +E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.))((..)|(.)) a NOMATCH +E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH + +E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) +E ((..)|(.)){2} a NOMATCH +E ((..)|(.)){3} a NOMATCH + +E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) + +E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) +E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH + +E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) +E ((..)|(.)){3} aa NOMATCH + +E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) + +E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) +E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) + +E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) +#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go +E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) + +#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) +E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go + +E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) + +E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) +E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go + +E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) + +E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) + +E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) +#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) +E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go + +E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) +E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) + +E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) +E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) +E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) + +E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) + +NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 + +# These test a bug in OS X / FreeBSD / NetBSD, and libtree. +# Linux/GLIBC gets the {8,} and {8,8} wrong. + +:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) +:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) +:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) +:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) +:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) +:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) +:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) +:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) +:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) +#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) +:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) +:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) +:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) +:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) +:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) +:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) +:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go +#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) +:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go +:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) + +# These test a fixed bug in my regex-tdfa that did not keep the expanded +# form properly grouped, so right association did the wrong thing with +# these ambiguous patterns (crafted just to test my code when I became +# suspicious of my implementation). The first subexpression should use +# "ab" then "a" then "bcd". + +# OS X / FreeBSD / NetBSD badly fail many of these, with impossible +# results like (0,6)(4,5)(6,6). + +:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) +:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH +:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) +:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH +:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) +:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) + +# The above worked on Linux/GLIBC but the following often fail. +# They also trip up OS X / FreeBSD / NetBSD: + +#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) +:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH +#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) +:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH +#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) +:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go +#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) +:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go diff --git a/vendor/regex/src/utf8.rs b/vendor/regex/src/utf8.rs new file mode 100644 index 0000000000..2dfd2c0d1d --- /dev/null +++ b/vendor/regex/src/utf8.rs @@ -0,0 +1,264 @@ +/// A few elementary UTF-8 encoding and decoding functions used by the matching +/// engines. +/// +/// In an ideal world, the matching engines operate on `&str` and we can just +/// lean on the standard library for all our UTF-8 needs. However, to support +/// byte based regexes (that can match on arbitrary bytes which may contain +/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. +/// The standard library doesn't really recognize this use case, so we have +/// to build it out ourselves. +/// +/// Should this be factored out into a separate crate? It seems independently +/// useful. There are other crates that already exist (e.g., `utf-8`) that have +/// overlapping use cases. Not sure what to do. +use std::char; + +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO: u8 = 0b1100_0000; +const TAG_THREE: u8 = 0b1110_0000; +const TAG_FOUR: u8 = 0b1111_0000; + +/// Returns the smallest possible index of the next valid UTF-8 sequence +/// starting after `i`. +pub fn next_utf8(text: &[u8], i: usize) -> usize { + let b = match text.get(i) { + None => return i + 1, + Some(&b) => b, + }; + let inc = if b <= 0x7F { + 1 + } else if b <= 0b110_11111 { + 2 + } else if b <= 0b1110_1111 { + 3 + } else { + 4 + }; + i + inc +} + +/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. +/// +/// If no valid UTF-8 sequence could be found, then `None` is returned. +/// Otherwise, the decoded codepoint and the number of bytes read is returned. +/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be +/// 1, 2, 3 or 4. +/// +/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a +/// codepoint that is out of range (surrogate codepoints are out of range) or +/// is not the shortest possible UTF-8 sequence for that codepoint. +#[inline] +pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { + let b0 = match src.get(0) { + None => return None, + Some(&b) if b <= 0x7F => return Some((b as char, 1)), + Some(&b) => b, + }; + match b0 { + 0b110_00000..=0b110_11111 => { + if src.len() < 2 { + return None; + } + let b1 = src[1]; + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); + match cp { + 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), + _ => None, + } + } + 0b1110_0000..=0b1110_1111 => { + if src.len() < 3 { + return None; + } + let (b1, b2) = (src[1], src[2]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_THREE) as u32) << 12 + | ((b1 & !TAG_CONT) as u32) << 6 + | ((b2 & !TAG_CONT) as u32); + match cp { + // char::from_u32 will disallow surrogate codepoints. + 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), + _ => None, + } + } + 0b11110_000..=0b11110_111 => { + if src.len() < 4 { + return None; + } + let (b1, b2, b3) = (src[1], src[2], src[3]); + if 0b11_000000 & b1 != TAG_CONT { + return None; + } + if 0b11_000000 & b2 != TAG_CONT { + return None; + } + if 0b11_000000 & b3 != TAG_CONT { + return None; + } + let cp = ((b0 & !TAG_FOUR) as u32) << 18 + | ((b1 & !TAG_CONT) as u32) << 12 + | ((b2 & !TAG_CONT) as u32) << 6 + | ((b3 & !TAG_CONT) as u32); + match cp { + 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), + _ => None, + } + } + _ => None, + } +} + +/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead +/// of the first. +pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { + if src.is_empty() { + return None; + } + let mut start = src.len() - 1; + if src[start] <= 0x7F { + return Some((src[start] as char, 1)); + } + while start > src.len().saturating_sub(4) { + start -= 1; + if is_start_byte(src[start]) { + break; + } + } + match decode_utf8(&src[start..]) { + None => None, + Some((_, n)) if n < src.len() - start => None, + Some((cp, n)) => Some((cp, n)), + } +} + +fn is_start_byte(b: u8) -> bool { + b & 0b11_000000 != 0b1_0000000 +} + +#[cfg(test)] +mod tests { + use std::str; + + use quickcheck::quickcheck; + + use super::{ + decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, + }; + + #[test] + fn prop_roundtrip() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_roundtrip_last() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let encoded_len = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, got_len) = + decode_last_utf8(&tmp[..encoded_len]).unwrap(); + encoded_len == got_len && given_cp == got_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_encode_matches_std() { + fn p(cp: char) -> bool { + let mut got = [0; 4]; + let n = cp.encode_utf8(&mut got).len(); + let expected = cp.to_string(); + &got[..n] == expected.as_bytes() + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); + let expected_cp = + str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn prop_decode_last_matches_std() { + fn p(given_cp: char) -> bool { + let mut tmp = [0; 4]; + let n = given_cp.encode_utf8(&mut tmp).len(); + let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); + let expected_cp = str::from_utf8(&tmp[..n]) + .unwrap() + .chars() + .rev() + .next() + .unwrap(); + got_cp == expected_cp + } + quickcheck(p as fn(char) -> bool) + } + + #[test] + fn reject_invalid() { + // Invalid start byte + assert_eq!(decode_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); + // Invalid continuation byte. + assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); + // Bad lengths + assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); + assert_eq!( + decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]), + None + ); + } + + #[test] + fn reject_invalid_last() { + // Invalid start byte + assert_eq!(decode_last_utf8(&[0xFF]), None); + // Surrogate pair + assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); + // Bad lengths + assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes + assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes + assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes + // Not a minimal UTF-8 sequence + assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); + assert_eq!( + decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]), + None + ); + assert_eq!( + decode_last_utf8( + &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',] + ), + None + ); + } +} diff --git a/vendor/regex/test b/vendor/regex/test new file mode 100755 index 0000000000..b10564f128 --- /dev/null +++ b/vendor/regex/test @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +# This is a convenience script for running a broad swath of tests across +# features. We don't test the complete space, since the complete space is quite +# large. Hopefully once we migrate the test suite to better infrastructure +# (like regex-automata), we'll be able to test more of the space. +echo "===== DEFAULT FEATURES ===" +cargo test + +echo "===== DOC TESTS ===" +cargo test --doc + +features=( + "std" + "std unicode" + "std unicode-perl" + "std perf" + "std perf-cache" + "std perf-dfa" + "std perf-inline" + "std perf-literal" +) +for f in "${features[@]}"; do + echo "===== FEATURE: $f (default) ===" + cargo test --test default --no-default-features --features "$f" + echo "===== FEATURE: $f (default-bytes) ===" + cargo test --test default-bytes --no-default-features --features "$f" +done diff --git a/vendor/regex/tests/api.rs b/vendor/regex/tests/api.rs new file mode 100644 index 0000000000..c7250a8a3a --- /dev/null +++ b/vendor/regex/tests/api.rs @@ -0,0 +1,234 @@ +#[test] +fn empty_regex_empty_match() { + let re = regex!(""); + assert_eq!(vec![(0, 0)], findall!(re, "")); +} + +#[test] +fn empty_regex_nonempty_match() { + let re = regex!(""); + assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc")); +} + +#[test] +fn one_zero_length_match() { + let re = regex!(r"[0-9]*"); + assert_eq!(vec![(0, 0), (1, 2), (3, 4)], findall!(re, "a1b2")); +} + +#[test] +fn many_zero_length_match() { + let re = regex!(r"[0-9]*"); + assert_eq!( + vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)], + findall!(re, "a1bbb2") + ); +} + +#[test] +fn many_sequential_zero_length_match() { + let re = regex!(r"[0-9]?"); + assert_eq!( + vec![(0, 0), (1, 2), (2, 3), (4, 5), (6, 6)], + findall!(re, "a12b3c") + ); +} + +#[test] +fn quoted_bracket_set() { + let re = regex!(r"([\x{5b}\x{5d}])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); + let re = regex!(r"([\[\]])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn first_range_starts_with_left_bracket() { + let re = regex!(r"([\[-z])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn range_ends_with_escape() { + let re = regex!(r"([\[-\x{5d}])"); + assert_eq!(vec![(0, 1), (1, 2)], findall!(re, "[]")); +} + +#[test] +fn empty_match_find_iter() { + let re = regex!(r".*?"); + assert_eq!(vec![(0, 0), (1, 1), (2, 2), (3, 3)], findall!(re, "abc")); +} + +#[test] +fn empty_match_captures_iter() { + let re = regex!(r".*?"); + let ms: Vec<_> = re + .captures_iter(text!("abc")) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!(ms, vec![(0, 0), (1, 1), (2, 2), (3, 3)]); +} + +#[test] +fn capture_names() { + let re = regex!(r"(.)(?P<a>.)"); + assert_eq!(3, re.captures_len()); + assert_eq!((3, Some(3)), re.capture_names().size_hint()); + assert_eq!( + vec![None, None, Some("a")], + re.capture_names().collect::<Vec<_>>() + ); +} + +#[test] +fn regex_string() { + assert_eq!(r"[a-zA-Z0-9]+", regex!(r"[a-zA-Z0-9]+").as_str()); + assert_eq!(r"[a-zA-Z0-9]+", &format!("{}", regex!(r"[a-zA-Z0-9]+"))); + assert_eq!(r"[a-zA-Z0-9]+", &format!("{:?}", regex!(r"[a-zA-Z0-9]+"))); +} + +#[test] +fn capture_index() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + assert_eq!(&cap[0], t!("abc")); + assert_eq!(&cap[1], t!("abc")); + assert_eq!(&cap["name"], t!("abc")); +} + +#[test] +#[should_panic] +#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)] +fn capture_index_panic_usize() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + let _ = cap[2]; +} + +#[test] +#[should_panic] +#[cfg_attr(all(target_env = "msvc", target_pointer_width = "32"), ignore)] +fn capture_index_panic_name() { + let re = regex!(r"^(?P<name>.+)$"); + let cap = re.captures(t!("abc")).unwrap(); + let _ = cap["bad name"]; +} + +#[test] +fn capture_index_lifetime() { + // This is a test of whether the types on `caps["..."]` are general + // enough. If not, this will fail to typecheck. + fn inner(s: &str) -> usize { + let re = regex!(r"(?P<number>[0-9]+)"); + let caps = re.captures(t!(s)).unwrap(); + caps["number"].len() + } + assert_eq!(3, inner("123")); +} + +#[test] +fn capture_misc() { + let re = regex!(r"(.)(?P<a>a)?(.)(?P<b>.)"); + let cap = re.captures(t!("abc")).unwrap(); + + assert_eq!(5, cap.len()); + + assert_eq!((0, 3), { + let m = cap.get(0).unwrap(); + (m.start(), m.end()) + }); + assert_eq!(None, cap.get(2)); + assert_eq!((2, 3), { + let m = cap.get(4).unwrap(); + (m.start(), m.end()) + }); + + assert_eq!(t!("abc"), match_text!(cap.get(0).unwrap())); + assert_eq!(None, cap.get(2)); + assert_eq!(t!("c"), match_text!(cap.get(4).unwrap())); + + assert_eq!(None, cap.name("a")); + assert_eq!(t!("c"), match_text!(cap.name("b").unwrap())); +} + +#[test] +fn sub_capture_matches() { + let re = regex!(r"([a-z])(([a-z])|([0-9]))"); + let cap = re.captures(t!("a5")).unwrap(); + let subs: Vec<_> = cap.iter().collect(); + + assert_eq!(5, subs.len()); + assert!(subs[0].is_some()); + assert!(subs[1].is_some()); + assert!(subs[2].is_some()); + assert!(subs[3].is_none()); + assert!(subs[4].is_some()); + + assert_eq!(t!("a5"), match_text!(subs[0].unwrap())); + assert_eq!(t!("a"), match_text!(subs[1].unwrap())); + assert_eq!(t!("5"), match_text!(subs[2].unwrap())); + assert_eq!(t!("5"), match_text!(subs[4].unwrap())); +} + +expand!(expand1, r"(?-u)(?P<foo>\w+)", "abc", "$foo", "abc"); +expand!(expand2, r"(?-u)(?P<foo>\w+)", "abc", "$0", "abc"); +expand!(expand3, r"(?-u)(?P<foo>\w+)", "abc", "$1", "abc"); +expand!(expand4, r"(?-u)(?P<foo>\w+)", "abc", "$$1", "$1"); +expand!(expand5, r"(?-u)(?P<foo>\w+)", "abc", "$$foo", "$foo"); +expand!(expand6, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$b$a", "123abc"); +expand!(expand7, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "z$bz$az", "z"); +expand!( + expand8, + r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", + ".$b.$a.", + ".123.abc." +); +expand!( + expand9, + r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", + "abc 123", + " $b $a ", + " 123 abc " +); +expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", ""); + +expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%"); +expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc["); +expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{"); +expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}"); +expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%"); +expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%"); +expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc["); +expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "["); +expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "["); +expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "["); +expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "["); + +split!( + split1, + r"(?-u)\s+", + "a b\nc\td\n\t e", + &[t!("a"), t!("b"), t!("c"), t!("d"), t!("e")] +); +split!( + split2, + r"(?-u)\b", + "a b c", + &[t!(""), t!("a"), t!(" "), t!("b"), t!(" "), t!("c"), t!("")] +); +split!(split3, r"a$", "a", &[t!(""), t!("")]); +split!(split_none, r"-", r"a", &[t!("a")]); +split!(split_trailing_blank, r"-", r"a-", &[t!("a"), t!("")]); +split!(split_trailing_blanks, r"-", r"a--", &[t!("a"), t!(""), t!("")]); +split!(split_empty, r"-", r"", &[t!("")]); + +splitn!(splitn_below_limit, r"-", r"a", 2, &[t!("a")]); +splitn!(splitn_at_limit, r"-", r"a-b", 2, &[t!("a"), t!("b")]); +splitn!(splitn_above_limit, r"-", r"a-b-c", 2, &[t!("a"), t!("b-c")]); +splitn!(splitn_zero_limit, r"-", r"a-b", 0, empty_vec!()); +splitn!(splitn_trailing_blank, r"-", r"a-", 2, &[t!("a"), t!("")]); +splitn!(splitn_trailing_separator, r"-", r"a--", 2, &[t!("a"), t!("-")]); +splitn!(splitn_empty, r"-", r"", 1, &[t!("")]); diff --git a/vendor/regex/tests/api_str.rs b/vendor/regex/tests/api_str.rs new file mode 100644 index 0000000000..480116da73 --- /dev/null +++ b/vendor/regex/tests/api_str.rs @@ -0,0 +1,34 @@ +// These tests don't really make sense with the bytes API, so we only test them +// on the Unicode API. + +#[test] +fn empty_match_unicode_find_iter() { + // Tests that we still yield byte ranges at valid UTF-8 sequence boundaries + // even when we're susceptible to empty width matches. + let re = regex!(r".*?"); + assert_eq!( + vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], + findall!(re, "Ⅰ1Ⅱ2") + ); +} + +#[test] +fn empty_match_unicode_captures_iter() { + // Same as empty_match_unicode_find_iter, but tests capture iteration. + let re = regex!(r".*?"); + let ms: Vec<_> = re + .captures_iter(text!("Ⅰ1Ⅱ2")) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + assert_eq!(vec![(0, 0), (3, 3), (4, 4), (7, 7), (8, 8)], ms); +} + +#[test] +fn match_as_str() { + let re = regex!(r"fo+"); + let caps = re.captures("barfoobar").unwrap(); + assert_eq!(caps.get(0).map(|m| m.as_str()), Some("foo")); + assert_eq!(caps.get(0).map(From::from), Some("foo")); + assert_eq!(caps.get(0).map(Into::into), Some("foo")); +} diff --git a/vendor/regex/tests/bytes.rs b/vendor/regex/tests/bytes.rs new file mode 100644 index 0000000000..d05f138edf --- /dev/null +++ b/vendor/regex/tests/bytes.rs @@ -0,0 +1,107 @@ +// These are tests specifically crafted for regexes that can match arbitrary +// bytes. + +// A silly wrapper to make it possible to write and match raw bytes. +struct R<'a>(&'a [u8]); +impl<'a> R<'a> { + fn as_bytes(&self) -> &'a [u8] { + self.0 + } +} + +mat!(word_boundary, r"(?-u) \b", " δ", None); +#[cfg(feature = "unicode-perl")] +mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); +mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(word_not_boundary_unicode, r" \B", " δ", None); + +mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); +mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); +mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); +#[cfg(feature = "unicode-perl")] +mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4))); + +// The first `(.+)` matches two Unicode codepoints, but can't match the 5th +// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and +// matches. +mat!( + mixed1, + r"(.+)(?-u)(.+)", + R(b"\xCE\x93\xCE\x94\xFF"), + Some((0, 5)), + Some((0, 4)), + Some((4, 5)) +); + +mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); +mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); +#[cfg(feature = "unicode-case")] +mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); +mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); + +mat!(negate_unicode, r"[^a]", "δ", Some((0, 2))); +mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1))); + +// This doesn't match in a normal Unicode regex because the implicit preceding +// `.*?` is Unicode aware. +mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2))); +mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2))); + +// Have fun with null bytes. +mat!( + null_bytes, + r"(?-u)(?P<cstr>[^\x00]+)\x00", + R(b"foo\x00"), + Some((0, 4)), + Some((0, 3)) +); + +// Test that lookahead operators work properly in the face of invalid UTF-8. +// See: https://github.com/rust-lang/regex/issues/277 +matiter!( + invalidutf8_anchor1, + r"(?-u)\xcc?^", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (0, 0) +); +matiter!( + invalidutf8_anchor2, + r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (22, 22) +); +matiter!( + invalidutf8_anchor3, + r"(?-u)^|ddp\xff\xffdddddlQd@\x80", + R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), + (0, 0) +); + +// See https://github.com/rust-lang/regex/issues/303 +#[test] +fn negated_full_byte_range() { + assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err()); +} + +matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!( + word_boundary_ascii2, + r"(?-u:\B)", + "0\u{7EF5E}", + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); + +// See: https://github.com/rust-lang/regex/issues/264 +mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); +mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); + +// See: https://github.com/rust-lang/regex/issues/271 +mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8))); diff --git a/vendor/regex/tests/consistent.rs b/vendor/regex/tests/consistent.rs new file mode 100644 index 0000000000..722f2a51a0 --- /dev/null +++ b/vendor/regex/tests/consistent.rs @@ -0,0 +1,238 @@ +use regex::internal::ExecBuilder; + +/// Given a regex, check if all of the backends produce the same +/// results on a number of different inputs. +/// +/// For now this just throws quickcheck at the problem, which +/// is not very good because it only really tests half of the +/// problem space. It is pretty unlikely that a random string +/// will match any given regex, so this will probably just +/// be checking that the different backends fail in the same +/// way. This is still worthwhile to test, but is definitely not +/// the whole story. +/// +/// TODO(ethan): In order to cover the other half of the problem +/// space, we should generate a random matching string by inspecting +/// the AST of the input regex. The right way to do this probably +/// involves adding a custom Arbitrary instance around a couple +/// of newtypes. That way we can respect the quickcheck size hinting +/// and shrinking and whatnot. +pub fn backends_are_consistent(re: &str) -> Result<u64, String> { + let standard_backends = vec![ + ( + "bounded_backtracking_re", + ExecBuilder::new(re) + .bounded_backtracking() + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "pikevm_re", + ExecBuilder::new(re) + .nfa() + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "default_re", + ExecBuilder::new(re) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ]; + + let utf8bytes_backends = vec![ + ( + "bounded_backtracking_utf8bytes_re", + ExecBuilder::new(re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "pikevm_utf8bytes_re", + ExecBuilder::new(re) + .nfa() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "default_utf8bytes_re", + ExecBuilder::new(re) + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err))?, + ), + ]; + + let bytes_backends = vec![ + ( + "bounded_backtracking_bytes_re", + ExecBuilder::new(re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "pikevm_bytes_re", + ExecBuilder::new(re) + .nfa() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err))?, + ), + ( + "default_bytes_re", + ExecBuilder::new(re) + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err))?, + ), + ]; + + Ok(string_checker::check_backends(&standard_backends)? + + string_checker::check_backends(&utf8bytes_backends)? + + bytes_checker::check_backends(&bytes_backends)?) +} + +// +// A consistency checker parameterized by the input type (&str or &[u8]). +// + +macro_rules! checker { + ($module_name:ident, $regex_type:path, $mk_input:expr) => { + mod $module_name { + use quickcheck; + use quickcheck::{Arbitrary, TestResult}; + + pub fn check_backends( + backends: &[(&str, $regex_type)], + ) -> Result<u64, String> { + let mut total_passed = 0; + for regex in backends[1..].iter() { + total_passed += quickcheck_regex_eq(&backends[0], regex)?; + } + + Ok(total_passed) + } + + fn quickcheck_regex_eq( + &(name1, ref re1): &(&str, $regex_type), + &(name2, ref re2): &(&str, $regex_type), + ) -> Result<u64, String> { + quickcheck::QuickCheck::new() + .quicktest(RegexEqualityTest::new( + re1.clone(), + re2.clone(), + )) + .map_err(|err| { + format!( + "{}(/{}/) and {}(/{}/) are inconsistent.\ + QuickCheck Err: {:?}", + name1, re1, name2, re2, err + ) + }) + } + + struct RegexEqualityTest { + re1: $regex_type, + re2: $regex_type, + } + impl RegexEqualityTest { + fn new(re1: $regex_type, re2: $regex_type) -> Self { + RegexEqualityTest { re1: re1, re2: re2 } + } + } + + impl quickcheck::Testable for RegexEqualityTest { + fn result(&self, gen: &mut quickcheck::Gen) -> TestResult { + let input = $mk_input(gen); + let input = &input; + + if self.re1.find(&input) != self.re2.find(input) { + return TestResult::error(format!( + "find mismatch input={:?}", + input + )); + } + + let cap1 = self.re1.captures(input); + let cap2 = self.re2.captures(input); + match (cap1, cap2) { + (None, None) => {} + (Some(cap1), Some(cap2)) => { + for (c1, c2) in cap1.iter().zip(cap2.iter()) { + if c1 != c2 { + return TestResult::error(format!( + "captures mismatch input={:?}", + input + )); + } + } + } + _ => { + return TestResult::error(format!( + "captures mismatch input={:?}", + input + )) + } + } + + let fi1 = self.re1.find_iter(input); + let fi2 = self.re2.find_iter(input); + for (m1, m2) in fi1.zip(fi2) { + if m1 != m2 { + return TestResult::error(format!( + "find_iter mismatch input={:?}", + input + )); + } + } + + let ci1 = self.re1.captures_iter(input); + let ci2 = self.re2.captures_iter(input); + for (cap1, cap2) in ci1.zip(ci2) { + for (c1, c2) in cap1.iter().zip(cap2.iter()) { + if c1 != c2 { + return TestResult::error(format!( + "captures_iter mismatch input={:?}", + input + )); + } + } + } + + let s1 = self.re1.split(input); + let s2 = self.re2.split(input); + for (chunk1, chunk2) in s1.zip(s2) { + if chunk1 != chunk2 { + return TestResult::error(format!( + "split mismatch input={:?}", + input + )); + } + } + + TestResult::from_bool(true) + } + } + } // mod + }; // rule case +} // macro_rules! + +checker!(string_checker, ::regex::Regex, |gen| String::arbitrary(gen)); +checker!(bytes_checker, ::regex::bytes::Regex, |gen| Vec::<u8>::arbitrary( + gen +)); diff --git a/vendor/regex/tests/crates_regex.rs b/vendor/regex/tests/crates_regex.rs new file mode 100644 index 0000000000..200ec27b2d --- /dev/null +++ b/vendor/regex/tests/crates_regex.rs @@ -0,0 +1,3287 @@ +// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py' +// on 2018-06-20 09:56:32.820354. + +// autoshutdown-0.1.0: r"\s*(\d+)(\w)\s*" +consistent!(autoshutdown_0, r"\s*(\d+)(\w)\s*"); + +// epub-1.1.1: r"/" +consistent!(epub_0, r"/"); + +// rpi-info-0.2.0: "^Revision\t+: ([0-9a-fA-F]+)" +consistent!(rpi_info_0, "^Revision\t+: ([0-9a-fA-F]+)"); + +// rpi-info-0.2.0: "Serial\t+: ([0-9a-fA-F]+)" +consistent!(rpi_info_1, "Serial\t+: ([0-9a-fA-F]+)"); + +// pnet_macros-0.21.0: r"^u([0-9]+)(be|le|he)?$" +consistent!(pnet_macros_0, r"^u([0-9]+)(be|le|he)?$"); + +// iban_validate-1.0.3: r"^[A-Z]{2}\d{2}[A-Z\d]{1,30}$" +consistent!(iban_validate_0, r"^[A-Z]{2}\d{2}[A-Z\d]{1,30}$"); + +// markifier-0.1.0: r".*\[(?P<percent>.+)%.*\].*" +consistent!(markifier_0, r".*\[(?P<percent>.+)%.*\].*"); + +// mallumo-0.3.0: r"(#include) (\S*)(.*)" +consistent!(mallumo_0, r"(#include) (\S*)(.*)"); + +// mallumo-0.3.0: r"(ERROR: \d+:)(\d+)(: )(.+)" +consistent!(mallumo_1, r"(ERROR: \d+:)(\d+)(: )(.+)"); + +// mallumo-0.3.0: r"(\d+\()(\d+)(?:\) : )(.+)" +consistent!(mallumo_2, r"(\d+\()(\d+)(?:\) : )(.+)"); + +// magnet_more-0.0.1: r"(.+?)(\[.*?\])?" +consistent!(magnet_more_0, r"(.+?)(\[.*?\])?"); + +// magnet_app-0.0.1: r":(?P<k>[a-zA-Z_]+)" +consistent!(magnet_app_0, r":(?P<k>[a-zA-Z_]+)"); + +// yubibomb-0.2.0: r"^\d{6}(?:\s*,\s*\d{6})*$" +consistent!(yubibomb_0, r"^\d{6}(?:\s*,\s*\d{6})*$"); + +// multirust-rs-0.0.4: r"[\\/]([^\\/?]+)(\?.*)?$" +consistent!(multirust_rs_0, r"[\\/]([^\\/?]+)(\?.*)?$"); + +// hueclient-0.3.2: "\"[a-z]*\":null" +consistent!(hueclient_0, "\"[a-z]*\":null"); + +// hueclient-0.3.2: ",+" +consistent!(hueclient_1, ",+"); + +// hueclient-0.3.2: ",\\}" +consistent!(hueclient_2, ",\\}"); + +// hueclient-0.3.2: "\\{," +consistent!(hueclient_3, "\\{,"); + +// aerial-0.1.0: r"[a-zA-Z_\$][a-zA-Z_0-9]*" +consistent!(aerial_0, r"[a-zA-Z_\$][a-zA-Z_0-9]*"); + +// aerial-0.1.0: r"thi[sng]+" +consistent!(aerial_1, r"thi[sng]+"); + +// rvue-0.1.0: r"(.+)\s+\((.+?)\)" +consistent!(rvue_0, r"(.+)\s+\((.+?)\)"); + +// rvue-0.1.0: r"([\d\.]+)\s*out\s*of\s*([\d\.]+)" +consistent!(rvue_1, r"([\d\.]+)\s*out\s*of\s*([\d\.]+)"); + +// rvue-0.1.0: r"^([\d\.]+)\s*(?:\(\))?$" +consistent!(rvue_2, r"^([\d\.]+)\s*(?:\(\))?$"); + +// rvue-0.1.0: r"([\d\.]+)\s*Points\s*Possible" +consistent!(rvue_3, r"([\d\.]+)\s*Points\s*Possible"); + +// rvue-0.1.0: r"([\d\.]+)\s*/\s*([\d\.]+)" +consistent!(rvue_4, r"([\d\.]+)\s*/\s*([\d\.]+)"); + +// rvsim-0.1.0: r"_?([_a-z0-9]+)\s*:\s*([_a-z0-9]+)\s*[,)]" +consistent!(rvsim_0, r"_?([_a-z0-9]+)\s*:\s*([_a-z0-9]+)\s*[,)]"); + +// nereon-0.1.4: "(.*[^\\\\])\\{\\}(.*)" +consistent!(nereon_0, "(.*[^\\\\])\\{\\}(.*)"); + +// next_episode-0.3.0: r"((?i)^(.+).s(\d+)e(\d+).*)$" +consistent!(next_episode_0, r"((?i)^(.+).s(\d+)e(\d+).*)$"); + +// migrant_lib-0.19.2: r"[^a-z0-9-]+" +consistent!(migrant_lib_0, r"[^a-z0-9-]+"); + +// migrant_lib-0.19.2: r"[0-9]{14}_[a-z0-9-]+" +consistent!(migrant_lib_1, r"[0-9]{14}_[a-z0-9-]+"); + +// migrant_lib-0.19.2: r"([0-9]{14}_)?[a-z0-9-]+" +consistent!(migrant_lib_2, r"([0-9]{14}_)?[a-z0-9-]+"); + +// minipre-0.2.0: "$_" +consistent!(minipre_0, "$_"); + +// minifier-0.0.13: r">\s+<" +consistent!(minifier_0, r">\s+<"); + +// minifier-0.0.13: r"\s{2,}|[\r\n]" +consistent!(minifier_1, r"\s{2,}|[\r\n]"); + +// minifier-0.0.13: r"<(style|script)[\w|\s].*?>" +consistent!(minifier_2, r"<(style|script)[\w|\s].*?>"); + +// minifier-0.0.13: "<!--(.|\n)*?-->" +consistent!(minifier_3, "<!--(.|\n)*?-->"); + +// minifier-0.0.13: r"<\w.*?>" +consistent!(minifier_4, r"<\w.*?>"); + +// minifier-0.0.13: r" \s+|\s +" +consistent!(minifier_5, r" \s+|\s +"); + +// minifier-0.0.13: r"\w\s+\w" +consistent!(minifier_6, r"\w\s+\w"); + +// minifier-0.0.13: r"'\s+>" +consistent!(minifier_7, r"'\s+>"); + +// minifier-0.0.13: r"\d\s+>" +consistent!(minifier_8, r"\d\s+>"); + +// ggp-rs-0.1.2: r"(?P<relation>\([^)]+\))|(?P<prop>[a-zA-Z0-9_]+)" +consistent!(ggp_rs_0, r"(?P<relation>\([^)]+\))|(?P<prop>[a-zA-Z0-9_]+)"); + +// ggp-rs-0.1.2: r"\((.*)\)." +consistent!(ggp_rs_1, r"\((.*)\)."); + +// poe-superfilter-0.2.0: "[A-Za-z0-9_]" +consistent!(poe_superfilter_0, "[A-Za-z0-9_]"); + +// poke-a-mango-0.5.0: r"(\d+)x(\d+)" +consistent!(poke_a_mango_0, r"(\d+)x(\d+)"); + +// pop3-rs-0.1.0: r"(?P<nmsg>\d+) (?P<size>\d+)" +consistent!(pop3_rs_0, r"(?P<nmsg>\d+) (?P<size>\d+)"); + +// pop3-rs-0.1.0: r"(?P<msgid>\d+) (?P<uidl>[\x21-\x7E]{1,70})" +consistent!(pop3_rs_1, r"(?P<msgid>\d+) (?P<uidl>[\x21-\x7E]{1,70})"); + +// pop3-rs-0.1.0: r"(<.*>)\r\n$" +consistent!(pop3_rs_2, r"(<.*>)\r\n$"); + +// pop3-rs-0.1.0: r"^(?P<status>\+OK|-ERR) (?P<statustext>.*)" +consistent!(pop3_rs_3, r"^(?P<status>\+OK|-ERR) (?P<statustext>.*)"); + +// pop3-1.0.6: r"^\.\r\n$" +consistent!(pop3_0, r"^\.\r\n$"); + +// pop3-1.0.6: r"\+OK(.*)" +consistent!(pop3_1, r"\+OK(.*)"); + +// pop3-1.0.6: r"-ERR(.*)" +consistent!(pop3_2, r"-ERR(.*)"); + +// pop3-1.0.6: r"\+OK (\d+) (\d+)\r\n" +consistent!(pop3_3, r"\+OK (\d+) (\d+)\r\n"); + +// pop3-1.0.6: r"(\d+) ([\x21-\x7e]+)\r\n" +consistent!(pop3_4, r"(\d+) ([\x21-\x7e]+)\r\n"); + +// pop3-1.0.6: r"\+OK (\d+) ([\x21-\x7e]+)\r\n" +consistent!(pop3_5, r"\+OK (\d+) ([\x21-\x7e]+)\r\n"); + +// pop3-1.0.6: r"(\d+) (\d+)\r\n" +consistent!(pop3_6, r"(\d+) (\d+)\r\n"); + +// pop3-1.0.6: r"\+OK (\d+) (\d+)\r\n" +consistent!(pop3_7, r"\+OK (\d+) (\d+)\r\n"); + +// polk-1.1.3: "github:(\\w+)/?(\\w+)?" +consistent!(polk_0, "github:(\\w+)/?(\\w+)?"); + +// geochunk-0.1.5: "^[0-9]{5}" +consistent!(geochunk_0, "^[0-9]{5}"); + +// generic-dns-update-1.1.4: r"((?:(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?)\.){3}(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?))" +consistent!(generic_dns_update_0, r"((?:(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?)\.){3}(?:0|1[\d]{0,2}|2(?:[0-4]\d?|5[0-5]?|[6-9])?|[3-9]\d?))"); + +// generic-dns-update-1.1.4: r"((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(([0-9A-Fa-f]{1,4}:){0,5}:((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(::([0-9A-Fa-f]{1,4}:){0,5}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))" +consistent!(generic_dns_update_1, r"((([0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}:[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){5}:([0-9A-Fa-f]{1,4}:)?[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){4}:([0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){3}:([0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){2}:([0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){6}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(([0-9A-Fa-f]{1,4}:){0,5}:((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|(::([0-9A-Fa-f]{1,4}:){0,5}((\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d)\.){3}(\d((25[0-5])|(1\d{2})|(2[0-4]\d)|(\d{1,2}))\d))|([0-9A-Fa-f]{1,4}::([0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})|(::([0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})|(([0-9A-Fa-f]{1,4}:){1,7}:))"); + +// generic-dns-update-1.1.4: r"<value><string>([0-9.]*)</string></value>" +consistent!( + generic_dns_update_2, + r"<value><string>([0-9.]*)</string></value>" +); + +// generic-dns-update-1.1.4: r"<int>([0-9]+)</int>" +consistent!(generic_dns_update_3, r"<int>([0-9]+)</int>"); + +// generic-dns-update-1.1.4: r"<int>([0-9]+)</int>" +consistent!(generic_dns_update_4, r"<int>([0-9]+)</int>"); + +// generic-dns-update-1.1.4: r"<boolean>([0-1]*)</boolean>" +consistent!(generic_dns_update_5, r"<boolean>([0-1]*)</boolean>"); + +// generate-nix-pkg-0.3.0: r"(\d*)\.(\d*)\.(\d*)(-(\S*))?" +consistent!(generate_nix_pkg_0, r"(\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// generate-nix-pkg-0.3.0: r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?" +consistent!(generate_nix_pkg_1, r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// genact-0.6.0: r"arch/([a-z0-9_])+/" +consistent!(genact_0, r"arch/([a-z0-9_])+/"); + +// genact-0.6.0: r"arch/([a-z0-9_])+/" +consistent!(genact_1, r"arch/([a-z0-9_])+/"); + +// cron_rs-0.1.6: r"^\s*((\*(/\d+)?)|[0-9-,/]+)(\s+((\*(/\d+)?)|[0-9-,/]+)){4,5}\s*$" +consistent!( + cron_rs_0, + r"^\s*((\*(/\d+)?)|[0-9-,/]+)(\s+((\*(/\d+)?)|[0-9-,/]+)){4,5}\s*$" +); + +// systemfd-0.3.0: r"^([a-zA-Z]+)::(.+)$" +consistent!(systemfd_0, r"^([a-zA-Z]+)::(.+)$"); + +// symbolic-debuginfo-5.0.2: "__?hidden#\\d+_" +consistent!(symbolic_debuginfo_0, "__?hidden#\\d+_"); + +// symbolic-minidump-5.0.2: r"^Linux ([^ ]+) (.*) \w+(?: GNU/Linux)?$" +consistent!(symbolic_minidump_0, r"^Linux ([^ ]+) (.*) \w+(?: GNU/Linux)?$"); + +// graphql-idl-parser-0.1.1: "^(?u:\\#)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+" +consistent!(graphql_idl_parser_0, "^(?u:\\#)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+"); + +// graphql-idl-parser-0.1.1: "^(?u:=)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+" +consistent!(graphql_idl_parser_1, "^(?u:=)(?u:[\t-\r - \u{85}-\u{85}\u{a0}-\u{a0}\u{1680}-\u{1680}\u{2000}-\u{200a}\u{2028}-\u{2029}\u{202f}-\u{202f}\u{205f}-\u{205f}\u{3000}-\u{3000}])*(?u:.)+"); + +// graphql-idl-parser-0.1.1: "^(?u:[A-Z_-_a-z])(?u:[0-9A-Z_-_a-z])*" +consistent!(graphql_idl_parser_2, "^(?u:[A-Z_-_a-z])(?u:[0-9A-Z_-_a-z])*"); + +// graphql-idl-parser-0.1.1: "^(?u:!)" +consistent!(graphql_idl_parser_3, "^(?u:!)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\()" +consistent!(graphql_idl_parser_4, "^(?u:\\()"); + +// graphql-idl-parser-0.1.1: "^(?u:\\))" +consistent!(graphql_idl_parser_5, "^(?u:\\))"); + +// graphql-idl-parser-0.1.1: "^(?u:,)" +consistent!(graphql_idl_parser_6, "^(?u:,)"); + +// graphql-idl-parser-0.1.1: "^(?u::)" +consistent!(graphql_idl_parser_7, "^(?u::)"); + +// graphql-idl-parser-0.1.1: "^(?u:@)" +consistent!(graphql_idl_parser_8, "^(?u:@)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\[)" +consistent!(graphql_idl_parser_9, "^(?u:\\[)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\])" +consistent!(graphql_idl_parser_10, "^(?u:\\])"); + +// graphql-idl-parser-0.1.1: "^(?u:enum)" +consistent!(graphql_idl_parser_11, "^(?u:enum)"); + +// graphql-idl-parser-0.1.1: "^(?u:implements)" +consistent!(graphql_idl_parser_12, "^(?u:implements)"); + +// graphql-idl-parser-0.1.1: "^(?u:input)" +consistent!(graphql_idl_parser_13, "^(?u:input)"); + +// graphql-idl-parser-0.1.1: "^(?u:interface)" +consistent!(graphql_idl_parser_14, "^(?u:interface)"); + +// graphql-idl-parser-0.1.1: "^(?u:scalar)" +consistent!(graphql_idl_parser_15, "^(?u:scalar)"); + +// graphql-idl-parser-0.1.1: "^(?u:type)" +consistent!(graphql_idl_parser_16, "^(?u:type)"); + +// graphql-idl-parser-0.1.1: "^(?u:union)" +consistent!(graphql_idl_parser_17, "^(?u:union)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\{)" +consistent!(graphql_idl_parser_18, "^(?u:\\{)"); + +// graphql-idl-parser-0.1.1: "^(?u:\\})" +consistent!(graphql_idl_parser_19, "^(?u:\\})"); + +// grimoire-0.1.0: r"(?s)/\*(?P<config>.*?)\*/" +consistent!(grimoire_0, r"(?s)/\*(?P<config>.*?)\*/"); + +// phonenumber-0.2.0+8.9.0: r"[\d]+(?:[~\x{2053}\x{223C}\x{FF5E}][\d]+)?" +consistent!(phonenumber_0, r"[\d]+(?:[~\x{2053}\x{223C}\x{FF5E}][\d]+)?"); + +// phonenumber-0.2.0+8.9.0: r"[, \[\]]" +consistent!(phonenumber_1, r"[, \[\]]"); + +// phonenumber-0.2.0+8.9.0: r"[\\/] *x" +consistent!(phonenumber_2, r"[\\/] *x"); + +// phonenumber-0.2.0+8.9.0: r"[[\P{N}&&\P{L}]&&[^#]]+$" +consistent!(phonenumber_3, r"[[\P{N}&&\P{L}]&&[^#]]+$"); + +// phonenumber-0.2.0+8.9.0: r"(?:.*?[A-Za-z]){3}.*" +consistent!(phonenumber_4, r"(?:.*?[A-Za-z]){3}.*"); + +// phonenumber-0.2.0+8.9.0: r"(\D+)" +consistent!(phonenumber_5, r"(\D+)"); + +// phonenumber-0.2.0+8.9.0: r"(\$\d)" +consistent!(phonenumber_6, r"(\$\d)"); + +// phonenumber-0.2.0+8.9.0: r"\(?\$1\)?" +consistent!(phonenumber_7, r"\(?\$1\)?"); + +// phone_number-0.1.0: r"\D" +consistent!(phone_number_0, r"\D"); + +// phone_number-0.1.0: r"^0+" +consistent!(phone_number_1, r"^0+"); + +// phone_number-0.1.0: r"^89" +consistent!(phone_number_2, r"^89"); + +// phone_number-0.1.0: r"^8+" +consistent!(phone_number_3, r"^8+"); + +// phile-0.1.4: r"^ *(\^_*\^) *$" +consistent!(phile_0, r"^ *(\^_*\^) *$"); + +// phile-0.1.4: r"^[_\p{XID_Start}]$" +consistent!(phile_1, r"^[_\p{XID_Start}]$"); + +// phile-0.1.4: r"^\p{XID_Continue}$" +consistent!(phile_2, r"^\p{XID_Continue}$"); + +// uritemplate-0.1.2: "%25(?P<hex>[0-9a-fA-F][0-9a-fA-F])" +consistent!(uritemplate_0, "%25(?P<hex>[0-9a-fA-F][0-9a-fA-F])"); + +// urdf-rs-0.4.2: "^package://(\\w+)/" +consistent!(urdf_rs_0, "^package://(\\w+)/"); + +// url-match-0.1.7: r"(?P<key>[?&.])" +consistent!(url_match_0, r"(?P<key>[?&.])"); + +// url-match-0.1.7: r":(?P<key>[a-zA-Z0-9_-]+)" +consistent!(url_match_1, r":(?P<key>[a-zA-Z0-9_-]+)"); + +// tsm-sys-0.1.0: r"hello world" +consistent!(tsm_sys_0, r"hello world"); + +// deb-version-0.1.0: "^(?:(?:(?:\\d+:).+)|(?:[^:]+))$" +consistent!(deb_version_0, "^(?:(?:(?:\\d+:).+)|(?:[^:]+))$"); + +// debcargo-2.1.0: r"^(?i)(a|an|the)\s+" +consistent!(debcargo_0, r"^(?i)(a|an|the)\s+"); + +// debcargo-2.1.0: r"^(?i)(rust\s+)?(implementation|library|tool|crate)\s+(of|to|for)\s+" +consistent!( + debcargo_1, + r"^(?i)(rust\s+)?(implementation|library|tool|crate)\s+(of|to|for)\s+" +); + +// feaders-0.2.0: r"^.*\.h$" +consistent!(feaders_0, r"^.*\.h$"); + +// feaders-0.2.0: r"^.*\.c$" +consistent!(feaders_1, r"^.*\.c$"); + +// feaders-0.2.0: r"^.*\.hpp$" +consistent!(feaders_2, r"^.*\.hpp$"); + +// feaders-0.2.0: r"^.*\.cc$" +consistent!(feaders_3, r"^.*\.cc$"); + +// feaders-0.2.0: r"^.*\.cpp$" +consistent!(feaders_4, r"^.*\.cpp$"); + +// hyperscan-0.1.6: r"CPtr\(\w+\)" +consistent!(hyperscan_0, r"CPtr\(\w+\)"); + +// hyperscan-0.1.6: r"^Version:\s(\d\.\d\.\d)\sFeatures:\s+(\w+)?\sMode:\s(\w+)$" +consistent!( + hyperscan_1, + r"^Version:\s(\d\.\d\.\d)\sFeatures:\s+(\w+)?\sMode:\s(\w+)$" +); + +// hyperscan-0.1.6: r"RawDatabase<Block>\{db: \w+\}" +consistent!(hyperscan_2, r"RawDatabase<Block>\{db: \w+\}"); + +// hyperscan-0.1.6: r"RawSerializedDatabase\{p: \w+, len: \d+\}" +consistent!(hyperscan_3, r"RawSerializedDatabase\{p: \w+, len: \d+\}"); + +// ucd-parse-0.1.1: r"[0-9A-F]+" +consistent!(ucd_parse_0, r"[0-9A-F]+"); + +// afsort-0.2.0: r".*" +consistent!(afsort_0, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_1, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_2, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_3, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_4, r".*"); + +// afsort-0.2.0: r".*" +consistent!(afsort_5, r".*"); + +// afsort-0.2.0: r"^[a-z]+$" +consistent!(afsort_6, r"^[a-z]+$"); + +// afsort-0.2.0: r"^[a-z]+$" +consistent!(afsort_7, r"^[a-z]+$"); + +// tin-summer-1.21.4: r"(\.git|\.pijul|_darcs|\.hg)$" +consistent!(tin_summer_0, r"(\.git|\.pijul|_darcs|\.hg)$"); + +// tin-drummer-1.0.1: r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(tin_drummer_0, r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// tin-drummer-1.0.1: r".*?\.(stats|conf|h|out|cache.*|dat|pc|info|\.js)$" +consistent!( + tin_drummer_1, + r".*?\.(stats|conf|h|out|cache.*|dat|pc|info|\.js)$" +); + +// tin-drummer-1.0.1: r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(tin_drummer_2, r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// tin-drummer-1.0.1: r".*?\.(stats|conf|h|out|cache.*|\.js)$" +consistent!(tin_drummer_3, r".*?\.(stats|conf|h|out|cache.*|\.js)$"); + +// tin-drummer-1.0.1: r"(\.git|\.pijul|_darcs|\.hg)$" +consistent!(tin_drummer_4, r"(\.git|\.pijul|_darcs|\.hg)$"); + +// tin-drummer-1.0.1: r".*?\.(dyn_o|out|d|hi|dyn_hi|dump-.*|p_hi|p_o|prof|tix)$" +consistent!( + tin_drummer_5, + r".*?\.(dyn_o|out|d|hi|dyn_hi|dump-.*|p_hi|p_o|prof|tix)$" +); + +// tin-drummer-1.0.1: r".*?\.(ibc)$" +consistent!(tin_drummer_6, r".*?\.(ibc)$"); + +// tin-drummer-1.0.1: r"\.stack-work|dist-newstyle" +consistent!(tin_drummer_7, r"\.stack-work|dist-newstyle"); + +// timmy-0.3.0: r"_NET_WM_PID\(CARDINAL\) = (\d+)" +consistent!(timmy_0, r"_NET_WM_PID\(CARDINAL\) = (\d+)"); + +// timmy-0.3.0: r"today|yesterday|now" +consistent!(timmy_1, r"today|yesterday|now"); + +// timmy-0.3.0: r"(?P<day>\d{1,2})/(?P<month>\d{1,2})(/(?P<year>\d{4}|\d{2}))?" +consistent!( + timmy_2, + r"(?P<day>\d{1,2})/(?P<month>\d{1,2})(/(?P<year>\d{4}|\d{2}))?" +); + +// timmy-0.3.0: r"(?P<n>\d+) (days?|ds?)(?P<ago>( ago)?)" +consistent!(timmy_3, r"(?P<n>\d+) (days?|ds?)(?P<ago>( ago)?)"); + +// timmy-0.3.0: r"(?P<hr>\d{2}):(?P<mins>\d{2})" +consistent!(timmy_4, r"(?P<hr>\d{2}):(?P<mins>\d{2})"); + +// tinfo-0.5.0: r"^(\d+): \d+ windows \(.*\) \[\d+x\d+\]( \(attached\))?" +consistent!( + tinfo_0, + r"^(\d+): \d+ windows \(.*\) \[\d+x\d+\]( \(attached\))?" +); + +// tinfo-0.5.0: r"^(\d+):(\d+): (.*) \((\d+) panes\) \[(\d+)x(\d+)\]" +consistent!(tinfo_1, r"^(\d+):(\d+): (.*) \((\d+) panes\) \[(\d+)x(\d+)\]"); + +// timespan-0.0.4: r"(?:\\\{start\\\}|\\\{end\\\})" +consistent!(timespan_0, r"(?:\\\{start\\\}|\\\{end\\\})"); + +// timespan-0.0.4: r"(.*)\s+-\s+(.*)" +consistent!(timespan_1, r"(.*)\s+-\s+(.*)"); + +// timespan-0.0.4: r"(.*)\s+(\w+)$" +consistent!(timespan_2, r"(.*)\s+(\w+)$"); + +// timespan-0.0.4: r"(.*)\s+(\w+)$" +consistent!(timespan_3, r"(.*)\s+(\w+)$"); + +// timespan-0.0.4: r"(.*)\s+-\s+(.*)" +consistent!(timespan_4, r"(.*)\s+-\s+(.*)"); + +// titlecase-0.10.0: r"[[:lower:]]" +consistent!(titlecase_0, r"[[:lower:]]"); + +// tight-0.1.3: r"^\d+ (day|week|month|year)s?$" +consistent!(tight_0, r"^\d+ (day|week|month|year)s?$"); + +// tight-0.1.3: r"^\d+ (day|week|month|year)s?$" +consistent!(tight_1, r"^\d+ (day|week|month|year)s?$"); + +// yaml-0.2.1: r"^[-+]?(0|[1-9][0-9_]*)$" +consistent!(yaml_0, r"^[-+]?(0|[1-9][0-9_]*)$"); + +// yaml-0.2.1: r"^([-+]?)0o?([0-7_]+)$" +consistent!(yaml_1, r"^([-+]?)0o?([0-7_]+)$"); + +// yaml-0.2.1: r"^([-+]?)0x([0-9a-fA-F_]+)$" +consistent!(yaml_2, r"^([-+]?)0x([0-9a-fA-F_]+)$"); + +// yaml-0.2.1: r"^([-+]?)0b([0-1_]+)$" +consistent!(yaml_3, r"^([-+]?)0b([0-1_]+)$"); + +// yaml-0.2.1: r"^([-+]?)(\.[0-9]+|[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?)$" +consistent!( + yaml_4, + r"^([-+]?)(\.[0-9]+|[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+)?)$" +); + +// yaml-0.2.1: r"^[+]?(\.inf|\.Inf|\.INF)$" +consistent!(yaml_5, r"^[+]?(\.inf|\.Inf|\.INF)$"); + +// yaml-0.2.1: r"^-(\.inf|\.Inf|\.INF)$" +consistent!(yaml_6, r"^-(\.inf|\.Inf|\.INF)$"); + +// yaml-0.2.1: r"^(\.nan|\.NaN|\.NAN)$" +consistent!(yaml_7, r"^(\.nan|\.NaN|\.NAN)$"); + +// yaml-0.2.1: r"^(null|Null|NULL|~)$" +consistent!(yaml_8, r"^(null|Null|NULL|~)$"); + +// yaml-0.2.1: r"^(true|True|TRUE|yes|Yes|YES)$" +consistent!(yaml_9, r"^(true|True|TRUE|yes|Yes|YES)$"); + +// yaml-0.2.1: r"^(false|False|FALSE|no|No|NO)$" +consistent!(yaml_10, r"^(false|False|FALSE|no|No|NO)$"); + +// kefia-0.1.0: r"(?m)^(\S+)/(\S+) (\S+)(?: \((.*)\))?$" +consistent!(kefia_0, r"(?m)^(\S+)/(\S+) (\S+)(?: \((.*)\))?$"); + +// risp-0.7.0: "^(\\s+|;.*?(\n|$))+" +consistent!(risp_0, "^(\\s+|;.*?(\n|$))+"); + +// risp-0.7.0: "^\".*?\"" +consistent!(risp_1, "^\".*?\""); + +// risp-0.7.0: r"^[^\s\{\}()\[\]]+" +consistent!(risp_2, r"^[^\s\{\}()\[\]]+"); + +// risp-0.7.0: r"^-?\d+" +consistent!(risp_3, r"^-?\d+"); + +// ripgrep-0.8.1: "^([0-9]+)([KMG])?$" +consistent!(ripgrep_0, "^([0-9]+)([KMG])?$"); + +// riquid-0.0.1: r"^\w+" +consistent!(riquid_0, r"^\w+"); + +// riquid-0.0.1: r"^\d+" +consistent!(riquid_1, r"^\d+"); + +// recursive_disassembler-2.1.2: r"\A(0x)?([a-fA-F0-9]+)\z" +consistent!(recursive_disassembler_0, r"\A(0x)?([a-fA-F0-9]+)\z"); + +// remake-0.1.0: r"^[a-zA-Z_][a-zA-Z0-9_]*" +consistent!(remake_0, r"^[a-zA-Z_][a-zA-Z0-9_]*"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_0, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_1, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_2, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_3, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_4, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_5, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{2})\)" +consistent!(regex_decode_6, r"'(?P<title>[^']+)'\s+\((?P<year>\d{2})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_7, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)" +consistent!(regex_decode_8, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_9, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_10, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_11, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_12, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-decode-0.1.0: r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)" +consistent!(regex_decode_13, r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})?\)"); + +// regex-cache-0.2.0: "[0-9]{3}-[0-9]{3}-[0-9]{4}" +consistent!(regex_cache_0, "[0-9]{3}-[0-9]{3}-[0-9]{4}"); + +// regex-cache-0.2.0: r"^\d+$" +consistent!(regex_cache_1, r"^\d+$"); + +// regex-cache-0.2.0: r"^[a-z]+$" +consistent!(regex_cache_2, r"^[a-z]+$"); + +// regex-cache-0.2.0: r"^\d+$" +consistent!(regex_cache_3, r"^\d+$"); + +// regex-cache-0.2.0: r"^\d+$" +consistent!(regex_cache_4, r"^\d+$"); + +// regex_dfa-0.5.0: r"\d{4}-\d{2}-\d{2}" +consistent!(regex_dfa_0, r"\d{4}-\d{2}-\d{2}"); + +// reaper-2.0.0: r"^[0-9\p{L} _\\.]{3,16}$" +consistent!(reaper_0, r"^[0-9\p{L} _\\.]{3,16}$"); + +// retdec-0.1.0: r"^attachment; filename=(.+)$" +consistent!(retdec_0, r"^attachment; filename=(.+)$"); + +// renvsubst-0.1.2: r"(\\)(?P<head>\$[0-9A-Za-z_{])" +consistent!(renvsubst_0, r"(\\)(?P<head>\$[0-9A-Za-z_{])"); + +// renvsubst-0.1.2: r"\$([[:word:]]+)" +consistent!(renvsubst_1, r"\$([[:word:]]+)"); + +// renvsubst-0.1.2: r"\$\{([[:word:]]+)\}" +consistent!(renvsubst_2, r"\$\{([[:word:]]+)\}"); + +// rexpect-0.3.0: r"'[a-z]+'" +consistent!(rexpect_0, r"'[a-z]+'"); + +// rexpect-0.3.0: r"^\d{4}-\d{2}-\d{2}$" +consistent!(rexpect_1, r"^\d{4}-\d{2}-\d{2}$"); + +// rexpect-0.3.0: r"-\d{2}-" +consistent!(rexpect_2, r"-\d{2}-"); + +// luther-0.1.0: "^a(b|c)c*$" +consistent!(luther_0, "^a(b|c)c*$"); + +// little_boxes-1.6.0: r"(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]" +consistent!(little_boxes_0, r"(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]"); + +// libimagentrytag-0.8.0: "^[a-zA-Z]([a-zA-Z0-9_-]*)$" +consistent!(libimagentrytag_0, "^[a-zA-Z]([a-zA-Z0-9_-]*)$"); + +// libimaginteraction-0.8.0: r"^[Yy](\n?)$" +consistent!(libimaginteraction_0, r"^[Yy](\n?)$"); + +// libimaginteraction-0.8.0: r"^[Nn](\n?)$" +consistent!(libimaginteraction_1, r"^[Nn](\n?)$"); + +// libimagutil-0.8.0: "^(?P<KEY>([^=]*))=(.*)$" +consistent!(libimagutil_0, "^(?P<KEY>([^=]*))=(.*)$"); + +// libimagutil-0.8.0: "(.*)=(\"(?P<QVALUE>([^\"]*))\"|(?P<VALUE>(.*)))$" +consistent!(libimagutil_1, "(.*)=(\"(?P<QVALUE>([^\"]*))\"|(?P<VALUE>(.*)))$"); + +// linux_ip-0.1.0: r"\s+" +consistent!(linux_ip_0, r"\s+"); + +// linux_ip-0.1.0: r"\s*[\n\r]+\s*" +consistent!(linux_ip_1, r"\s*[\n\r]+\s*"); + +// linux_ip-0.1.0: r"^([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$" +consistent!(linux_ip_2, r"^([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$"); + +// linux_ip-0.1.0: r"^([0-9a-fA-F\.:/]+|default)\s+via\s+([a-z0-9\.:]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$" +consistent!(linux_ip_3, r"^([0-9a-fA-F\.:/]+|default)\s+via\s+([a-z0-9\.:]+)\s+dev\s+([a-z0-9\.]+)\s*(.*)$"); + +// linux_ip-0.1.0: r"^(blackhole)\s+([0-9a-fA-F\.:/]+)$" +consistent!(linux_ip_4, r"^(blackhole)\s+([0-9a-fA-F\.:/]+)$"); + +// linux_ip-0.1.0: r"^(unreachable)\s+([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s+(.*)$" +consistent!( + linux_ip_5, + r"^(unreachable)\s+([0-9a-fA-F\.:/]+)\s+dev\s+([a-z0-9\.]+)\s+(.*)$" +); + +// linux_ip-0.1.0: r"\s*[\n\r]+\s*" +consistent!(linux_ip_6, r"\s*[\n\r]+\s*"); + +// linux_ip-0.1.0: r"^\d+:\s+([a-zA-Z0-9\.-]+)(@\S+)*:\s+(.*)$" +consistent!(linux_ip_7, r"^\d+:\s+([a-zA-Z0-9\.-]+)(@\S+)*:\s+(.*)$"); + +// linux_ip-0.1.0: r"\s*link/ether\s+([a-f0-9:]+)\s+.*" +consistent!(linux_ip_8, r"\s*link/ether\s+([a-f0-9:]+)\s+.*"); + +// linux_ip-0.1.0: r"\s*inet[6]*\s+([0-9a-f:\./]+)\s+.*" +consistent!(linux_ip_9, r"\s*inet[6]*\s+([0-9a-f:\./]+)\s+.*"); + +// linky-0.1.4: r"[^\w -]" +consistent!(linky_0, r"[^\w -]"); + +// linky-0.1.4: r"^(.*):(\d+): [^ ]* ([^ ]*)$" +consistent!(linky_1, r"^(.*):(\d+): [^ ]* ([^ ]*)$"); + +// limonite-0.2.1: r"^(\d{4}-\d{2}-\d{2})-(\d{3})-(.+)$" +consistent!(limonite_0, r"^(\d{4}-\d{2}-\d{2})-(\d{3})-(.+)$"); + +// process-queue-0.1.1: r"^[a-zA-Z]+$" +consistent!(process_queue_0, r"^[a-zA-Z]+$"); + +// pronghorn-0.1.2: r"^\{([a-zA-Z_]+)\}$" +consistent!(pronghorn_0, r"^\{([a-zA-Z_]+)\}$"); + +// protocol-ftp-client-0.1.1: "(?m:^(\\d{3}) (.+)\r$)" +consistent!(protocol_ftp_client_0, "(?m:^(\\d{3}) (.+)\r$)"); + +// protocol-ftp-client-0.1.1: "\"(.+)\"" +consistent!(protocol_ftp_client_1, "\"(.+)\""); + +// protocol-ftp-client-0.1.1: "(\\w+) [Tt]ype: (\\w+)" +consistent!(protocol_ftp_client_2, "(\\w+) [Tt]ype: (\\w+)"); + +// protocol-ftp-client-0.1.1: "(?m:^(\\d{3})-.+\r$)" +consistent!(protocol_ftp_client_3, "(?m:^(\\d{3})-.+\r$)"); + +// protocol-ftp-client-0.1.1: "Entering Passive Mode \\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+),(\\d+)\\)" +consistent!( + protocol_ftp_client_4, + "Entering Passive Mode \\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+),(\\d+)\\)" +); + +// protocol-ftp-client-0.1.1: "(?m:^(.+)\r$)" +consistent!(protocol_ftp_client_5, "(?m:^(.+)\r$)"); + +// protocol-ftp-client-0.1.1: "^([d-])(?:[rwx-]{3}){3} +\\d+ +\\w+ +\\w+ +(\\d+) +(.+) +(.+)$" +consistent!( + protocol_ftp_client_6, + "^([d-])(?:[rwx-]{3}){3} +\\d+ +\\w+ +\\w+ +(\\d+) +(.+) +(.+)$" +); + +// article-date-extractor-0.1.1: r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})" +consistent!(article_date_extractor_0, r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})"); + +// article-date-extractor-0.1.1: r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date" +consistent!( + article_date_extractor_1, + r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date" +); + +// arthas_plugin-0.1.1: r"type\((.*)\)" +consistent!(arthas_plugin_0, r"type\((.*)\)"); + +// arthas_plugin-0.1.1: r"Vec<(.*)>" +consistent!(arthas_plugin_1, r"Vec<(.*)>"); + +// arthas_plugin-0.1.1: r"Option<(.*)>" +consistent!(arthas_plugin_2, r"Option<(.*)>"); + +// arthas_plugin-0.1.1: r"HashMap<[a-z0-9A-Z]+, *(.*)>" +consistent!(arthas_plugin_3, r"HashMap<[a-z0-9A-Z]+, *(.*)>"); + +// arthas_derive-0.1.0: "Vec *< *(.*) *>" +consistent!(arthas_derive_0, "Vec *< *(.*) *>"); + +// arthas_derive-0.1.0: r"Option *< *(.*) *>" +consistent!(arthas_derive_1, r"Option *< *(.*) *>"); + +// arthas_derive-0.1.0: r"HashMap *< *[a-z0-9A-Z]+ *, *(.*) *>" +consistent!(arthas_derive_2, r"HashMap *< *[a-z0-9A-Z]+ *, *(.*) *>"); + +// arpabet-0.2.0: r"^([\w\-\(\)\.']+)\s+([^\s].*)\s*$" +consistent!(arpabet_0, r"^([\w\-\(\)\.']+)\s+([^\s].*)\s*$"); + +// arpabet-0.2.0: r"^;;;\s+" +consistent!(arpabet_1, r"^;;;\s+"); + +// glossy_codegen-0.2.0: r"/\*.*?\*/|//.*" +consistent!(glossy_codegen_0, r"/\*.*?\*/|//.*"); + +// glossy_codegen-0.2.0: "^\\s*#\\s*include\\s+<([:print:]+)>\\s*$" +consistent!(glossy_codegen_1, "^\\s*#\\s*include\\s+<([:print:]+)>\\s*$"); + +// glossy_codegen-0.2.0: "^\\s*#\\s*include\\s+\"([:print:]+)\"\\s*$" +consistent!(glossy_codegen_2, "^\\s*#\\s*include\\s+\"([:print:]+)\"\\s*$"); + +// glossy_codegen-0.2.0: r"^\s*#\s*version\s+(\d+)" +consistent!(glossy_codegen_3, r"^\s*#\s*version\s+(\d+)"); + +// glossy_codegen-0.2.0: r"^\s*$" +consistent!(glossy_codegen_4, r"^\s*$"); + +// gluster-1.0.1: r"(?P<addr>via \S+)" +consistent!(gluster_0, r"(?P<addr>via \S+)"); + +// gluster-1.0.1: r"(?P<src>src \S+)" +consistent!(gluster_1, r"(?P<src>src \S+)"); + +// gl_helpers-0.1.7: r"(.*)\[\d+\]" +consistent!(gl_helpers_0, r"(.*)\[\d+\]"); + +// gl_helpers-0.1.7: r"(\d+).(\d+)" +consistent!(gl_helpers_1, r"(\d+).(\d+)"); + +// glr-parser-0.0.1: r"(?P<c>[\\\.\+\*\?\(\)\|\[\]\{\}\^\$])" +consistent!(glr_parser_0, r"(?P<c>[\\\.\+\*\?\(\)\|\[\]\{\}\^\$])"); + +// glr-parser-0.0.1: r"^\w+$" +consistent!(glr_parser_1, r"^\w+$"); + +// glr-parser-0.0.1: "'[^']+'" +consistent!(glr_parser_2, "'[^']+'"); + +// hoodlum-0.5.0: r"(?m)//.*" +consistent!(hoodlum_0, r"(?m)//.*"); + +// form-checker-0.2.2: r"^1\d{10}$" +consistent!(form_checker_0, r"^1\d{10}$"); + +// form-checker-0.2.2: r"(?i)^[\w.%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$" +consistent!(form_checker_1, r"(?i)^[\w.%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,4}$"); + +// wikibase-0.2.0: r"(?P<user_agent>[a-zA-Z0-9-_]+/[0-9\.]+)" +consistent!(wikibase_0, r"(?P<user_agent>[a-zA-Z0-9-_]+/[0-9\.]+)"); + +// wifiscanner-0.3.6: r"Cell [0-9]{2,} - Address:" +consistent!(wifiscanner_0, r"Cell [0-9]{2,} - Address:"); + +// wifiscanner-0.3.6: r"([0-9a-zA-Z]{1}[0-9a-zA-Z]{1}[:]{1}){5}[0-9a-zA-Z]{1}[0-9a-zA-Z]{1}" +consistent!( + wifiscanner_1, + r"([0-9a-zA-Z]{1}[0-9a-zA-Z]{1}[:]{1}){5}[0-9a-zA-Z]{1}[0-9a-zA-Z]{1}" +); + +// wifiscanner-0.3.6: r"Signal level=(\d+)/100" +consistent!(wifiscanner_2, r"Signal level=(\d+)/100"); + +// bbcode-1.0.2: r"(?s)\[b\](.*?)\[/b\]" +consistent!(bbcode_0, r"(?s)\[b\](.*?)\[/b\]"); + +// bbcode-1.0.2: r"(?s)\[i\](.*?)\[/i\]" +consistent!(bbcode_1, r"(?s)\[i\](.*?)\[/i\]"); + +// bbcode-1.0.2: r"(?s)\[u\](.*?)\[/u\]" +consistent!(bbcode_2, r"(?s)\[u\](.*?)\[/u\]"); + +// bbcode-1.0.2: r"(?s)\[s\](.*?)\[/s\]" +consistent!(bbcode_3, r"(?s)\[s\](.*?)\[/s\]"); + +// bbcode-1.0.2: r"(?s)\[size=(\d+)](.*?)\[/size\]" +consistent!(bbcode_4, r"(?s)\[size=(\d+)](.*?)\[/size\]"); + +// bbcode-1.0.2: r"(?s)\[color=(.+)](.*?)\[/color\]" +consistent!(bbcode_5, r"(?s)\[color=(.+)](.*?)\[/color\]"); + +// bbcode-1.0.2: r"(?s)\[center\](.*?)\[/center\]" +consistent!(bbcode_6, r"(?s)\[center\](.*?)\[/center\]"); + +// bbcode-1.0.2: r"(?s)\[left\](.*?)\[/left\]" +consistent!(bbcode_7, r"(?s)\[left\](.*?)\[/left\]"); + +// bbcode-1.0.2: r"(?s)\[right\](.*?)\[/right\]" +consistent!(bbcode_8, r"(?s)\[right\](.*?)\[/right\]"); + +// bbcode-1.0.2: r"(?s)\[table\](.*?)\[/table\]" +consistent!(bbcode_9, r"(?s)\[table\](.*?)\[/table\]"); + +// bbcode-1.0.2: r"(?s)\[td\](.*?)\[/td\]" +consistent!(bbcode_10, r"(?s)\[td\](.*?)\[/td\]"); + +// bbcode-1.0.2: r"(?s)\[tr\](.*?)\[/tr\]" +consistent!(bbcode_11, r"(?s)\[tr\](.*?)\[/tr\]"); + +// bbcode-1.0.2: r"(?s)\[th\](.*?)\[/th\]" +consistent!(bbcode_12, r"(?s)\[th\](.*?)\[/th\]"); + +// bbcode-1.0.2: r"(?s)\[url\](.*?)\[/url\]" +consistent!(bbcode_13, r"(?s)\[url\](.*?)\[/url\]"); + +// bbcode-1.0.2: r"(?s)\[url=(.+)\](.*?)\[/url\]" +consistent!(bbcode_14, r"(?s)\[url=(.+)\](.*?)\[/url\]"); + +// bbcode-1.0.2: r"(?s)\[quote\](.*?)\[/quote\]" +consistent!(bbcode_15, r"(?s)\[quote\](.*?)\[/quote\]"); + +// bbcode-1.0.2: r"(?s)\[quote=(.+)\](.*?)\[/quote\]" +consistent!(bbcode_16, r"(?s)\[quote=(.+)\](.*?)\[/quote\]"); + +// bbcode-1.0.2: r"(?s)\[img=(\d+)x(\d+)(\b.*)?\](.*?)\[/img\]" +consistent!(bbcode_17, r"(?s)\[img=(\d+)x(\d+)(\b.*)?\](.*?)\[/img\]"); + +// bbcode-1.0.2: r"(?s)\[img=(.+)(\b.*)?\](.*?)\[/img\]" +consistent!(bbcode_18, r"(?s)\[img=(.+)(\b.*)?\](.*?)\[/img\]"); + +// bbcode-1.0.2: r"(?s)\[img(\b.*)?\](.*?)\[/img\]" +consistent!(bbcode_19, r"(?s)\[img(\b.*)?\](.*?)\[/img\]"); + +// bbcode-1.0.2: r"(?s)\[ol\](.*?)\[/ol\]" +consistent!(bbcode_20, r"(?s)\[ol\](.*?)\[/ol\]"); + +// bbcode-1.0.2: r"(?s)\[ul\](.*?)\[/ul\]" +consistent!(bbcode_21, r"(?s)\[ul\](.*?)\[/ul\]"); + +// bbcode-1.0.2: r"(?s)\[list\](.*?)\[/list\]" +consistent!(bbcode_22, r"(?s)\[list\](.*?)\[/list\]"); + +// bbcode-1.0.2: r"(?s)\[youtube\](.*?)\[/youtube\]" +consistent!(bbcode_23, r"(?s)\[youtube\](.*?)\[/youtube\]"); + +// bbcode-1.0.2: r"(?s)\[youtube=(\d+)x(\d+)\](.*?)\[/youtube\]" +consistent!(bbcode_24, r"(?s)\[youtube=(\d+)x(\d+)\](.*?)\[/youtube\]"); + +// bbcode-1.0.2: r"(?s)\[li\](.*?)\[/li\]" +consistent!(bbcode_25, r"(?s)\[li\](.*?)\[/li\]"); + +// block-utils-0.5.0: r"loop\d+" +consistent!(block_utils_0, r"loop\d+"); + +// block-utils-0.5.0: r"ram\d+" +consistent!(block_utils_1, r"ram\d+"); + +// block-utils-0.5.0: r"md\d+" +consistent!(block_utils_2, r"md\d+"); + +// kvvliveapi-0.1.0: r"^([1-9]) min$" +consistent!(kvvliveapi_0, r"^([1-9]) min$"); + +// rfc822_sanitizer-0.3.3: r"(\d{2}):(\d{2}):(\d{2})" +consistent!(rfc822_sanitizer_0, r"(\d{2}):(\d{2}):(\d{2})"); + +// rfc822_sanitizer-0.3.3: r"(\d{1,2}):(\d{1,2}):(\d{1,2})" +consistent!(rfc822_sanitizer_1, r"(\d{1,2}):(\d{1,2}):(\d{1,2})"); + +// faker-0.0.4: r"[2-9]" +consistent!(faker_0, r"[2-9]"); + +// faker-0.0.4: r"[1-9]" +consistent!(faker_1, r"[1-9]"); + +// faker-0.0.4: r"[0-9]" +consistent!(faker_2, r"[0-9]"); + +// faker-0.0.4: r"\d{10}" +consistent!(faker_3, r"\d{10}"); + +// faker-0.0.4: r"\d{1}" +consistent!(faker_4, r"\d{1}"); + +// faker-0.0.4: r"^\w+" +consistent!(faker_5, r"^\w+"); + +// faker-0.0.4: r"^\w+" +consistent!(faker_6, r"^\w+"); + +// faker-0.0.4: r"^(\w+\.? ?){2,3}$" +consistent!(faker_7, r"^(\w+\.? ?){2,3}$"); + +// faker-0.0.4: r"^[A-Z][a-z]+\.?$" +consistent!(faker_8, r"^[A-Z][a-z]+\.?$"); + +// faker-0.0.4: r"^[A-Z][A-Za-z]*\.?$" +consistent!(faker_9, r"^[A-Z][A-Za-z]*\.?$"); + +// faker-0.0.4: r"http://lorempixel.com/100/100/\w+" +consistent!(faker_10, r"http://lorempixel.com/100/100/\w+"); + +// faker-0.0.4: r"http://lorempixel.com/100/100/cats" +consistent!(faker_11, r"http://lorempixel.com/100/100/cats"); + +// fancy-regex-0.1.0: "(?i:ß)" +consistent!(fancy_regex_0, "(?i:ß)"); + +// fancy-regex-0.1.0: "(?i:\\x{0587})" +consistent!(fancy_regex_1, "(?i:\\x{0587})"); + +// fancy-regex-0.1.0: "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})" +consistent!(fancy_regex_2, "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})"); + +// fancy-prompt-0.1.5: r"/([^/])[^/]+/" +consistent!(fancy_prompt_0, r"/([^/])[^/]+/"); + +// fancy-prompt-0.1.5: r"^([^:]+):.*?(?::([^:]+))?$" +consistent!(fancy_prompt_1, r"^([^:]+):.*?(?::([^:]+))?$"); + +// fanta-0.2.0: r"^(/?__\w+__)/(.*)" +consistent!(fanta_0, r"^(/?__\w+__)/(.*)"); + +// fanta-cli-0.1.1: r"(.)([A-Z])" +consistent!(fanta_cli_0, r"(.)([A-Z])"); + +// fanta-cli-0.1.1: "\\{:[^\\s]+\\}" +consistent!(fanta_cli_1, "\\{:[^\\s]+\\}"); + +// amethyst_tools-0.7.1: "(?P<last>[^\r])\n" +consistent!(amethyst_tools_0, "(?P<last>[^\r])\n"); + +// amigo-0.3.1: r"^-?\d+(\.\d)?" +consistent!(amigo_0, r"^-?\d+(\.\d)?"); + +// amigo-0.3.1: r"^[a-zA-Z_]+[\w-]*[!?_]?" +consistent!(amigo_1, r"^[a-zA-Z_]+[\w-]*[!?_]?"); + +// amigo-0.3.1: r"^\(" +consistent!(amigo_2, r"^\("); + +// amigo-0.3.1: r"^\)" +consistent!(amigo_3, r"^\)"); + +// amigo-0.3.1: r"^\s+" +consistent!(amigo_4, r"^\s+"); + +// ethcore-logger-1.12.0: "\x1b\\[[^m]+m" +consistent!(ethcore_logger_0, "\x1b\\[[^m]+m"); + +// dash2html-1.0.1: r"__.*?__" +consistent!(dash2html_0, r"__.*?__"); + +// dash2html-1.0.1: r"(?i)@(?:time|clipboard|cursor|date)" +consistent!(dash2html_1, r"(?i)@(?:time|clipboard|cursor|date)"); + +// os_type-2.0.0: r"^Microsoft Windows \[Version\s(\d+\.\d+\.\d+)\]$" +consistent!(os_type_0, r"^Microsoft Windows \[Version\s(\d+\.\d+\.\d+)\]$"); + +// os_type-2.0.0: r"ProductName:\s([\w\s]+)\n" +consistent!(os_type_1, r"ProductName:\s([\w\s]+)\n"); + +// os_type-2.0.0: r"ProductVersion:\s(\w+\.\w+\.\w+)" +consistent!(os_type_2, r"ProductVersion:\s(\w+\.\w+\.\w+)"); + +// os_type-2.0.0: r"BuildVersion:\s(\w+)" +consistent!(os_type_3, r"BuildVersion:\s(\w+)"); + +// os_type-2.0.0: r"(\w+) Linux release" +consistent!(os_type_4, r"(\w+) Linux release"); + +// os_type-2.0.0: r"release\s([\w\.]+)" +consistent!(os_type_5, r"release\s([\w\.]+)"); + +// os_type-2.0.0: r"Distributor ID:\s(\w+)" +consistent!(os_type_6, r"Distributor ID:\s(\w+)"); + +// os_type-2.0.0: r"Release:\s([\w\.]+)" +consistent!(os_type_7, r"Release:\s([\w\.]+)"); + +// bindgen-0.37.0: r"typename type\-parameter\-\d+\-\d+::.+" +consistent!(bindgen_0, r"typename type\-parameter\-\d+\-\d+::.+"); + +// imap-0.8.1: "^+(.*)\r\n" +consistent!(imap_0, "^+(.*)\r\n"); + +// image-base64-0.1.0: r"^ffd8ffe0" +consistent!(image_base64_0, r"^ffd8ffe0"); + +// image-base64-0.1.0: r"^89504e47" +consistent!(image_base64_1, r"^89504e47"); + +// image-base64-0.1.0: r"^47494638" +consistent!(image_base64_2, r"^47494638"); + +// json-pointer-0.3.2: "^(/([^/~]|~[01])*)*$" +consistent!(json_pointer_0, "^(/([^/~]|~[01])*)*$"); + +// json-pointer-0.3.2: "^#(/([^/~%]|~[01]|%[0-9a-fA-F]{2})*)*$" +consistent!(json_pointer_1, "^#(/([^/~%]|~[01]|%[0-9a-fA-F]{2})*)*$"); + +// mysql_common-0.7.0: r"^5.5.5-(\d{1,2})\.(\d{1,2})\.(\d{1,3})-MariaDB" +consistent!(mysql_common_0, r"^5.5.5-(\d{1,2})\.(\d{1,2})\.(\d{1,3})-MariaDB"); + +// mysql_common-0.7.0: r"^(\d{1,2})\.(\d{1,2})\.(\d{1,3})(.*)" +consistent!(mysql_common_1, r"^(\d{1,2})\.(\d{1,2})\.(\d{1,3})(.*)"); + +// government_id-0.1.0: r"^[0-9]{4}[0-9A-Z]{2}[0-9]{3}$" +consistent!(government_id_0, r"^[0-9]{4}[0-9A-Z]{2}[0-9]{3}$"); + +// ohmers-0.1.1: r"UniqueIndexViolation: (\w+)" +consistent!(ohmers_0, r"UniqueIndexViolation: (\w+)"); + +// eliza-1.0.0: r"(.*) you are (.*)" +consistent!(eliza_0, r"(.*) you are (.*)"); + +// eliza-1.0.0: r"(.*) you are (.*)" +consistent!(eliza_1, r"(.*) you are (.*)"); + +// eliza-1.0.0: r"(.*) you are (.*)" +consistent!(eliza_2, r"(.*) you are (.*)"); + +// chema-0.0.5: "^\\s*\\*" +consistent!(chema_0, "^\\s*\\*"); + +// chema-0.0.5: "^\\s*@(\\w+)\\s+(.*)" +consistent!(chema_1, "^\\s*@(\\w+)\\s+(.*)"); + +// chord3-0.3.0: r"^\s*#" +consistent!(chord3_0, r"^\s*#"); + +// chord3-0.3.0: r"\{(?P<cmd>\w+)(?::?\s*(?P<arg>.*))?\}" +consistent!(chord3_1, r"\{(?P<cmd>\w+)(?::?\s*(?P<arg>.*))?\}"); + +// chord3-0.3.0: r"\{(eot|end_of_tab):?\s*" +consistent!(chord3_2, r"\{(eot|end_of_tab):?\s*"); + +// chord3-0.3.0: r"([^\[]*)(?:\[([^\]]*)\])?" +consistent!(chord3_3, r"([^\[]*)(?:\[([^\]]*)\])?"); + +// checkmail-0.1.1: "^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$" +consistent!(checkmail_0, "^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$"); + +// cntk-0.2.1: r"\b\w\w+\b" +consistent!(cntk_0, r"\b\w\w+\b"); + +// cntk-0.2.1: r"\b\w\w+\b" +consistent!(cntk_1, r"\b\w\w+\b"); + +// cniguru-0.1.0: r"\(id: (\d+)\)" +consistent!(cniguru_0, r"\(id: (\d+)\)"); + +// upm_lib-0.3.0: r"^(\d+)\.(\d+)\.(\d+)(?:-([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?(?:\+([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?$" +consistent!(upm_lib_0, r"^(\d+)\.(\d+)\.(\d+)(?:-([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?(?:\+([\dA-Za-z-]+(?:\.[\dA-Za-z-]+)*))?$"); + +// avro-0.2.1: r"^\s*(\*+(\s+))?" +consistent!(avro_0, r"^\s*(\*+(\s+))?"); + +// avro-0.2.1: r"^\s*(\*+)?" +consistent!(avro_1, r"^\s*(\*+)?"); + +// nomi-0.0.2: "[0-9]+" +consistent!(nomi_0, "[0-9]+"); + +// nodes-0.1.0: "([0-9]+)@(?:nodes|n)?:([^@]+)?" +consistent!(nodes_0, "([0-9]+)@(?:nodes|n)?:([^@]+)?"); + +// not-stakkr-1.0.0: r"(?i)in (\d+) (second|minute|hour|day|week)s?" +consistent!(not_stakkr_0, r"(?i)in (\d+) (second|minute|hour|day|week)s?"); + +// notetxt-0.0.1: "^([A-Za-z0-9 -_:]+)\n-+\n" +consistent!(notetxt_0, "^([A-Za-z0-9 -_:]+)\n-+\n"); + +// nail-0.1.0-pre.0: r"^-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?$" +consistent!(nail_0, r"^-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?$"); + +// nail-0.1.0-pre.0: r"^-?[0-9]+$" +consistent!(nail_1, r"^-?[0-9]+$"); + +// askalono-0.2.0: r"[^\w\s\pP]+" +consistent!(askalono_0, r"[^\w\s\pP]+"); + +// askalono-0.2.0: r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+" +consistent!(askalono_1, r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+"); + +// askalono-0.2.0: r"\p{Pd}+" +consistent!(askalono_2, r"\p{Pd}+"); + +// askalono-0.2.0: r"\p{Ps}+" +consistent!(askalono_3, r"\p{Ps}+"); + +// askalono-0.2.0: r"\p{Pe}+" +consistent!(askalono_4, r"\p{Pe}+"); + +// askalono-0.2.0: r"\p{Pc}+" +consistent!(askalono_5, r"\p{Pc}+"); + +// askalono-0.2.0: r"[©Ⓒⓒ]" +consistent!(askalono_6, r"[©Ⓒⓒ]"); + +// askalono-0.2.0: r"[\r\n\v\f]" +consistent!(askalono_7, r"[\r\n\v\f]"); + +// askalono-0.2.0: r"\n{3,}" +consistent!(askalono_8, r"\n{3,}"); + +// askalono-0.2.0: r"[^\w\s]+" +consistent!(askalono_9, r"[^\w\s]+"); + +// askalono-0.2.0: r"\s+" +consistent!(askalono_10, r"\s+"); + +// assembunny_plus-0.0.3: r"[^0-9a-zA-Z_]" +consistent!(assembunny_plus_0, r"[^0-9a-zA-Z_]"); + +// assembunny_plus-0.0.3: r"[0-9]" +consistent!(assembunny_plus_1, r"[0-9]"); + +// salt-compressor-0.4.0: r"(?m)^Minion (\S*) did not respond\. No job will be sent\.$" +consistent!( + salt_compressor_0, + r"(?m)^Minion (\S*) did not respond\. No job will be sent\.$" +); + +// sabisabi-0.4.1: r"</?[^>]+?>" +consistent!(sabisabi_0, r"</?[^>]+?>"); + +// sabisabi-0.4.1: r"\([^)]*\)" +consistent!(sabisabi_1, r"\([^)]*\)"); + +// sassers-0.13.5-h28: "@import \"([^\"]*)\";" +consistent!(sassers_0, "@import \"([^\"]*)\";"); + +// shadowsocks-0.6.2: r"[A-Za-z\d-]{1,63}$" +consistent!(shadowsocks_0, r"[A-Za-z\d-]{1,63}$"); + +// shkeleton-0.1.5: "[abc]+" +consistent!(shkeleton_0, "[abc]+"); + +// shellwords-0.1.0: r"([^A-Za-z0-9_\-.,:/@\n])" +consistent!(shellwords_0, r"([^A-Za-z0-9_\-.,:/@\n])"); + +// shellwords-0.1.0: r"\n" +consistent!(shellwords_1, r"\n"); + +// shush-0.1.5: "(?P<num>[0-9]+)(?P<units>[dhms])" +consistent!(shush_0, "(?P<num>[0-9]+)(?P<units>[dhms])"); + +// woothee-0.8.0: r"(?:Chrome|CrMo|CriOS)/([.0-9]+)" +consistent!(woothee_0, r"(?:Chrome|CrMo|CriOS)/([.0-9]+)"); + +// woothee-0.8.0: r"Vivaldi/([.0-9]+)" +consistent!(woothee_1, r"Vivaldi/([.0-9]+)"); + +// woothee-0.8.0: r"Firefox/([.0-9]+)" +consistent!(woothee_2, r"Firefox/([.0-9]+)"); + +// woothee-0.8.0: r"^Mozilla/[.0-9]+ \((?:Mobile|Tablet);(?:.*;)? rv:([.0-9]+)\) Gecko/[.0-9]+ Firefox/[.0-9]+$" +consistent!(woothee_3, r"^Mozilla/[.0-9]+ \((?:Mobile|Tablet);(?:.*;)? rv:([.0-9]+)\) Gecko/[.0-9]+ Firefox/[.0-9]+$"); + +// woothee-0.8.0: r"FxiOS/([.0-9]+)" +consistent!(woothee_4, r"FxiOS/([.0-9]+)"); + +// woothee-0.8.0: r"\(([^;)]+);FOMA;" +consistent!(woothee_5, r"\(([^;)]+);FOMA;"); + +// woothee-0.8.0: r"jig browser[^;]+; ([^);]+)" +consistent!(woothee_6, r"jig browser[^;]+; ([^);]+)"); + +// woothee-0.8.0: r"(?i)rss(?:reader|bar|[-_ /;()]|[ +]*/)" +consistent!(woothee_7, r"(?i)rss(?:reader|bar|[-_ /;()]|[ +]*/)"); + +// woothee-0.8.0: r"(?i)(?:bot|crawler|spider)(?:[-_ ./;@()]|$)" +consistent!(woothee_8, r"(?i)(?:bot|crawler|spider)(?:[-_ ./;@()]|$)"); + +// woothee-0.8.0: r"(?i)(?:feed|web) ?parser" +consistent!(woothee_9, r"(?i)(?:feed|web) ?parser"); + +// woothee-0.8.0: r"(?i)watch ?dog" +consistent!(woothee_10, r"(?i)watch ?dog"); + +// woothee-0.8.0: r"Edge/([.0-9]+)" +consistent!(woothee_11, r"Edge/([.0-9]+)"); + +// woothee-0.8.0: r"MSIE ([.0-9]+);" +consistent!(woothee_12, r"MSIE ([.0-9]+);"); + +// woothee-0.8.0: r"Version/([.0-9]+)" +consistent!(woothee_13, r"Version/([.0-9]+)"); + +// woothee-0.8.0: r"Opera[/ ]([.0-9]+)" +consistent!(woothee_14, r"Opera[/ ]([.0-9]+)"); + +// woothee-0.8.0: r"OPR/([.0-9]+)" +consistent!(woothee_15, r"OPR/([.0-9]+)"); + +// woothee-0.8.0: r"Version/([.0-9]+)" +consistent!(woothee_16, r"Version/([.0-9]+)"); + +// woothee-0.8.0: r"(?:SoftBank|Vodafone|J-PHONE)/[.0-9]+/([^ /;()]+)" +consistent!(woothee_17, r"(?:SoftBank|Vodafone|J-PHONE)/[.0-9]+/([^ /;()]+)"); + +// woothee-0.8.0: r"Trident/([.0-9]+);" +consistent!(woothee_18, r"Trident/([.0-9]+);"); + +// woothee-0.8.0: r" rv:([.0-9]+)" +consistent!(woothee_19, r" rv:([.0-9]+)"); + +// woothee-0.8.0: r"IEMobile/([.0-9]+);" +consistent!(woothee_20, r"IEMobile/([.0-9]+);"); + +// woothee-0.8.0: r"(?:WILLCOM|DDIPOCKET);[^/]+/([^ /;()]+)" +consistent!(woothee_21, r"(?:WILLCOM|DDIPOCKET);[^/]+/([^ /;()]+)"); + +// woothee-0.8.0: r"Windows ([ .a-zA-Z0-9]+)[;\\)]" +consistent!(woothee_22, r"Windows ([ .a-zA-Z0-9]+)[;\\)]"); + +// woothee-0.8.0: r"^Phone(?: OS)? ([.0-9]+)" +consistent!(woothee_23, r"^Phone(?: OS)? ([.0-9]+)"); + +// woothee-0.8.0: r"iP(hone;|ad;|od) .*like Mac OS X" +consistent!(woothee_24, r"iP(hone;|ad;|od) .*like Mac OS X"); + +// woothee-0.8.0: r"Version/([.0-9]+)" +consistent!(woothee_25, r"Version/([.0-9]+)"); + +// woothee-0.8.0: r"rv:(\d+\.\d+\.\d+)" +consistent!(woothee_26, r"rv:(\d+\.\d+\.\d+)"); + +// woothee-0.8.0: r"FreeBSD ([^;\)]+);" +consistent!(woothee_27, r"FreeBSD ([^;\)]+);"); + +// woothee-0.8.0: r"CrOS ([^\)]+)\)" +consistent!(woothee_28, r"CrOS ([^\)]+)\)"); + +// woothee-0.8.0: r"Android[- ](\d+\.\d+(?:\.\d+)?)" +consistent!(woothee_29, r"Android[- ](\d+\.\d+(?:\.\d+)?)"); + +// woothee-0.8.0: r"PSP \(PlayStation Portable\); ([.0-9]+)\)" +consistent!(woothee_30, r"PSP \(PlayStation Portable\); ([.0-9]+)\)"); + +// woothee-0.8.0: r"PLAYSTATION 3;? ([.0-9]+)\)" +consistent!(woothee_31, r"PLAYSTATION 3;? ([.0-9]+)\)"); + +// woothee-0.8.0: r"PlayStation Vita ([.0-9]+)\)" +consistent!(woothee_32, r"PlayStation Vita ([.0-9]+)\)"); + +// woothee-0.8.0: r"PlayStation 4 ([.0-9]+)\)" +consistent!(woothee_33, r"PlayStation 4 ([.0-9]+)\)"); + +// woothee-0.8.0: r"BB10(?:.+)Version/([.0-9]+) " +consistent!(woothee_34, r"BB10(?:.+)Version/([.0-9]+) "); + +// woothee-0.8.0: r"BlackBerry(?:\d+)/([.0-9]+) " +consistent!(woothee_35, r"BlackBerry(?:\d+)/([.0-9]+) "); + +// woothee-0.8.0: r"; CPU(?: iPhone)? OS (\d+_\d+(?:_\d+)?) like Mac OS X" +consistent!( + woothee_36, + r"; CPU(?: iPhone)? OS (\d+_\d+(?:_\d+)?) like Mac OS X" +); + +// woothee-0.8.0: r"Mac OS X (10[._]\d+(?:[._]\d+)?)(?:\)|;)" +consistent!(woothee_37, r"Mac OS X (10[._]\d+(?:[._]\d+)?)(?:\)|;)"); + +// woothee-0.8.0: r"^(?:Apache-HttpClient/|Jakarta Commons-HttpClient/|Java/)" +consistent!( + woothee_38, + r"^(?:Apache-HttpClient/|Jakarta Commons-HttpClient/|Java/)" +); + +// woothee-0.8.0: r"[- ]HttpClient(/|$)" +consistent!(woothee_39, r"[- ]HttpClient(/|$)"); + +// woothee-0.8.0: r"^(?:PHP|WordPress|CakePHP|PukiWiki|PECL::HTTP)(?:/| |$)" +consistent!( + woothee_40, + r"^(?:PHP|WordPress|CakePHP|PukiWiki|PECL::HTTP)(?:/| |$)" +); + +// woothee-0.8.0: r"(?:PEAR HTTP_Request|HTTP_Request)(?: class|2)" +consistent!(woothee_41, r"(?:PEAR HTTP_Request|HTTP_Request)(?: class|2)"); + +// woothee-0.8.0: r"(?:Rome Client |UnwindFetchor/|ia_archiver |Summify |PostRank/)" +consistent!( + woothee_42, + r"(?:Rome Client |UnwindFetchor/|ia_archiver |Summify |PostRank/)" +); + +// woothee-0.8.0: r"Sleipnir/([.0-9]+)" +consistent!(woothee_43, r"Sleipnir/([.0-9]+)"); + +// word_replace-0.0.3: r"@@[a-z|A-Z|\d]+@@" +consistent!(word_replace_0, r"@@[a-z|A-Z|\d]+@@"); + +// wordcount-0.1.0: r"\w+" +consistent!(wordcount_0, r"\w+"); + +// just-0.3.12: "^([^=]+)=(.*)$" +consistent!(just_0, "^([^=]+)=(.*)$"); + +// emote-0.1.0: r":[a-zA-Z_]+?:" +consistent!(emote_0, r":[a-zA-Z_]+?:"); + +// emojicons-1.0.1: r":([a-zA-Z0-9_+-]+):" +consistent!(emojicons_0, r":([a-zA-Z0-9_+-]+):"); + +// git2_codecommit-0.1.2: r"git-codecommit\.([a-z0-9-]+)\.amazonaws\.com" +consistent!( + git2_codecommit_0, + r"git-codecommit\.([a-z0-9-]+)\.amazonaws\.com" +); + +// git-workarea-3.1.2: r"^submodule\.(?P<name>.*)\.(?P<key>[^=]*)=(?P<value>.*)$" +consistent!( + git_workarea_0, + r"^submodule\.(?P<name>.*)\.(?P<key>[^=]*)=(?P<value>.*)$" +); + +// git-shell-enforce-directory-1.0.0: r"^(?P<command>git-(?:receive|upload)-pack) '(?P<path>.+)'$" +consistent!( + git_shell_enforce_directory_0, + r"^(?P<command>git-(?:receive|upload)-pack) '(?P<path>.+)'$" +); + +// git-journal-1.6.3: r"[ \n]:(.*?):" +consistent!(git_journal_0, r"[ \n]:(.*?):"); + +// git-find-0.3.2: r"^git@(?P<host>[[:alnum:]\._-]+):(?P<path>[[:alnum:]\._\-/]+).git$" +consistent!( + git_find_0, + r"^git@(?P<host>[[:alnum:]\._-]+):(?P<path>[[:alnum:]\._\-/]+).git$" +); + +// gitlab-api-0.6.0: r"private_token=\w{20}" +consistent!(gitlab_api_0, r"private_token=\w{20}"); + +// td-client-0.7.0: "^(http://|https://)" +consistent!(td_client_0, "^(http://|https://)"); + +// karaconv-0.3.0: r"--(?P<type>[a-zA-Z]+)-- (?P<contents>.*)" +consistent!(karaconv_0, r"--(?P<type>[a-zA-Z]+)-- (?P<contents>.*)"); + +// katana-1.0.2: r"(?P<comp>et al\.)(?:\.)" +consistent!(katana_0, r"(?P<comp>et al\.)(?:\.)"); + +// katana-1.0.2: r"\.{3}" +consistent!(katana_1, r"\.{3}"); + +// katana-1.0.2: r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)" +consistent!(katana_2, r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)"); + +// katana-1.0.2: r"\s\.(?P<nums>[0-9]+)" +consistent!(katana_3, r"\s\.(?P<nums>[0-9]+)"); + +// katana-1.0.2: r"(?:[A-Za-z]\.){2,}" +consistent!(katana_4, r"(?:[A-Za-z]\.){2,}"); + +// katana-1.0.2: r"(?P<init>[A-Z])(?P<point>\.)" +consistent!(katana_5, r"(?P<init>[A-Z])(?P<point>\.)"); + +// katana-1.0.2: r"(?P<title>[A-Z][a-z]{1,3})(\.)" +consistent!(katana_6, r"(?P<title>[A-Z][a-z]{1,3})(\.)"); + +// katana-1.0.2: r"&==&(?P<p>[.!?])" +consistent!(katana_7, r"&==&(?P<p>[.!?])"); + +// katana-1.0.2: r"&\^&(?P<p>[.!?])" +consistent!(katana_8, r"&\^&(?P<p>[.!?])"); + +// katana-1.0.2: r"&\*\*&(?P<p>[.!?])" +consistent!(katana_9, r"&\*\*&(?P<p>[.!?])"); + +// katana-1.0.2: r"&=&(?P<p>[.!?])" +consistent!(katana_10, r"&=&(?P<p>[.!?])"); + +// katana-1.0.2: r"&##&(?P<p>[.!?])" +consistent!(katana_11, r"&##&(?P<p>[.!?])"); + +// katana-1.0.2: r"&\$&(?P<p>[.!?])" +consistent!(katana_12, r"&\$&(?P<p>[.!?])"); + +// kailua_syntax-1.1.0: r"@(?:_|\d+(?:/\d+(?:-\d+)?)?)" +consistent!(kailua_syntax_0, r"@(?:_|\d+(?:/\d+(?:-\d+)?)?)"); + +// kailua_syntax-1.1.0: r"<(\d+)>" +consistent!(kailua_syntax_1, r"<(\d+)>"); + +// ftp-3.0.1: r"\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\)" +consistent!(ftp_0, r"\((\d+),(\d+),(\d+),(\d+),(\d+),(\d+)\)"); + +// ftp-3.0.1: r"\b(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})\b" +consistent!(ftp_1, r"\b(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})\b"); + +// ftp-3.0.1: r"\s+(\d+)\s*$" +consistent!(ftp_2, r"\s+(\d+)\s*$"); + +// vat-0.1.0: r"<countryCode>(.*?)</countryCode>" +consistent!(vat_0, r"<countryCode>(.*?)</countryCode>"); + +// vat-0.1.0: r"<vatNumber>(.*?)</vatNumber>" +consistent!(vat_1, r"<vatNumber>(.*?)</vatNumber>"); + +// vat-0.1.0: r"<name>(.*?)</name>" +consistent!(vat_2, r"<name>(.*?)</name>"); + +// vat-0.1.0: r"<address>(?s)(.*?)(?-s)</address>" +consistent!(vat_3, r"<address>(?s)(.*?)(?-s)</address>"); + +// vat-0.1.0: r"<valid>(true|false)</valid>" +consistent!(vat_4, r"<valid>(true|false)</valid>"); + +// vat-0.1.0: r"^ATU\d{8}$" +consistent!(vat_5, r"^ATU\d{8}$"); + +// vat-0.1.0: r"^BE0?\d{9, 10}$" +consistent!(vat_6, r"^BE0?\d{9, 10}$"); + +// vat-0.1.0: r"^BG\d{9,10}$" +consistent!(vat_7, r"^BG\d{9,10}$"); + +// vat-0.1.0: r"^HR\d{11}$" +consistent!(vat_8, r"^HR\d{11}$"); + +// vat-0.1.0: r"^CY\d{8}[A-Z]$" +consistent!(vat_9, r"^CY\d{8}[A-Z]$"); + +// vat-0.1.0: r"^CZ\d{8,10}$" +consistent!(vat_10, r"^CZ\d{8,10}$"); + +// vat-0.1.0: r"^DK\d{8}$" +consistent!(vat_11, r"^DK\d{8}$"); + +// vat-0.1.0: r"^EE\d{9}$" +consistent!(vat_12, r"^EE\d{9}$"); + +// vat-0.1.0: r"^FI\d{8}$" +consistent!(vat_13, r"^FI\d{8}$"); + +// vat-0.1.0: r"^FR[A-HJ-NP-Z0-9][A-HJ-NP-Z0-9]\d{9}$" +consistent!(vat_14, r"^FR[A-HJ-NP-Z0-9][A-HJ-NP-Z0-9]\d{9}$"); + +// vat-0.1.0: r"^DE\d{9}$" +consistent!(vat_15, r"^DE\d{9}$"); + +// vat-0.1.0: r"^EL\d{9}$" +consistent!(vat_16, r"^EL\d{9}$"); + +// vat-0.1.0: r"^HU\d{8}$" +consistent!(vat_17, r"^HU\d{8}$"); + +// vat-0.1.0: r"^IE\d[A-Z0-9\+\*]\d{5}[A-Z]{1,2}$" +consistent!(vat_18, r"^IE\d[A-Z0-9\+\*]\d{5}[A-Z]{1,2}$"); + +// vat-0.1.0: r"^IT\d{11}$" +consistent!(vat_19, r"^IT\d{11}$"); + +// vat-0.1.0: r"^LV\d{11}$" +consistent!(vat_20, r"^LV\d{11}$"); + +// vat-0.1.0: r"^LT(\d{9}|\d{12})$" +consistent!(vat_21, r"^LT(\d{9}|\d{12})$"); + +// vat-0.1.0: r"^LU\d{8}$" +consistent!(vat_22, r"^LU\d{8}$"); + +// vat-0.1.0: r"^MT\d{8}$" +consistent!(vat_23, r"^MT\d{8}$"); + +// vat-0.1.0: r"^NL\d{9}B\d{2}$" +consistent!(vat_24, r"^NL\d{9}B\d{2}$"); + +// vat-0.1.0: r"^PL\d{10}$" +consistent!(vat_25, r"^PL\d{10}$"); + +// vat-0.1.0: r"^PT\d{9}$" +consistent!(vat_26, r"^PT\d{9}$"); + +// vat-0.1.0: r"^RO\d{2,10}$" +consistent!(vat_27, r"^RO\d{2,10}$"); + +// vat-0.1.0: r"^SK\d{10}$" +consistent!(vat_28, r"^SK\d{10}$"); + +// vat-0.1.0: r"^SI\d{8}$" +consistent!(vat_29, r"^SI\d{8}$"); + +// vat-0.1.0: r"^ES[A-Z0-9]\d{7}[A-Z0-9]$" +consistent!(vat_30, r"^ES[A-Z0-9]\d{7}[A-Z0-9]$"); + +// vat-0.1.0: r"^SE\d{10}01$" +consistent!(vat_31, r"^SE\d{10}01$"); + +// vat-0.1.0: r"^(GB(GD|HA)\d{3}|GB\d{9}|GB\d{12})$" +consistent!(vat_32, r"^(GB(GD|HA)\d{3}|GB\d{9}|GB\d{12})$"); + +// eve-0.1.1: r"\{\{(.*)\}\}" +consistent!(eve_0, r"\{\{(.*)\}\}"); + +// egc-0.1.2: "^mio" +consistent!(egc_0, "^mio"); + +// pew-0.2.3: "" +consistent!(pew_0, ""); + +// pew-0.2.3: "" +consistent!(pew_1, ""); + +// mob-0.4.3: "y" +consistent!(mob_0, "y"); + +// lit-0.2.8: "@([a-z]+)" +consistent!(lit_0, "@([a-z]+)"); + +// lit-0.2.8: "([A-Z-]+):(.*)" +consistent!(lit_1, "([A-Z-]+):(.*)"); + +// lit-0.2.8: "^[a-zA-Z_][a-zA-Z0-9_]*$" +consistent!(lit_2, "^[a-zA-Z_][a-zA-Z0-9_]*$"); + +// avm-1.0.1: r"\d+\.\d+\.\d+" +consistent!(avm_0, r"\d+\.\d+\.\d+"); + +// avm-1.0.1: r"\d+\.\d+\.\d+" +consistent!(avm_1, r"\d+\.\d+\.\d+"); + +// orm-0.2.0: r"^Vec<(.+)>$" +consistent!(orm_0, r"^Vec<(.+)>$"); + +// sgf-0.1.5: r"\\(\r\n|\n\r|\n|\r)" +consistent!(sgf_0, r"\\(\r\n|\n\r|\n|\r)"); + +// sgf-0.1.5: r"\\(.)" +consistent!(sgf_1, r"\\(.)"); + +// sgf-0.1.5: r"\r\n|\n\r|\n|\r" +consistent!(sgf_2, r"\r\n|\n\r|\n|\r"); + +// sgf-0.1.5: r"([\]\\:])" +consistent!(sgf_3, r"([\]\\:])"); + +// dok-0.2.0: "^Bearer realm=\"(.+?)\",service=\"(.+?)\",scope=\"(.+?)\"$" +consistent!( + dok_0, + "^Bearer realm=\"(.+?)\",service=\"(.+?)\",scope=\"(.+?)\"$" +); + +// d20-0.1.0: r"([+-]?\s*\d+[dD]\d+|[+-]?\s*\d+)" +consistent!(d20_0, r"([+-]?\s*\d+[dD]\d+|[+-]?\s*\d+)"); + +// dvb-0.3.0: "E" +consistent!(dvb_0, "E"); + +// dvb-0.3.0: "^F" +consistent!(dvb_1, "^F"); + +// dvb-0.3.0: "^S" +consistent!(dvb_2, "^S"); + +// ger-0.2.0: r"Change-Id: (I[a-f0-9]{40})$" +consistent!(ger_0, r"Change-Id: (I[a-f0-9]{40})$"); + +// ger-0.2.0: r"(refs|ref|fix|fixes|close|closes)\s+([A-Z]{2,5}-[0-9]{1,5})$" +consistent!( + ger_1, + r"(refs|ref|fix|fixes|close|closes)\s+([A-Z]{2,5}-[0-9]{1,5})$" +); + +// n5-0.2.1: r"(\d+)(\.(\d+))?(\.(\d+))?(.*)" +consistent!(n5_0, r"(\d+)(\.(\d+))?(\.(\d+))?(.*)"); + +// po-0.1.4: r"[A-Za-z0-9]" +consistent!(po_0, r"[A-Za-z0-9]"); + +// carnix-0.8.5: "path is (‘|')?([^’'\n]*)(’|')?" +consistent!(carnix_0, "path is (‘|')?([^’'\n]*)(’|')?"); + +// carnix-0.8.5: r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?(.*)?" +consistent!(carnix_1, r"^(\S*) (\d*)\.(\d*)\.(\d*)(-(\S*))?(.*)?"); + +// carnix-0.8.5: r"(\d*)\.(\d*)\.(\d*)(-(\S*))?" +consistent!(carnix_2, r"(\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// carnix-0.8.5: r"(\S*)-(\d*)\.(\d*)\.(\d*)(-(\S*))?" +consistent!(carnix_3, r"(\S*)-(\d*)\.(\d*)\.(\d*)(-(\S*))?"); + +// caseless-0.2.1: r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$" +consistent!(caseless_0, r"^# CaseFolding-(\d+)\.(\d+)\.(\d+).txt$"); + +// caseless-0.2.1: r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);" +consistent!(caseless_1, r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"); + +// cabot-0.2.0: "\r?\n\r?\n" +consistent!(cabot_0, "\r?\n\r?\n"); + +// cabot-0.2.0: "\r?\n" +consistent!(cabot_1, "\r?\n"); + +// card-validate-2.2.1: r"^600" +consistent!(card_validate_0, r"^600"); + +// card-validate-2.2.1: r"^5019" +consistent!(card_validate_1, r"^5019"); + +// card-validate-2.2.1: r"^4" +consistent!(card_validate_2, r"^4"); + +// card-validate-2.2.1: r"^(5[1-5]|2[2-7])" +consistent!(card_validate_3, r"^(5[1-5]|2[2-7])"); + +// card-validate-2.2.1: r"^3[47]" +consistent!(card_validate_4, r"^3[47]"); + +// card-validate-2.2.1: r"^3[0689]" +consistent!(card_validate_5, r"^3[0689]"); + +// card-validate-2.2.1: r"^6([045]|22)" +consistent!(card_validate_6, r"^6([045]|22)"); + +// card-validate-2.2.1: r"^(62|88)" +consistent!(card_validate_7, r"^(62|88)"); + +// card-validate-2.2.1: r"^35" +consistent!(card_validate_8, r"^35"); + +// card-validate-2.2.1: r"^[0-9]+$" +consistent!(card_validate_9, r"^[0-9]+$"); + +// cargo-testify-0.3.0: r"\d{1,} passed.*filtered out" +consistent!(cargo_testify_0, r"\d{1,} passed.*filtered out"); + +// cargo-testify-0.3.0: r"error(:|\[).*" +consistent!(cargo_testify_1, r"error(:|\[).*"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_0, r"<(.*?)>"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_1, r"<(.*?)>"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_2, r"<(.*?)>"); + +// cargo-wix-0.0.5: r"<(.*?)>" +consistent!(cargo_wix_3, r"<(.*?)>"); + +// cargo-incremental-0.1.23: r"(?m)^incremental: re-using (\d+) out of (\d+) modules$" +consistent!( + cargo_incremental_0, + r"(?m)^incremental: re-using (\d+) out of (\d+) modules$" +); + +// cargo-incremental-0.1.23: "(?m)(warning|error): (.*)\n --> ([^:]:\\d+:\\d+)$" +consistent!( + cargo_incremental_1, + "(?m)(warning|error): (.*)\n --> ([^:]:\\d+:\\d+)$" +); + +// cargo-incremental-0.1.23: r"(?m)^test (.*) \.\.\. (\w+)" +consistent!(cargo_incremental_2, r"(?m)^test (.*) \.\.\. (\w+)"); + +// cargo-incremental-0.1.23: r"(?m)(\d+) passed; (\d+) failed; (\d+) ignored; \d+ measured" +consistent!( + cargo_incremental_3, + r"(?m)(\d+) passed; (\d+) failed; (\d+) ignored; \d+ measured" +); + +// cargo-testjs-0.1.2: r"^[^-]+-[0-9a-f]+\.js$" +consistent!(cargo_testjs_0, r"^[^-]+-[0-9a-f]+\.js$"); + +// cargo-tarpaulin-0.6.2: r"\s*//" +consistent!(cargo_tarpaulin_0, r"\s*//"); + +// cargo-tarpaulin-0.6.2: r"/\*" +consistent!(cargo_tarpaulin_1, r"/\*"); + +// cargo-tarpaulin-0.6.2: r"\*/" +consistent!(cargo_tarpaulin_2, r"\*/"); + +// cargo-culture-kit-0.1.0: r"^fo" +consistent!(cargo_culture_kit_0, r"^fo"); + +// cargo-screeps-0.1.3: "\\s+" +consistent!(cargo_screeps_0, "\\s+"); + +// cargo-brew-0.1.4: r"`(\S+) v([0-9.]+)" +consistent!(cargo_brew_0, r"`(\S+) v([0-9.]+)"); + +// cargo-release-0.10.2: "^\\[.+\\]" +consistent!(cargo_release_0, "^\\[.+\\]"); + +// cargo-release-0.10.2: "^\\[\\[.+\\]\\]" +consistent!(cargo_release_1, "^\\[\\[.+\\]\\]"); + +// cargo-edit-0.3.0-beta.1: r"^https://github.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +consistent!( + cargo_edit_0, + r"^https://github.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +); + +// cargo-edit-0.3.0-beta.1: r"^https://gitlab.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +consistent!( + cargo_edit_1, + r"^https://gitlab.com/([-_0-9a-zA-Z]+)/([-_0-9a-zA-Z]+)(/|.git)?$" +); + +// cargo-disassemble-0.1.1: ".*" +consistent!(cargo_disassemble_0, ".*"); + +// cargo-demangle-0.1.2: r"(?m)(?P<symbol>_ZN[0-9]+.*E)" +consistent!(cargo_demangle_0, r"(?m)(?P<symbol>_ZN[0-9]+.*E)"); + +// cargo-coverage-annotations-0.1.5: r"^\s*\}(?:\)*;?|\s*else\s*\{)$" +consistent!(cargo_coverage_annotations_0, r"^\s*\}(?:\)*;?|\s*else\s*\{)$"); + +// cargo-urlcrate-1.0.1: "[\u{001b}\u{009b}][\\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><]" +consistent!(cargo_urlcrate_0, "[\u{001b}\u{009b}][\\[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-PRZcf-nqry=><]"); + +// cargo-script-0.2.8: r"^\s*\*( |$)" +consistent!(cargo_script_0, r"^\s*\*( |$)"); + +// cargo-script-0.2.8: r"^(\s+)" +consistent!(cargo_script_1, r"^(\s+)"); + +// cargo-script-0.2.8: r"/\*|\*/" +consistent!(cargo_script_2, r"/\*|\*/"); + +// cargo-script-0.2.8: r"^\s*//!" +consistent!(cargo_script_3, r"^\s*//!"); + +// cargo-script-0.2.8: r"^#![^\[].*?(\r\n|\n)" +consistent!(cargo_script_4, r"^#![^\[].*?(\r\n|\n)"); + +// cargo-update-1.5.2: r"cargo-install-update\.exe-v.+" +consistent!(cargo_update_0, r"cargo-install-update\.exe-v.+"); + +// canteen-0.4.1: r"^<(?:(int|uint|str|float|path):)?([\w_][a-zA-Z0-9_]*)>$" +consistent!( + canteen_0, + r"^<(?:(int|uint|str|float|path):)?([\w_][a-zA-Z0-9_]*)>$" +); + +// thruster-cli-0.1.3: r"(.)([A-Z])" +consistent!(thruster_cli_0, r"(.)([A-Z])"); + +// thieves-cant-0.1.0: "([Z]+)$" +consistent!(thieves_cant_0, "([Z]+)$"); + +// codeowners-0.1.3: r"^@\S+/\S+" +consistent!(codeowners_0, r"^@\S+/\S+"); + +// codeowners-0.1.3: r"^@\S+" +consistent!(codeowners_1, r"^@\S+"); + +// codeowners-0.1.3: r"^\S+@\S+" +consistent!(codeowners_2, r"^\S+@\S+"); + +// conserve-0.4.2: r"^b0000 {21} complete 20[-0-9T:+]+\s +\d+s\n$" +consistent!(conserve_0, r"^b0000 {21} complete 20[-0-9T:+]+\s +\d+s\n$"); + +// commodore-0.3.0: r"(?P<greeting>\S+?) (?P<name>\S+?)$" +consistent!(commodore_0, r"(?P<greeting>\S+?) (?P<name>\S+?)$"); + +// corollary-0.3.0: r"([ \t]*)```haskell([\s\S]*?)```" +consistent!(corollary_0, r"([ \t]*)```haskell([\s\S]*?)```"); + +// corollary-0.3.0: r"\b((?:a|b|t)\d*)\b" +consistent!(corollary_1, r"\b((?:a|b|t)\d*)\b"); + +// colorizex-0.1.3: "NB" +consistent!(colorizex_0, "NB"); + +// colorstring-0.0.1: r"(?i)\[[a-z0-9_-]+\]" +consistent!(colorstring_0, r"(?i)\[[a-z0-9_-]+\]"); + +// colorstring-0.0.1: r"^(?i)(\[[a-z0-9_-]+\])+" +consistent!(colorstring_1, r"^(?i)(\[[a-z0-9_-]+\])+"); + +// cosmogony-0.3.0: "name:(.+)" +consistent!(cosmogony_0, "name:(.+)"); + +// cobalt-bin-0.12.1: r"(?m:^ {0,3}\[[^\]]+\]:.+$)" +consistent!(cobalt_bin_0, r"(?m:^ {0,3}\[[^\]]+\]:.+$)"); + +// comrak-0.2.12: r"[^\p{L}\p{M}\p{N}\p{Pc} -]" +consistent!(comrak_0, r"[^\p{L}\p{M}\p{N}\p{Pc} -]"); + +// content-blocker-0.2.3: "" +consistent!(content_blocker_0, ""); + +// content-blocker-0.2.3: "(?i)hi" +consistent!(content_blocker_1, "(?i)hi"); + +// content-blocker-0.2.3: "http[s]?://domain.org" +consistent!(content_blocker_2, "http[s]?://domain.org"); + +// content-blocker-0.2.3: "(?i)http[s]?://domain.org" +consistent!(content_blocker_3, "(?i)http[s]?://domain.org"); + +// content-blocker-0.2.3: "http://domain.org" +consistent!(content_blocker_4, "http://domain.org"); + +// content-blocker-0.2.3: "http://domain.org" +consistent!(content_blocker_5, "http://domain.org"); + +// content-blocker-0.2.3: "ad.html" +consistent!(content_blocker_6, "ad.html"); + +// content-blocker-0.2.3: "ad.html" +consistent!(content_blocker_7, "ad.html"); + +// content-blocker-0.2.3: "http://domain.org" +consistent!(content_blocker_8, "http://domain.org"); + +// content-blocker-0.2.3: "http://domain.org/nocookies.sjs" +consistent!(content_blocker_9, "http://domain.org/nocookies.sjs"); + +// content-blocker-0.2.3: "http://domain.org/nocookies.sjs" +consistent!(content_blocker_10, "http://domain.org/nocookies.sjs"); + +// content-blocker-0.2.3: "http://domain.org/hideme.jpg" +consistent!(content_blocker_11, "http://domain.org/hideme.jpg"); + +// content-blocker-0.2.3: "http://domain.org/ok.html" +consistent!(content_blocker_12, "http://domain.org/ok.html"); + +// content-blocker-0.2.3: "http://domain.org/ok.html\\?except_this=1" +consistent!(content_blocker_13, "http://domain.org/ok.html\\?except_this=1"); + +// victoria-dom-0.1.2: "[A-Za-z0-9=]" +consistent!(victoria_dom_0, "[A-Za-z0-9=]"); + +// numbat-1.0.0: r"^nsq://" +consistent!(numbat_0, r"^nsq://"); + +// airkorea-0.1.2: r"[\s\t\r\n]" +consistent!(airkorea_0, r"[\s\t\r\n]"); + +// airkorea-0.1.2: r"([\{\[,])|([\}\]])" +consistent!(airkorea_1, r"([\{\[,])|([\}\]])"); + +// airkorea-0.1.2: r"[^.\d]+$" +consistent!(airkorea_2, r"[^.\d]+$"); + +// rofl-0.0.1: r"\b" +// consistent!(rofl_0, r"\b"); + +// rogcat-0.2.15: r"--------- beginning of.*" +consistent!(rogcat_0, r"--------- beginning of.*"); + +// rogcat-0.2.15: r"a|e|i|o|u" +consistent!(rogcat_1, r"a|e|i|o|u"); + +// rogcat-0.2.15: r"^(\d+)([kMG])$" +consistent!(rogcat_2, r"^(\d+)([kMG])$"); + +// media_filename-0.1.4: "\\.([A-Za-z0-9]{2,4})$" +consistent!(media_filename_0, "\\.([A-Za-z0-9]{2,4})$"); + +// media_filename-0.1.4: "([0-9]{3,4}p|[0-9]{3,4}x[0-9]{3,4})" +consistent!(media_filename_1, "([0-9]{3,4}p|[0-9]{3,4}x[0-9]{3,4})"); + +// media_filename-0.1.4: "(?:^\\[([^]]+)\\]|- ?([^-]+)$)" +consistent!(media_filename_2, "(?:^\\[([^]]+)\\]|- ?([^-]+)$)"); + +// media_filename-0.1.4: "(?:[eE]([0-9]{2,3})|[^0-9A-Za-z]([0-9]{2,3})(?:v[0-9])?[^0-9A-Za-z])" +consistent!( + media_filename_3, + "(?:[eE]([0-9]{2,3})|[^0-9A-Za-z]([0-9]{2,3})(?:v[0-9])?[^0-9A-Za-z])" +); + +// media_filename-0.1.4: "[sS]([0-9]{1,2})" +consistent!(media_filename_4, "[sS]([0-9]{1,2})"); + +// media_filename-0.1.4: "((?i)(?:PPV.)?[HP]DTV|(?:HD)?CAM|BRRIP|[^a-z]TS[^a-z]|(?:PPV )?WEB.?DL(?: DVDRip)?|HDRip|DVDRip|CamRip|W[EB]BRip|BluRay|BD|DVD|DvDScr|hdtv)" +consistent!(media_filename_5, "((?i)(?:PPV.)?[HP]DTV|(?:HD)?CAM|BRRIP|[^a-z]TS[^a-z]|(?:PPV )?WEB.?DL(?: DVDRip)?|HDRip|DVDRip|CamRip|W[EB]BRip|BluRay|BD|DVD|DvDScr|hdtv)"); + +// media_filename-0.1.4: "((19[0-9]|20[01])[0-9])" +consistent!(media_filename_6, "((19[0-9]|20[01])[0-9])"); + +// media_filename-0.1.4: "((?i)xvid|x264|h\\.?264)" +consistent!(media_filename_7, "((?i)xvid|x264|h\\.?264)"); + +// media_filename-0.1.4: "((?i)MP3|DD5\\.?1|Dual[- ]Audio|LiNE|DTS|AAC(?:\\.?2\\.0)?|AC3(?:\\.5\\.1)?)" +consistent!(media_filename_8, "((?i)MP3|DD5\\.?1|Dual[- ]Audio|LiNE|DTS|AAC(?:\\.?2\\.0)?|AC3(?:\\.5\\.1)?)"); + +// media_filename-0.1.4: "\\[([0-9A-F]{8})\\]" +consistent!(media_filename_9, "\\[([0-9A-F]{8})\\]"); + +// termimage-0.3.2: r"(\d+)[xX](\d+)" +consistent!(termimage_0, r"(\d+)[xX](\d+)"); + +// teensy-0.1.0: r".*(\d{4}-\d{2}-\d{2}).*" +consistent!(teensy_0, r".*(\d{4}-\d{2}-\d{2}).*"); + +// telescreen-0.1.3: r"<@(.+)>" +consistent!(telescreen_0, r"<@(.+)>"); + +// tempus_fugit-0.4.4: r"^(\d+)" +consistent!(tempus_fugit_0, r"^(\d+)"); + +// fselect-0.4.1: "(\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)" +consistent!(fselect_0, "(\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)"); + +// fselect-0.4.1: "(%|_|\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)" +consistent!(fselect_1, "(%|_|\\?|\\.|\\*|\\[|\\]|\\(|\\)|\\^|\\$)"); + +// fs_eventbridge-0.1.0: r"^([A-Z]+)(?:\s(.+))?\s*" +consistent!(fs_eventbridge_0, r"^([A-Z]+)(?:\s(.+))?\s*"); + +// joseki-0.0.1: r"(\w{1,2})\[(.+?)\]" +consistent!(joseki_0, r"(\w{1,2})\[(.+?)\]"); + +// tweetr-0.2.1: r"(?i)in (\d+) (second|minute|hour|day|week)s?" +consistent!(tweetr_0, r"(?i)in (\d+) (second|minute|hour|day|week)s?"); + +// bullet_core-0.1.1: "^(?u:[0-9])+" +consistent!(bullet_core_0, "^(?u:[0-9])+"); + +// bullet_core-0.1.1: "^(?u:[0-9])+(?u:\\.)(?u:[0-9])+" +consistent!(bullet_core_1, "^(?u:[0-9])+(?u:\\.)(?u:[0-9])+"); + +// bullet_core-0.1.1: "^(?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+" +consistent!(bullet_core_2, "^(?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+"); + +// bullet_core-0.1.1: "^(?u:d/d)((?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+)" +consistent!(bullet_core_3, "^(?u:d/d)((?u:[A-Za-zª-ªµ-µº-ºÀ-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮͰ-ʹͶ-ͷͺ-ͽͿ-ͿΆ-ΆΈ-ΊΌ-ΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-Ֆՙ-ՙա-ևא-תװ-ײؠ-يٮ-ٯٱ-ۓە-ەۥ-ۦۮ-ۯۺ-ۼۿ-ۿܐ-ܐܒ-ܯݍ-ޥޱ-ޱߊ-ߪߴ-ߵߺ-ߺࠀ-ࠕࠚ-ࠚࠤ-ࠤࠨ-ࠨࡀ-ࡘࢠ-ࢴऄ-हऽ-ऽॐ-ॐक़-ॡॱ-ঀঅ-ঌএ-ঐও-নপ-রল-লশ-হঽ-ঽৎ-ৎড়-ঢ়য়-ৡৰ-ৱਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹਖ਼-ੜਫ਼-ਫ਼ੲ-ੴઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હઽ-ઽૐ-ૐૠ-ૡૹ-ૹଅ-ଌଏ-ଐଓ-ନପ-ରଲ-ଳଵ-ହଽ-ଽଡ଼-ଢ଼ୟ-ୡୱ-ୱஃ-ஃஅ-ஊஎ-ஐஒ-கங-சஜ-ஜஞ-டண-தந-பம-ஹௐ-ௐఅ-ఌఎ-ఐఒ-నప-హఽ-ఽౘ-ౚౠ-ౡಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹಽ-ಽೞ-ೞೠ-ೡೱ-ೲഅ-ഌഎ-ഐഒ-ഺഽ-ഽൎ-ൎൟ-ൡൺ-ൿඅ-ඖක-නඳ-රල-ලව-ෆก-ะา-ำเ-ๆກ-ຂຄ-ຄງ-ຈຊ-ຊຍ-ຍດ-ທນ-ຟມ-ຣລ-ລວ-ວສ-ຫອ-ະາ-ຳຽ-ຽເ-ໄໆ-ໆໜ-ໟༀ-ༀཀ-ཇཉ-ཬྈ-ྌက-ဪဿ-ဿၐ-ၕၚ-ၝၡ-ၡၥ-ၦၮ-ၰၵ-ႁႎ-ႎႠ-ჅჇ-ჇჍ-Ⴭა-ჺჼ-ቈቊ-ቍቐ-ቖቘ-ቘቚ-ቝበ-ኈኊ-ኍነ-ኰኲ-ኵኸ-ኾዀ-ዀዂ-ዅወ-ዖዘ-ጐጒ-ጕጘ-ፚᎀ-ᎏᎠ-Ᏽᏸ-ᏽᐁ-ᙬᙯ-ᙿᚁ-ᚚᚠ-ᛪᛱ-ᛸᜀ-ᜌᜎ-ᜑᜠ-ᜱᝀ-ᝑᝠ-ᝬᝮ-ᝰក-ឳៗ-ៗៜ-ៜᠠ-ᡷᢀ-ᢨᢪ-ᢪᢰ-ᣵᤀ-ᤞᥐ-ᥭᥰ-ᥴᦀ-ᦫᦰ-ᧉᨀ-ᨖᨠ-ᩔᪧ-ᪧᬅ-ᬳᭅ-ᭋᮃ-ᮠᮮ-ᮯᮺ-ᯥᰀ-ᰣᱍ-ᱏᱚ-ᱽᳩ-ᳬᳮ-ᳱᳵ-ᳶᴀ-ᶿḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙ-ὙὛ-ὛὝ-ὝὟ-ώᾀ-ᾴᾶ-ᾼι-ιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼⁱ-ⁱⁿ-ⁿₐ-ₜℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎↃ-ↄⰀ-Ⱞⰰ-ⱞⱠ-ⳤⳫ-ⳮⳲ-ⳳⴀ-ⴥⴧ-ⴧⴭ-ⴭⴰ-ⵧⵯ-ⵯⶀ-ⶖⶠ-ⶦⶨ-ⶮⶰ-ⶶⶸ-ⶾⷀ-ⷆⷈ-ⷎⷐ-ⷖⷘ-ⷞⸯ-ⸯ々-〆〱-〵〻-〼ぁ-ゖゝ-ゟァ-ヺー-ヿㄅ-ㄭㄱ-ㆎㆠ-ㆺㇰ-ㇿ㐀-䶵一-鿕ꀀ-ꒌꓐ-ꓽꔀ-ꘌꘐ-ꘟꘪ-ꘫꙀ-ꙮꙿ-ꚝꚠ-ꛥꜗ-ꜟꜢ-ꞈꞋ-ꞭꞰ-ꞷꟷ-ꠁꠃ-ꠅꠇ-ꠊꠌ-ꠢꡀ-ꡳꢂ-ꢳꣲ-ꣷꣻ-ꣻꣽ-ꣽꤊ-ꤥꤰ-ꥆꥠ-ꥼꦄ-ꦲꧏ-ꧏꧠ-ꧤꧦ-ꧯꧺ-ꧾꨀ-ꨨꩀ-ꩂꩄ-ꩋꩠ-ꩶꩺ-ꩺꩾ-ꪯꪱ-ꪱꪵ-ꪶꪹ-ꪽꫀ-ꫀꫂ-ꫂꫛ-ꫝꫠ-ꫪꫲ-ꫴꬁ-ꬆꬉ-ꬎꬑ-ꬖꬠ-ꬦꬨ-ꬮꬰ-ꭚꭜ-ꭥꭰ-ꯢ가-힣ힰ-ퟆퟋ-ퟻ豈-舘並-龎ff-stﬓ-ﬗיִ-יִײַ-ﬨשׁ-זּטּ-לּמּ-מּנּ-סּףּ-פּצּ-ﮱﯓ-ﴽﵐ-ﶏﶒ-ﷇﷰ-ﷻﹰ-ﹴﹶ-ﻼA-Za-zヲ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ𐀀-𐀋𐀍-𐀦𐀨-𐀺𐀼-𐀽𐀿-𐁍𐁐-𐁝𐂀-𐃺𐊀-𐊜𐊠-𐋐𐌀-𐌟𐌰-𐍀𐍂-𐍉𐍐-𐍵𐎀-𐎝𐎠-𐏃𐏈-𐏏𐐀-𐒝𐔀-𐔧𐔰-𐕣𐘀-𐜶𐝀-𐝕𐝠-𐝧𐠀-𐠅𐠈-𐠈𐠊-𐠵𐠷-𐠸𐠼-𐠼𐠿-𐡕𐡠-𐡶𐢀-𐢞𐣠-𐣲𐣴-𐣵𐤀-𐤕𐤠-𐤹𐦀-𐦷𐦾-𐦿𐨀-𐨀𐨐-𐨓𐨕-𐨗𐨙-𐨳𐩠-𐩼𐪀-𐪜𐫀-𐫇𐫉-𐫤𐬀-𐬵𐭀-𐭕𐭠-𐭲𐮀-𐮑𐰀-𐱈𐲀-𐲲𐳀-𐳲𑀃-𑀷𑂃-𑂯𑃐-𑃨𑄃-𑄦𑅐-𑅲𑅶-𑅶𑆃-𑆲𑇁-𑇄𑇚-𑇚𑇜-𑇜𑈀-𑈑𑈓-𑈫𑊀-𑊆𑊈-𑊈𑊊-𑊍𑊏-𑊝𑊟-𑊨𑊰-𑋞𑌅-𑌌𑌏-𑌐𑌓-𑌨𑌪-𑌰𑌲-𑌳𑌵-𑌹𑌽-𑌽𑍐-𑍐𑍝-𑍡𑒀-𑒯𑓄-𑓅𑓇-𑓇𑖀-𑖮𑗘-𑗛𑘀-𑘯𑙄-𑙄𑚀-𑚪𑜀-𑜙𑢠-𑣟𑣿-𑣿𑫀-𑫸𒀀-𒎙𒒀-𒕃𓀀-𓐮𔐀-𔙆𖠀-𖨸𖩀-𖩞𖫐-𖫭𖬀-𖬯𖭀-𖭃𖭣-𖭷𖭽-𖮏𖼀-𖽄𖽐-𖽐𖾓-𖾟𛀀-𛀁𛰀-𛱪𛱰-𛱼𛲀-𛲈𛲐-𛲙𝐀-𝑔𝑖-𝒜𝒞-𝒟𝒢-𝒢𝒥-𝒦𝒩-𝒬𝒮-𝒹𝒻-𝒻𝒽-𝓃𝓅-𝔅𝔇-𝔊𝔍-𝔔𝔖-𝔜𝔞-𝔹𝔻-𝔾𝕀-𝕄𝕆-𝕆𝕊-𝕐𝕒-𝚥𝚨-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𝟋𞠀-𞣄𞸀-𞸃𞸅-𞸟𞸡-𞸢𞸤-𞸤𞸧-𞸧𞸩-𞸲𞸴-𞸷𞸹-𞸹𞸻-𞸻𞹂-𞹂𞹇-𞹇𞹉-𞹉𞹋-𞹋𞹍-𞹏𞹑-𞹒𞹔-𞹔𞹗-𞹗𞹙-𞹙𞹛-𞹛𞹝-𞹝𞹟-𞹟𞹡-𞹢𞹤-𞹤𞹧-𞹪𞹬-𞹲𞹴-𞹷𞹹-𞹼𞹾-𞹾𞺀-𞺉𞺋-𞺛𞺡-𞺣𞺥-𞺩𞺫-𞺻𠀀-𪛖𪜀-𫜴𫝀-𫠝𫠠-𬺡丽-𪘀])+)"); + +// bullet_core-0.1.1: "^(?u:\\()" +consistent!(bullet_core_4, "^(?u:\\()"); + +// bullet_core-0.1.1: "^(?u:\\))" +consistent!(bullet_core_5, "^(?u:\\))"); + +// bullet_core-0.1.1: "^(?u:\\*)" +consistent!(bullet_core_6, "^(?u:\\*)"); + +// bullet_core-0.1.1: "^(?u:\\+)" +consistent!(bullet_core_7, "^(?u:\\+)"); + +// bullet_core-0.1.1: "^(?u:,)" +consistent!(bullet_core_8, "^(?u:,)"); + +// bullet_core-0.1.1: "^(?u:\\-)" +consistent!(bullet_core_9, "^(?u:\\-)"); + +// bullet_core-0.1.1: "^(?u:/)" +consistent!(bullet_core_10, "^(?u:/)"); + +// bullet_core-0.1.1: "^(?u:\\[)" +consistent!(bullet_core_11, "^(?u:\\[)"); + +// bullet_core-0.1.1: "^(?u:\\])" +consistent!(bullet_core_12, "^(?u:\\])"); + +// bullet_core-0.1.1: "^(?u:\\^)" +consistent!(bullet_core_13, "^(?u:\\^)"); + +// bullet_core-0.1.1: "^(?u:·)" +consistent!(bullet_core_14, "^(?u:·)"); + +// actix-web-0.6.13: "//+" +consistent!(actix_web_0, "//+"); + +// actix-web-0.6.13: "//+" +consistent!(actix_web_1, "//+"); + +// althea_kernel_interface-0.1.0: r"(\S*) .* (\S*) (REACHABLE|STALE|DELAY)" +consistent!( + althea_kernel_interface_0, + r"(\S*) .* (\S*) (REACHABLE|STALE|DELAY)" +); + +// althea_kernel_interface-0.1.0: r"-s (.*) --ip6-dst (.*)/.* bcnt = (.*)" +consistent!( + althea_kernel_interface_1, + r"-s (.*) --ip6-dst (.*)/.* bcnt = (.*)" +); + +// alcibiades-0.3.0: r"\buci(?:\s|$)" +consistent!(alcibiades_0, r"\buci(?:\s|$)"); + +// ruma-identifiers-0.11.0: r"\A[a-z0-9._=-]+\z" +consistent!(ruma_identifiers_0, r"\A[a-z0-9._=-]+\z"); + +// rusqbin-0.2.3: r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})$" +consistent!(rusqbin_0, r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})$"); + +// rusqbin-0.2.3: r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})/requests/?$" +consistent!(rusqbin_1, r"/rusqbins/((?i)[A-F0-9]{8}\-[A-F0-9]{4}\-4[A-F0-9]{3}\-[89AB][A-F0-9]{3}\-[A-F0-9]{12})/requests/?$"); + +// rust-install-0.0.4: r"^(nightly|beta|stable)(?:-(\d{4}-\d{2}-\d{2}))?$" +consistent!( + rust_install_0, + r"^(nightly|beta|stable)(?:-(\d{4}-\d{2}-\d{2}))?$" +); + +// rust_inbox-0.0.5: "^+(.*)\r\n" +consistent!(rust_inbox_0, "^+(.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* CAPABILITY (.*)\r\n" +consistent!(rust_inbox_1, r"^\* CAPABILITY (.*)\r\n"); + +// rust_inbox-0.0.5: r"^([a-zA-Z0-9]+) (OK|NO|BAD)(.*)" +consistent!(rust_inbox_2, r"^([a-zA-Z0-9]+) (OK|NO|BAD)(.*)"); + +// rust_inbox-0.0.5: r"^\* (\d+) EXISTS\r\n" +consistent!(rust_inbox_3, r"^\* (\d+) EXISTS\r\n"); + +// rust_inbox-0.0.5: r"^\* (\d+) RECENT\r\n" +consistent!(rust_inbox_4, r"^\* (\d+) RECENT\r\n"); + +// rust_inbox-0.0.5: r"^\* FLAGS (.+)\r\n" +consistent!(rust_inbox_5, r"^\* FLAGS (.+)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[UNSEEN (\d+)\](.*)\r\n" +consistent!(rust_inbox_6, r"^\* OK \[UNSEEN (\d+)\](.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[UIDVALIDITY (\d+)\](.*)\r\n" +consistent!(rust_inbox_7, r"^\* OK \[UIDVALIDITY (\d+)\](.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[UIDNEXT (\d+)\](.*)\r\n" +consistent!(rust_inbox_8, r"^\* OK \[UIDNEXT (\d+)\](.*)\r\n"); + +// rust_inbox-0.0.5: r"^\* OK \[PERMANENTFLAGS (.+)\](.*)\r\n" +consistent!(rust_inbox_9, r"^\* OK \[PERMANENTFLAGS (.+)\](.*)\r\n"); + +// rustml-0.0.7: r"^[a-z]+ (\d+)$" +consistent!(rustml_0, r"^[a-z]+ (\d+)$"); + +// rustml-0.0.7: r"^[a-z]+ (\d+)$" +consistent!(rustml_1, r"^[a-z]+ (\d+)$"); + +// rustml-0.0.7: r"^[a-z]+ (\d+)$" +consistent!(rustml_2, r"^[a-z]+ (\d+)$"); + +// rustfmt-0.10.0: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*" +consistent!(rustfmt_0, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"); + +// rustfmt-core-0.4.0: r"(^\s*$)|(^\s*//\s*rustfmt-[^:]+:\s*\S+)" +consistent!(rustfmt_core_0, r"(^\s*$)|(^\s*//\s*rustfmt-[^:]+:\s*\S+)"); + +// rustfmt-core-0.4.0: r"^## `([^`]+)`" +consistent!(rustfmt_core_1, r"^## `([^`]+)`"); + +// rustfmt-core-0.4.0: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*" +consistent!(rustfmt_core_2, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"); + +// rustfmt-core-0.4.0: r"\s;" +consistent!(rustfmt_core_3, r"\s;"); + +// rust-enum-derive-0.4.0: r"^(0x)?([:digit:]+)$" +consistent!(rust_enum_derive_0, r"^(0x)?([:digit:]+)$"); + +// rust-enum-derive-0.4.0: r"^([:digit:]+)[:space:]*<<[:space:]*([:digit:]+)$" +consistent!( + rust_enum_derive_1, + r"^([:digit:]+)[:space:]*<<[:space:]*([:digit:]+)$" +); + +// rust-enum-derive-0.4.0: r"^[:space:]*([[:alnum:]_]+)([:space:]*=[:space:]*([:graph:]+))?[:space:]*," +consistent!(rust_enum_derive_2, r"^[:space:]*([[:alnum:]_]+)([:space:]*=[:space:]*([:graph:]+))?[:space:]*,"); + +// rust-enum-derive-0.4.0: r"^#define[:space:]+([:graph:]+)[:space:]+([:graph:]+)" +consistent!( + rust_enum_derive_3, + r"^#define[:space:]+([:graph:]+)[:space:]+([:graph:]+)" +); + +// rustsourcebundler-0.2.0: r"^\s*pub mod (.+);$" +consistent!(rustsourcebundler_0, r"^\s*pub mod (.+);$"); + +// rustsourcebundler-0.2.0: r"^\s*pub mod (.+);$" +consistent!(rustsourcebundler_1, r"^\s*pub mod (.+);$"); + +// rustfmt-nightly-0.8.2: r"([^\\](\\\\)*)\\[\n\r][[:space:]]*" +consistent!(rustfmt_nightly_0, r"([^\\](\\\\)*)\\[\n\r][[:space:]]*"); + +// rustfmt-nightly-0.8.2: r"\s;" +consistent!(rustfmt_nightly_1, r"\s;"); + +// rustache-0.1.0: r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)" +consistent!(rustache_0, r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)"); + +// rustfilt-0.2.0: r"_ZN[\$\._[:alnum:]]*" +consistent!(rustfilt_0, r"_ZN[\$\._[:alnum:]]*"); + +// rustache-lists-0.1.2: r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)" +consistent!(rustache_lists_0, r"(?s)(.*?)([ \t\r\n]*)(\{\{(\{?\S?\s*?[\w\.\s]*.*?\s*?\}?)\}\})([ \t\r\n]*)"); + +// rural-0.7.3: "(.+)=(.+)" +consistent!(rural_0, "(.+)=(.+)"); + +// rural-0.7.3: "(.*):(.+)" +consistent!(rural_1, "(.*):(.+)"); + +// rural-0.7.3: "(.+):=(.+)" +consistent!(rural_2, "(.+):=(.+)"); + +// rural-0.7.3: "(.*)==(.+)" +consistent!(rural_3, "(.*)==(.+)"); + +// rusoto_credential-0.11.0: r"^\[([^\]]+)\]$" +consistent!(rusoto_credential_0, r"^\[([^\]]+)\]$"); + +// rumblebars-0.3.0: "([:blank:]*)$" +consistent!(rumblebars_0, "([:blank:]*)$"); + +// rumblebars-0.3.0: "(\r?\n)[:blank:]*(\\{\\{~?[#!/](?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z" +consistent!(rumblebars_1, "(\r?\n)[:blank:]*(\\{\\{~?[#!/](?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z"); + +// rumblebars-0.3.0: "(\r?\n[:blank:]*)(\\{\\{~?>(?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z" +consistent!( + rumblebars_2, + "(\r?\n[:blank:]*)(\\{\\{~?>(?:\\}?[^}])*\\}\\})[:blank:]*(:?\r?\n)?\\z" +); + +// rumblebars-0.3.0: "((?:[:blank:]|\r?\n)*)(\r?\n)[:blank:]*$" +consistent!(rumblebars_3, "((?:[:blank:]|\r?\n)*)(\r?\n)[:blank:]*$"); + +// rumblebars-0.3.0: "^([:blank:]*\r?\n)(.*)" +consistent!(rumblebars_4, "^([:blank:]*\r?\n)(.*)"); + +// diesel_cli-1.3.1: r"(?P<stamp>[\d-]*)_hello" +consistent!(diesel_cli_0, r"(?P<stamp>[\d-]*)_hello"); + +// dishub-0.1.1: r"(\d+)s" +consistent!(dishub_0, r"(\d+)s"); + +// spreadsheet_textconv-0.1.0: r"\n" +consistent!(spreadsheet_textconv_0, r"\n"); + +// spreadsheet_textconv-0.1.0: r"\r" +consistent!(spreadsheet_textconv_1, r"\r"); + +// spreadsheet_textconv-0.1.0: r"\t" +consistent!(spreadsheet_textconv_2, r"\t"); + +// split_aud-0.1.0: r"DELAY (-?\d+)ms" +consistent!(split_aud_0, r"DELAY (-?\d+)ms"); + +// split_aud-0.1.0: r"Trim\((\d+), ?(\d+)\)" +consistent!(split_aud_1, r"Trim\((\d+), ?(\d+)\)"); + +// spotrust-0.0.5: r"spotify:[a-z]+:[a-zA-Z0-9]+" +consistent!(spotrust_0, r"spotify:[a-z]+:[a-zA-Z0-9]+"); + +// spaceslugs-0.1.0: r"[^\x00-\x7F]" +consistent!(spaceslugs_0, r"[^\x00-\x7F]"); + +// spaceslugs-0.1.0: r"[']+" +consistent!(spaceslugs_1, r"[']+"); + +// spaceslugs-0.1.0: r"\W+" +consistent!(spaceslugs_2, r"\W+"); + +// spaceslugs-0.1.0: r"[ ]+" +consistent!(spaceslugs_3, r"[ ]+"); + +// space_email_api-0.1.1: "PHPSESSID=([0-9a-f]+)" +consistent!(space_email_api_0, "PHPSESSID=([0-9a-f]+)"); + +// lorikeet-0.7.0: "[^0-9.,]" +consistent!(lorikeet_0, "[^0-9.,]"); + +// claude-0.3.0: r"^(?:\b|(-)?)(\p{Currency_Symbol})?((?:(?:\d{1,3}[\.,])+\d{3})|\d+)(?:[\.,](\d{2}))?\b$" +consistent!(claude_0, r"^(?:\b|(-)?)(\p{Currency_Symbol})?((?:(?:\d{1,3}[\.,])+\d{3})|\d+)(?:[\.,](\d{2}))?\b$"); + +// clam-0.1.6: r"<%=\s*(.+?)\s*%>" +consistent!(clam_0, r"<%=\s*(.+?)\s*%>"); + +// classifier-0.0.3: r"(\s)" +consistent!(classifier_0, r"(\s)"); + +// click-0.3.2: r"(-----BEGIN .*-----\n)((?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)*\n)+)(-----END .*-----)" +consistent!(click_0, r"(-----BEGIN .*-----\n)((?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)*\n)+)(-----END .*-----)"); + +// click-0.3.2: r"-----BEGIN PRIVATE KEY-----" +consistent!(click_1, r"-----BEGIN PRIVATE KEY-----"); + +// ultrastar-txt-0.1.2: r"#([A-Z3a-z]*):(.*)" +consistent!(ultrastar_txt_0, r"#([A-Z3a-z]*):(.*)"); + +// ultrastar-txt-0.1.2: "^-\\s?(-?[0-9]+)\\s*$" +consistent!(ultrastar_txt_1, "^-\\s?(-?[0-9]+)\\s*$"); + +// ultrastar-txt-0.1.2: "^-\\s?(-?[0-9]+)\\s+(-?[0-9]+)" +consistent!(ultrastar_txt_2, "^-\\s?(-?[0-9]+)\\s+(-?[0-9]+)"); + +// ultrastar-txt-0.1.2: "^(.)\\s*(-?[0-9]+)\\s+(-?[0-9]+)\\s+(-?[0-9]+)\\s?(.*)" +consistent!( + ultrastar_txt_3, + "^(.)\\s*(-?[0-9]+)\\s+(-?[0-9]+)\\s+(-?[0-9]+)\\s?(.*)" +); + +// ultrastar-txt-0.1.2: "^P\\s?(-?[0-9]+)" +consistent!(ultrastar_txt_4, "^P\\s?(-?[0-9]+)"); + +// db-accelerate-2.0.0: r"^template\.add($|\..+$)" +consistent!(db_accelerate_0, r"^template\.add($|\..+$)"); + +// db-accelerate-2.0.0: r"^template\.sub($|\..+$)" +consistent!(db_accelerate_1, r"^template\.sub($|\..+$)"); + +// sterling-0.3.0: r"(\d+)([cegps])" +consistent!(sterling_0, r"(\d+)([cegps])"); + +// stache-0.2.0: r"[^\w]" +consistent!(stache_0, r"[^\w]"); + +// strukt-0.1.0: "\"([<>]?)([xcbB\\?hHiIlLqQfdspP]*)\"" +consistent!(strukt_0, "\"([<>]?)([xcbB\\?hHiIlLqQfdspP]*)\""); + +// steamid-ng-0.3.1: r"^STEAM_([0-4]):([0-1]):([0-9]{1,10})$" +consistent!(steamid_ng_0, r"^STEAM_([0-4]):([0-1]):([0-9]{1,10})$"); + +// steamid-ng-0.3.1: r"^\[([AGMPCgcLTIUai]):([0-4]):([0-9]{1,10})(:([0-9]+))?\]$" +consistent!( + steamid_ng_1, + r"^\[([AGMPCgcLTIUai]):([0-4]):([0-9]{1,10})(:([0-9]+))?\]$" +); + +// strscan-0.1.1: r"^\w+" +consistent!(strscan_0, r"^\w+"); + +// strscan-0.1.1: r"^\s+" +consistent!(strscan_1, r"^\s+"); + +// strscan-0.1.1: r"^\w+" +consistent!(strscan_2, r"^\w+"); + +// strscan-0.1.1: r"^\s+" +consistent!(strscan_3, r"^\s+"); + +// strscan-0.1.1: r"^(\w+)\s+" +consistent!(strscan_4, r"^(\w+)\s+"); + +// tk-carbon-0.2.0: r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$" +consistent!(tk_carbon_0, r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$"); + +// tk-carbon-0.2.0: r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$" +consistent!(tk_carbon_1, r"^([a-zA-Z0-9\.-]+)(?:\s+(\d+))$"); + +// evalrs-0.0.10: r"extern\s+crate\s+([a-z0-9_]+)\s*;(\s*//(.+))?" +consistent!(evalrs_0, r"extern\s+crate\s+([a-z0-9_]+)\s*;(\s*//(.+))?"); + +// evalrs-0.0.10: r"(?m)^# " +consistent!(evalrs_1, r"(?m)^# "); + +// evalrs-0.0.10: r"(?m)^\s*fn +main *\( *\)" +consistent!(evalrs_2, r"(?m)^\s*fn +main *\( *\)"); + +// evalrs-0.0.10: r"(extern\s+crate\s+[a-z0-9_]+\s*;)" +consistent!(evalrs_3, r"(extern\s+crate\s+[a-z0-9_]+\s*;)"); + +// gate_build-0.5.0: "(.*)_t([0-9]+)" +consistent!(gate_build_0, "(.*)_t([0-9]+)"); + +// rake-0.1.1: r"[^\P{P}-]|\s+-\s+" +consistent!(rake_0, r"[^\P{P}-]|\s+-\s+"); + +// rafy-0.2.1: r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*" +consistent!(rafy_0, r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*"); + +// raven-0.2.1: r"^(?P<protocol>.*?)://(?P<public_key>.*?):(?P<secret_key>.*?)@(?P<host>.*?)/(?P<path>.*/)?(?P<project_id>.*)$" +consistent!(raven_0, r"^(?P<protocol>.*?)://(?P<public_key>.*?):(?P<secret_key>.*?)@(?P<host>.*?)/(?P<path>.*/)?(?P<project_id>.*)$"); + +// rargs-0.2.0: r"\{[[:space:]]*[^{}]*[[:space:]]*\}" +consistent!(rargs_0, r"\{[[:space:]]*[^{}]*[[:space:]]*\}"); + +// rargs-0.2.0: r"^\{[[:space:]]*(?P<name>[[:word:]]*)[[:space:]]*\}$" +consistent!(rargs_1, r"^\{[[:space:]]*(?P<name>[[:word:]]*)[[:space:]]*\}$"); + +// rargs-0.2.0: r"^\{[[:space:]]*(?P<num>-?\d+)[[:space:]]*\}$" +consistent!(rargs_2, r"^\{[[:space:]]*(?P<num>-?\d+)[[:space:]]*\}$"); + +// rargs-0.2.0: r"^\{(?P<left>-?\d*)?\.\.(?P<right>-?\d*)?(?::(?P<sep>.*))?\}$" +consistent!( + rargs_3, + r"^\{(?P<left>-?\d*)?\.\.(?P<right>-?\d*)?(?::(?P<sep>.*))?\}$" +); + +// rargs-0.2.0: r"(.*?)[[:space:]]+|(.*?)$" +consistent!(rargs_4, r"(.*?)[[:space:]]+|(.*?)$"); + +// indradb-lib-0.15.0: r"[a-zA-Z0-9]{8}" +consistent!(indradb_lib_0, r"[a-zA-Z0-9]{8}"); + +// fungi-lang-0.1.50: r"::" +consistent!(fungi_lang_0, r"::"); + +// nickel-0.10.1: "/hello/(?P<name>[a-zA-Z]+)" +consistent!(nickel_0, "/hello/(?P<name>[a-zA-Z]+)"); + +// nickel-0.10.1: "/hello/(?P<name>[a-zA-Z]+)" +consistent!(nickel_1, "/hello/(?P<name>[a-zA-Z]+)"); + +// pact_verifier-0.4.0: r"\{(\w+)\}" +consistent!(pact_verifier_0, r"\{(\w+)\}"); + +// pact_matching-0.4.1: "application/.*json" +consistent!(pact_matching_0, "application/.*json"); + +// pact_matching-0.4.1: "application/json.*" +consistent!(pact_matching_1, "application/json.*"); + +// pact_matching-0.4.1: "application/.*xml" +consistent!(pact_matching_2, "application/.*xml"); + +// pangu-0.2.0: "([\"'\\(\\[\\{{<\u{201c}])(\\s*)(.+?)(\\s*)([\"'\\)\\]\\}}>\u{201d}])" +consistent!( + pangu_0, + "([\"'\\(\\[\\{{<\u{201c}])(\\s*)(.+?)(\\s*)([\"'\\)\\]\\}}>\u{201d}])" +); + +// pangu-0.2.0: "([\\(\\[\\{{<\u{201c}]+)(\\s*)(.+?)(\\s*)([\\)\\]\\}}>\u{201d}]+)" +consistent!( + pangu_1, + "([\\(\\[\\{{<\u{201c}]+)(\\s*)(.+?)(\\s*)([\\)\\]\\}}>\u{201d}]+)" +); + +// parser-haskell-0.2.0: r"\{-[\s\S]*?-\}" +consistent!(parser_haskell_0, r"\{-[\s\S]*?-\}"); + +// parser-haskell-0.2.0: r"(?m);+\s*$" +consistent!(parser_haskell_1, r"(?m);+\s*$"); + +// parser-haskell-0.2.0: r"(?m)^#(if|ifn?def|endif|else|include|elif).*" +consistent!(parser_haskell_2, r"(?m)^#(if|ifn?def|endif|else|include|elif).*"); + +// parser-haskell-0.2.0: r"'([^'\\]|\\[A-Z]{1,3}|\\.)'" +consistent!(parser_haskell_3, r"'([^'\\]|\\[A-Z]{1,3}|\\.)'"); + +// parser-haskell-0.2.0: r"forall\s+(.*?)\." +consistent!(parser_haskell_4, r"forall\s+(.*?)\."); + +// html2md-0.2.1: "\\s{2,}" +consistent!(html2md_0, "\\s{2,}"); + +// html2md-0.2.1: "\\n{2,}" +consistent!(html2md_1, "\\n{2,}"); + +// html2md-0.2.1: "(?m)(\\S) $" +consistent!(html2md_2, "(?m)(\\S) $"); + +// html2md-0.2.1: "(?m)^[-*] " +consistent!(html2md_3, "(?m)^[-*] "); + +// ovpnfile-0.1.2: r"#.*$" +consistent!(ovpnfile_0, r"#.*$"); + +// ovpnfile-0.1.2: r"^<(\S+)>" +consistent!(ovpnfile_1, r"^<(\S+)>"); + +// ovpnfile-0.1.2: r"^</(\S+)>" +consistent!(ovpnfile_2, r"^</(\S+)>"); + +// screenruster-saver-fractal-0.1.1: r"#([:xdigit:]{2})([:xdigit:]{2})([:xdigit:]{2})" +consistent!( + screenruster_saver_fractal_0, + r"#([:xdigit:]{2})([:xdigit:]{2})([:xdigit:]{2})" +); + +// scarlet-0.2.2: r"rgb\((?: *(\d{1,3}),)(?: *(\d{1,3}),)(?: *(\d{1,3}))\)" +consistent!( + scarlet_0, + r"rgb\((?: *(\d{1,3}),)(?: *(\d{1,3}),)(?: *(\d{1,3}))\)" +); + +// cpp_to_rust_generator-0.2.0: r"^([\w:]+)<(.+)>$" +consistent!(cpp_to_rust_generator_0, r"^([\w:]+)<(.+)>$"); + +// cpp_to_rust_generator-0.2.0: r"^type-parameter-(\d+)-(\d+)$" +consistent!(cpp_to_rust_generator_1, r"^type-parameter-(\d+)-(\d+)$"); + +// cpp_to_rust_generator-0.2.0: r"^([\w~]+)<[^<>]+>$" +consistent!(cpp_to_rust_generator_2, r"^([\w~]+)<[^<>]+>$"); + +// cpp_to_rust_generator-0.2.0: r"(signals|Q_SIGNALS)\s*:" +consistent!(cpp_to_rust_generator_3, r"(signals|Q_SIGNALS)\s*:"); + +// cpp_to_rust_generator-0.2.0: r"(slots|Q_SLOTS)\s*:" +consistent!(cpp_to_rust_generator_4, r"(slots|Q_SLOTS)\s*:"); + +// cpp_to_rust_generator-0.2.0: r"(public|protected|private)\s*:" +consistent!(cpp_to_rust_generator_5, r"(public|protected|private)\s*:"); + +// cpp_to_rust-0.5.3: r"^([\w:]+)<(.+)>$" +consistent!(cpp_to_rust_0, r"^([\w:]+)<(.+)>$"); + +// cpp_to_rust-0.5.3: r"^type-parameter-(\d+)-(\d+)$" +consistent!(cpp_to_rust_1, r"^type-parameter-(\d+)-(\d+)$"); + +// cpp_to_rust-0.5.3: r"^([\w~]+)<[^<>]+>$" +consistent!(cpp_to_rust_2, r"^([\w~]+)<[^<>]+>$"); + +// cpp_to_rust-0.5.3: r"(signals|Q_SIGNALS)\s*:" +consistent!(cpp_to_rust_3, r"(signals|Q_SIGNALS)\s*:"); + +// cpp_to_rust-0.5.3: r"(slots|Q_SLOTS)\s*:" +consistent!(cpp_to_rust_4, r"(slots|Q_SLOTS)\s*:"); + +// cpp_to_rust-0.5.3: r"(public|protected|private)\s*:" +consistent!(cpp_to_rust_5, r"(public|protected|private)\s*:"); + +// fritzbox_logs-0.2.0: "(\\d{2}\\.\\d{2}\\.\\d{2}) (\\d{2}:\\d{2}:\\d{2}) (.*)" +consistent!( + fritzbox_logs_0, + "(\\d{2}\\.\\d{2}\\.\\d{2}) (\\d{2}:\\d{2}:\\d{2}) (.*)" +); + +// fractal-matrix-api-3.29.0: r"mxc://(?P<server>[^/]+)/(?P<media>.+)" +consistent!(fractal_matrix_api_0, r"mxc://(?P<server>[^/]+)/(?P<media>.+)"); + +// smtp2go-0.1.4: r"^api-[a-zA-Z0-9]{32}$" +consistent!(smtp2go_0, r"^api-[a-zA-Z0-9]{32}$"); + +// pusher-0.3.1: r"^[-a-zA-Z0-9_=@,.;]+$" +consistent!(pusher_0, r"^[-a-zA-Z0-9_=@,.;]+$"); + +// pusher-0.3.1: r"\A\d+\.\d+\z" +consistent!(pusher_1, r"\A\d+\.\d+\z"); + +// bakervm-0.9.0: r"^\.(.+?) +?(.+)$" +consistent!(bakervm_0, r"^\.(.+?) +?(.+)$"); + +// bakervm-0.9.0: r"^\.([^\s]+)$" +consistent!(bakervm_1, r"^\.([^\s]+)$"); + +// bakervm-0.9.0: r"^include! +([^\s]+)$" +consistent!(bakervm_2, r"^include! +([^\s]+)$"); + +// bakervm-0.9.0: r"^@(\d+)$" +consistent!(bakervm_3, r"^@(\d+)$"); + +// bakervm-0.9.0: r"^true|false$" +consistent!(bakervm_4, r"^true|false$"); + +// bakervm-0.9.0: r"^(-?\d+)?\.[0-9]+$" +consistent!(bakervm_5, r"^(-?\d+)?\.[0-9]+$"); + +// bakervm-0.9.0: r"^(-?\d+)?$" +consistent!(bakervm_6, r"^(-?\d+)?$"); + +// bakervm-0.9.0: r"^#([0-9abcdefABCDEF]{6})$" +consistent!(bakervm_7, r"^#([0-9abcdefABCDEF]{6})$"); + +// bakervm-0.9.0: r"^'(.)'$" +consistent!(bakervm_8, r"^'(.)'$"); + +// bakervm-0.9.0: r"^\$vi\((\d+)\)$" +consistent!(bakervm_9, r"^\$vi\((\d+)\)$"); + +// bakervm-0.9.0: r"^\$key\((\d+)\)$" +consistent!(bakervm_10, r"^\$key\((\d+)\)$"); + +// banana-0.0.2: "(?P<type>[A-Z^']+) (?P<route>[^']+) HTTP/(?P<http>[^']+)" +consistent!( + banana_0, + "(?P<type>[A-Z^']+) (?P<route>[^']+) HTTP/(?P<http>[^']+)" +); + +// serial-key-2.0.0: r"[A-F0-9]{8}" +consistent!(serial_key_0, r"[A-F0-9]{8}"); + +// serde-hjson-0.8.1: "[\\\\\"\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]" +consistent!(serde_hjson_0, "[\\\\\"\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"); + +// serde-hjson-0.8.1: "[\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]" +consistent!(serde_hjson_1, "[\x00-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"); + +// serde-hjson-0.8.1: "'''|[\x00-\x09\x0b\x0c\x0e-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]" +consistent!(serde_hjson_2, "'''|[\x00-\x09\x0b\x0c\x0e-\x1f\x7f-\u{9f}\u{00ad}\u{0600}-\u{0604}\u{070f}\u{17b4}\u{17b5}\u{200c}-\u{200f}\u{2028}-\u{202f}\u{2060}-\u{206f}\u{feff}\u{fff0}-\u{ffff}]"); + +// serde-odbc-0.1.0: r"/todos/(?P<id>\d+)" +consistent!(serde_odbc_0, r"/todos/(?P<id>\d+)"); + +// sentry-0.6.0: r"^(?:_<)?([a-zA-Z0-9_]+?)(?:\.\.|::)" +consistent!(sentry_0, r"^(?:_<)?([a-zA-Z0-9_]+?)(?:\.\.|::)"); + +// sentiment-0.1.1: r"[^a-zA-Z0 -]+" +consistent!(sentiment_0, r"[^a-zA-Z0 -]+"); + +// sentiment-0.1.1: r" {2,}" +consistent!(sentiment_1, r" {2,}"); + +// verilog-0.0.1: r"(?m)//.*" +consistent!(verilog_0, r"(?m)//.*"); + +// verex-0.2.2: "(?P<robot>C3PO)" +consistent!(verex_0, "(?P<robot>C3PO)"); + +// handlebars-0.32.4: ">|<|\"|&" +consistent!(handlebars_0, ">|<|\"|&"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789]{4}$" +consistent!(haikunator_0, r"^\w+-\w+-[0123456789]{4}$"); + +// haikunator-0.1.2: r"^\w+@\w+@[0123456789]{4}$" +consistent!(haikunator_1, r"^\w+@\w+@[0123456789]{4}$"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789abcdef]{4}$" +consistent!(haikunator_2, r"^\w+-\w+-[0123456789abcdef]{4}$"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789忠犬ハチ公]{10}$" +consistent!(haikunator_3, r"^\w+-\w+-[0123456789忠犬ハチ公]{10}$"); + +// haikunator-0.1.2: r"^\w+-\w+$" +consistent!(haikunator_4, r"^\w+-\w+$"); + +// haikunator-0.1.2: r"^\w+-\w+-[foo]{4}$" +consistent!(haikunator_5, r"^\w+-\w+-[foo]{4}$"); + +// haikunator-0.1.2: r"^\w+-\w+-[0123456789忠犬ハチ公]{5}$" +consistent!(haikunator_6, r"^\w+-\w+-[0123456789忠犬ハチ公]{5}$"); + +// bobbin-cli-0.8.3: r"(.*)" +consistent!(bobbin_cli_0, r"(.*)"); + +// bobbin-cli-0.8.3: r"rustc (.*)" +consistent!(bobbin_cli_1, r"rustc (.*)"); + +// bobbin-cli-0.8.3: r"cargo (.*)" +consistent!(bobbin_cli_2, r"cargo (.*)"); + +// bobbin-cli-0.8.3: r"xargo (.*)\n" +consistent!(bobbin_cli_3, r"xargo (.*)\n"); + +// bobbin-cli-0.8.3: r"Open On-Chip Debugger (.*)" +consistent!(bobbin_cli_4, r"Open On-Chip Debugger (.*)"); + +// bobbin-cli-0.8.3: r"arm-none-eabi-gcc \(GNU Tools for ARM Embedded Processors[^\)]*\) (.*)" +consistent!( + bobbin_cli_5, + r"arm-none-eabi-gcc \(GNU Tools for ARM Embedded Processors[^\)]*\) (.*)" +); + +// bobbin-cli-0.8.3: r"(?m).*\nBasic Open Source SAM-BA Application \(BOSSA\) Version (.*)\n" +consistent!( + bobbin_cli_6, + r"(?m).*\nBasic Open Source SAM-BA Application \(BOSSA\) Version (.*)\n" +); + +// bobbin-cli-0.8.3: r"(?m)SEGGER J-Link Commander (.*)\n" +consistent!(bobbin_cli_7, r"(?m)SEGGER J-Link Commander (.*)\n"); + +// bobbin-cli-0.8.3: r"(?m)Teensy Loader, Command Line, Version (.*)\n" +consistent!(bobbin_cli_8, r"(?m)Teensy Loader, Command Line, Version (.*)\n"); + +// bobbin-cli-0.8.3: r"dfu-util (.*)\n" +consistent!(bobbin_cli_9, r"dfu-util (.*)\n"); + +// borsholder-0.9.1: r"^/static/[\w.]+$" +consistent!(borsholder_0, r"^/static/[\w.]+$"); + +// borsholder-0.9.1: r"^/timeline/([0-9]+)$" +consistent!(borsholder_1, r"^/timeline/([0-9]+)$"); + +// fblog-1.0.1: "\u{001B}\\[[\\d;]*[^\\d;]" +consistent!(fblog_0, "\u{001B}\\[[\\d;]*[^\\d;]"); + +// fblog-1.0.1: "\u{001B}\\[[\\d;]*[^\\d;]" +consistent!(fblog_1, "\u{001B}\\[[\\d;]*[^\\d;]"); + +// toml-query-0.6.0: r"^\[\d+\]$" +consistent!(toml_query_0, r"^\[\d+\]$"); + +// todo-txt-1.1.0: r" (?P<key>[^\s]+):(?P<value>[^\s^/]+)" +consistent!(todo_txt_0, r" (?P<key>[^\s]+):(?P<value>[^\s^/]+)"); + +// findr-0.1.5: r"\band\b" +consistent!(findr_0, r"\band\b"); + +// findr-0.1.5: r"\bor\b" +consistent!(findr_1, r"\bor\b"); + +// findr-0.1.5: r"\bnot\b" +consistent!(findr_2, r"\bnot\b"); + +// file-sniffer-3.0.1: r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(file_sniffer_0, r".*?\.(a|la|lo|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|S|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// file-sniffer-3.0.1: r".*?\.(stats|conf|h|cache.*|dat|pc|info)$" +consistent!(file_sniffer_1, r".*?\.(stats|conf|h|cache.*|dat|pc|info)$"); + +// file-sniffer-3.0.1: r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$" +consistent!(file_sniffer_2, r".*?\.(exe|a|la|o|ll|keter|bc|dyn_o|out|d|rlib|crate|min\.js|hi|dyn_hi|jsexe|webapp|js\.externs|ibc|toc|aux|fdb_latexmk|fls|egg-info|whl|js_a|js_hi|jld|ji|js_o|so.*|dump-.*|vmb|crx|orig|elmo|elmi|pyc|mod|p_hi|p_o|prof|tix)$"); + +// file-sniffer-3.0.1: r".*?\.(stats|conf|h|cache.*)$" +consistent!(file_sniffer_3, r".*?\.(stats|conf|h|cache.*)$"); + +// file-sniffer-3.0.1: r"(\.git|\.pijul|_darcs|\.hg)$" +consistent!(file_sniffer_4, r"(\.git|\.pijul|_darcs|\.hg)$"); + +// file_logger-0.1.0: "test" +consistent!(file_logger_0, "test"); + +// file_scanner-0.2.0: r"foo" +consistent!(file_scanner_0, r"foo"); + +// file_scanner-0.2.0: r"a+b" +consistent!(file_scanner_1, r"a+b"); + +// file_scanner-0.2.0: r"a[ab]*b" +consistent!(file_scanner_2, r"a[ab]*b"); + +// file_scanner-0.2.0: r"\s+" +consistent!(file_scanner_3, r"\s+"); + +// file_scanner-0.2.0: r"\s+" +consistent!(file_scanner_4, r"\s+"); + +// cellsplit-0.2.1: r"^\s*([^\s]+) %cellsplit<\d+>$" +consistent!(cellsplit_0, r"^\s*([^\s]+) %cellsplit<\d+>$"); + +// cellsplit-0.2.1: r"^\s*([^\s]+) %cellsplit<\d+>$" +consistent!(cellsplit_1, r"^\s*([^\s]+) %cellsplit<\d+>$"); + +// aterm-0.20.0: r"^[+\-]?[0-9]+" +consistent!(aterm_0, r"^[+\-]?[0-9]+"); + +// aterm-0.20.0: r"^[+\-]?[0-9]+\.[0-9]*([eE][+\-]?[0-9]+)?" +consistent!(aterm_1, r"^[+\-]?[0-9]+\.[0-9]*([eE][+\-]?[0-9]+)?"); + +// atarashii_imap-0.3.0: r"^[*] OK" +consistent!(atarashii_imap_0, r"^[*] OK"); + +// atarashii_imap-0.3.0: r"FLAGS\s\((.+)\)" +consistent!(atarashii_imap_1, r"FLAGS\s\((.+)\)"); + +// atarashii_imap-0.3.0: r"\[PERMANENTFLAGS\s\((.+)\)\]" +consistent!(atarashii_imap_2, r"\[PERMANENTFLAGS\s\((.+)\)\]"); + +// atarashii_imap-0.3.0: r"\[UIDVALIDITY\s(\d+)\]" +consistent!(atarashii_imap_3, r"\[UIDVALIDITY\s(\d+)\]"); + +// atarashii_imap-0.3.0: r"(\d+)\sEXISTS" +consistent!(atarashii_imap_4, r"(\d+)\sEXISTS"); + +// atarashii_imap-0.3.0: r"(\d+)\sRECENT" +consistent!(atarashii_imap_5, r"(\d+)\sRECENT"); + +// atarashii_imap-0.3.0: r"\[UNSEEN\s(\d+)\]" +consistent!(atarashii_imap_6, r"\[UNSEEN\s(\d+)\]"); + +// atarashii_imap-0.3.0: r"\[UIDNEXT\s(\d+)\]" +consistent!(atarashii_imap_7, r"\[UIDNEXT\s(\d+)\]"); + +// editorconfig-1.0.0: r"\\(\{|\})" +consistent!(editorconfig_0, r"\\(\{|\})"); + +// editorconfig-1.0.0: r"(^|[^\\])\\\|" +consistent!(editorconfig_1, r"(^|[^\\])\\\|"); + +// editorconfig-1.0.0: r"\[([^\]]*)$" +consistent!(editorconfig_2, r"\[([^\]]*)$"); + +// editorconfig-1.0.0: r"\[(.*/.*)\]" +consistent!(editorconfig_3, r"\[(.*/.*)\]"); + +// editorconfig-1.0.0: r"\{(-?\d+\\\.\\\.-?\d+)\}" +consistent!(editorconfig_4, r"\{(-?\d+\\\.\\\.-?\d+)\}"); + +// editorconfig-1.0.0: r"\{([^,]+)\}" +consistent!(editorconfig_5, r"\{([^,]+)\}"); + +// editorconfig-1.0.0: r"\{(([^\}].*)?(,|\|)(.*[^\\])?)\}" +consistent!(editorconfig_6, r"\{(([^\}].*)?(,|\|)(.*[^\\])?)\}"); + +// editorconfig-1.0.0: r"^/" +consistent!(editorconfig_7, r"^/"); + +// editorconfig-1.0.0: r"(^|[^\\])(\{|\})" +consistent!(editorconfig_8, r"(^|[^\\])(\{|\})"); + +// edmunge-1.0.0: "^#!.*\n" +consistent!(edmunge_0, "^#!.*\n"); + +// unicode_names2_macros-0.2.0: r"\\N\{(.*?)(?:\}|$)" +consistent!(unicode_names2_macros_0, r"\\N\{(.*?)(?:\}|$)"); + +// unidiff-0.2.1: r"^--- (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +consistent!( + unidiff_0, + r"^--- (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +); + +// unidiff-0.2.1: r"^\+\+\+ (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +consistent!( + unidiff_1, + r"^\+\+\+ (?P<filename>[^\t\n]+)(?:\t(?P<timestamp>[^\n]+))?" +); + +// unidiff-0.2.1: r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)" +consistent!(unidiff_2, r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)"); + +// unidiff-0.2.1: r"^(?P<line_type>[- \n\+\\]?)(?P<value>.*)" +consistent!(unidiff_3, r"^(?P<line_type>[- \n\+\\]?)(?P<value>.*)"); + +// slippy-map-tiles-0.13.1: "/?(?P<zoom>[0-9]?[0-9])/(?P<x>[0-9]{1,10})/(?P<y>[0-9]{1,10})(\\.[a-zA-Z]{3,4})?$" +consistent!(slippy_map_tiles_0, "/?(?P<zoom>[0-9]?[0-9])/(?P<x>[0-9]{1,10})/(?P<y>[0-9]{1,10})(\\.[a-zA-Z]{3,4})?$"); + +// slippy-map-tiles-0.13.1: r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$" +consistent!(slippy_map_tiles_1, r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?) (?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$"); + +// slippy-map-tiles-0.13.1: r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$" +consistent!(slippy_map_tiles_2, r"^(?P<minlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<minlat>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlon>-?[0-9]{1,3}(\.[0-9]{1,10})?),(?P<maxlat>-?[0-9]{1,3}(\.[0-9]{1,10})?)$"); + +// sonos-0.1.2: r"^https?://(.+?):1400/xml" +consistent!(sonos_0, r"^https?://(.+?):1400/xml"); + +// validator_derive-0.7.0: r"^[a-z]{2}$" +consistent!(validator_derive_0, r"^[a-z]{2}$"); + +// validator_derive-0.7.0: r"[a-z]{2}" +consistent!(validator_derive_1, r"[a-z]{2}"); + +// validator_derive-0.7.0: r"[a-z]{2}" +consistent!(validator_derive_2, r"[a-z]{2}"); + +// nginx-config-0.8.0: r"one of \d+ options" +consistent!(nginx_config_0, r"one of \d+ options"); + +// waltz-0.4.0: r"[\s,]" +consistent!(waltz_0, r"[\s,]"); + +// warheadhateus-0.2.1: r"^aws_access_key_id = (.*)" +consistent!(warheadhateus_0, r"^aws_access_key_id = (.*)"); + +// warheadhateus-0.2.1: r"^aws_secret_access_key = (.*)" +consistent!(warheadhateus_1, r"^aws_secret_access_key = (.*)"); + +// warheadhateus-0.2.1: r"^aws_access_key_id = (.*)" +consistent!(warheadhateus_2, r"^aws_access_key_id = (.*)"); + +// warheadhateus-0.2.1: r"^aws_secret_access_key = (.*)" +consistent!(warheadhateus_3, r"^aws_secret_access_key = (.*)"); + +// jieba-rs-0.2.2: r"([\u{4E00}-\u{9FD5}a-zA-Z0-9+#&\._%]+)" +consistent!(jieba_rs_0, r"([\u{4E00}-\u{9FD5}a-zA-Z0-9+#&\._%]+)"); + +// jieba-rs-0.2.2: r"(\r\n|\s)" +consistent!(jieba_rs_1, r"(\r\n|\s)"); + +// jieba-rs-0.2.2: "([\u{4E00}-\u{9FD5}]+)" +consistent!(jieba_rs_2, "([\u{4E00}-\u{9FD5}]+)"); + +// jieba-rs-0.2.2: r"[^a-zA-Z0-9+#\n]" +consistent!(jieba_rs_3, r"[^a-zA-Z0-9+#\n]"); + +// jieba-rs-0.2.2: r"([\u{4E00}-\u{9FD5}]+)" +consistent!(jieba_rs_4, r"([\u{4E00}-\u{9FD5}]+)"); + +// jieba-rs-0.2.2: r"([a-zA-Z0-9]+(?:.\d+)?%?)" +consistent!(jieba_rs_5, r"([a-zA-Z0-9]+(?:.\d+)?%?)"); + +// lalrpop-0.15.2: r"Span\([0-9 ,]*\)" +consistent!(lalrpop_0, r"Span\([0-9 ,]*\)"); + +// lalrpop-snap-0.15.2: r"Span\([0-9 ,]*\)" +consistent!(lalrpop_snap_0, r"Span\([0-9 ,]*\)"); + +// nlp-tokenize-0.1.0: r"[\S]+" +consistent!(nlp_tokenize_0, r"[\S]+"); + +// kbgpg-0.1.2: "[[:xdigit:]][70]" +consistent!(kbgpg_0, "[[:xdigit:]][70]"); + +// cdbd-0.1.1: r"^((?P<address>.*):)?(?P<port>\d+)$" +consistent!(cdbd_0, r"^((?P<address>.*):)?(?P<port>\d+)$"); + +// mbutiles-0.1.1: r"[\w\s=+-/]+\((\{(.|\n)*\})\);?" +consistent!(mbutiles_0, r"[\w\s=+-/]+\((\{(.|\n)*\})\);?"); + +// extrahop-0.2.5: r"^-\d+(?:ms|s|m|h|d|w|y)?$" +consistent!(extrahop_0, r"^-\d+(?:ms|s|m|h|d|w|y)?$"); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$" +consistent!(pippin_0, "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$"); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +consistent!( + pippin_1, + "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$" +consistent!(pippin_2, "^((?:.*)-)?ss(0|[1-9][0-9]*)\\.pip$"); + +// pippin-0.1.0: "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +consistent!( + pippin_3, + "^((?:.*)-)?ss(0|[1-9][0-9]*)-cl(0|[1-9][0-9]*)\\.piplog$" +); + +// pippin-0.1.0: "^.*pn(0|[1-9][0-9]*)(-ss(0|[1-9][0-9]*)(\\.pip|-cl(0|[1-9][0-9]*)\\.piplog))?$" +consistent!(pippin_4, "^.*pn(0|[1-9][0-9]*)(-ss(0|[1-9][0-9]*)(\\.pip|-cl(0|[1-9][0-9]*)\\.piplog))?$"); + +// pippin-0.1.0: "^(.*)-ss(?:0|[1-9][0-9]*)(?:\\.pip|-cl(?:0|[1-9][0-9]*)\\.piplog)$" +consistent!( + pippin_5, + "^(.*)-ss(?:0|[1-9][0-9]*)(?:\\.pip|-cl(?:0|[1-9][0-9]*)\\.piplog)$" +); + +// pinyin-0.3.0: r"(?i)[āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň]" +consistent!( + pinyin_0, + r"(?i)[āáǎàēéěèōóǒòīíǐìūúǔùüǘǚǜńň]" +); + +// pinyin-0.3.0: r"([aeoiuvnm])([0-4])$" +consistent!(pinyin_1, r"([aeoiuvnm])([0-4])$"); + +// duration-parser-0.2.0: r"(?P<value>\d+)(?P<units>[a-z])" +consistent!(duration_parser_0, r"(?P<value>\d+)(?P<units>[a-z])"); + +// dutree-0.2.7: r"^\d+\D?$" +consistent!(dutree_0, r"^\d+\D?$"); + +// djangohashers-0.3.0: r"^[A-Za-z0-9]*$" +consistent!(djangohashers_0, r"^[A-Za-z0-9]*$"); + +// rtag-0.3.5: r"^[A-Z][A-Z0-9]{2,}$" +consistent!(rtag_0, r"^[A-Z][A-Z0-9]{2,}$"); + +// rtag-0.3.5: r"^http://www\.emusic\.com" +consistent!(rtag_1, r"^http://www\.emusic\.com"); + +// rtag-0.3.5: r"^[A-Z][A-Z0-9]{2,}" +consistent!(rtag_2, r"^[A-Z][A-Z0-9]{2,}"); + +// rtag-0.3.5: r"(^[\x{0}|\x{feff}|\x{fffe}]*|[\x{0}|\x{feff}|\x{fffe}]*$)" +consistent!( + rtag_3, + r"(^[\x{0}|\x{feff}|\x{fffe}]*|[\x{0}|\x{feff}|\x{fffe}]*$)" +); + +// rtow-0.1.0: r"(\d+)[xX](\d+)" +consistent!(rtow_0, r"(\d+)[xX](\d+)"); + +// pleingres-sql-plugin-0.1.0: r"\$([a-zA-Z0-9_]+)" +consistent!(pleingres_sql_plugin_0, r"\$([a-zA-Z0-9_]+)"); + +// dono-2.0.0: "[\\n]+" +consistent!(dono_0, "[\\n]+"); + +// dono-2.0.0: "(?m)^\\n" +consistent!(dono_1, "(?m)^\\n"); + +// dono-2.0.0: "(?m)^\\n" +consistent!(dono_2, "(?m)^\\n"); + +// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{43}=\.ed25519$" +consistent!(ssb_common_0, r"^[0-9A-Za-z\+/]{43}=\.ed25519$"); + +// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{86}==\.ed25519$" +consistent!(ssb_common_1, r"^[0-9A-Za-z\+/]{86}==\.ed25519$"); + +// ssb-common-0.3.0: r"^[0-9A-Za-z\+/]{43}=\.sha256$" +consistent!(ssb_common_2, r"^[0-9A-Za-z\+/]{43}=\.sha256$"); + +// mozversion-0.1.3: r"^(?P<major>\d+)\.(?P<minor>\d+)(?:\.(?P<patch>\d+))?(?:(?P<pre0>[a-z]+)(?P<pre1>\d*))?$" +consistent!(mozversion_0, r"^(?P<major>\d+)\.(?P<minor>\d+)(?:\.(?P<patch>\d+))?(?:(?P<pre0>[a-z]+)(?P<pre1>\d*))?$"); + +// monger-0.5.6: r"^(\d+)\.(\d+)$" +consistent!(monger_0, r"^(\d+)\.(\d+)$"); + +// mongo_rub-0.0.2: r"^[rv]2\.6" +consistent!(mongo_rub_0, r"^[rv]2\.6"); + +// flow-0.3.5: "body value" +consistent!(flow_0, "body value"); + +// flow-0.3.5: "start marker" +consistent!(flow_1, "start marker"); + +// flow-0.3.5: "end marker" +consistent!(flow_2, "end marker"); + +// flow-0.3.5: "body value" +consistent!(flow_3, "body value"); + +// vobsub-0.2.3: "^([A-Za-z/ ]+): (.*)" +consistent!(vobsub_0, "^([A-Za-z/ ]+): (.*)"); + +// voidmap-1.1.2: r"#([^\s=]+)*" +consistent!(voidmap_0, r"#([^\s=]+)*"); + +// voidmap-1.1.2: r"#(\S+)*" +consistent!(voidmap_1, r"#(\S+)*"); + +// voidmap-1.1.2: r"#prio=(\d+)" +consistent!(voidmap_2, r"#prio=(\d+)"); + +// voidmap-1.1.2: r"\[(\S+)\]" +consistent!(voidmap_3, r"\[(\S+)\]"); + +// voidmap-1.1.2: r"#limit=(\d+)" +consistent!(voidmap_4, r"#limit=(\d+)"); + +// voidmap-1.1.2: r"#tagged=(\S+)" +consistent!(voidmap_5, r"#tagged=(\S+)"); + +// voidmap-1.1.2: r"#rev\b" +consistent!(voidmap_6, r"#rev\b"); + +// voidmap-1.1.2: r"#done\b" +consistent!(voidmap_7, r"#done\b"); + +// voidmap-1.1.2: r"#open\b" +consistent!(voidmap_8, r"#open\b"); + +// voidmap-1.1.2: r"#since=(\S+)" +consistent!(voidmap_9, r"#since=(\S+)"); + +// voidmap-1.1.2: r"#until=(\S+)" +consistent!(voidmap_10, r"#until=(\S+)"); + +// voidmap-1.1.2: r"#plot=(\S+)" +consistent!(voidmap_11, r"#plot=(\S+)"); + +// voidmap-1.1.2: r"#n=(\d+)" +consistent!(voidmap_12, r"#n=(\d+)"); + +// voidmap-1.1.2: r"(\S+)" +consistent!(voidmap_13, r"(\S+)"); + +// voidmap-1.1.2: r"(?P<y>\d+)y" +consistent!(voidmap_14, r"(?P<y>\d+)y"); + +// voidmap-1.1.2: r"(?P<m>\d+)m" +consistent!(voidmap_15, r"(?P<m>\d+)m"); + +// voidmap-1.1.2: r"(?P<w>\d+)w" +consistent!(voidmap_16, r"(?P<w>\d+)w"); + +// voidmap-1.1.2: r"(?P<d>\d+)d" +consistent!(voidmap_17, r"(?P<d>\d+)d"); + +// voidmap-1.1.2: r"(?P<h>\d+)h" +consistent!(voidmap_18, r"(?P<h>\d+)h"); + +// voidmap-1.1.2: r"C-(.)" +consistent!(voidmap_19, r"C-(.)"); + +// qt_generator-0.2.0: r"^\.\./qt[^/]+/" +consistent!(qt_generator_0, r"^\.\./qt[^/]+/"); + +// qt_generator-0.2.0: "(href|src)=\"([^\"]*)\"" +consistent!(qt_generator_1, "(href|src)=\"([^\"]*)\""); + +// kryptos-0.6.1: r"[01]{5}" +consistent!(kryptos_0, r"[01]{5}"); + +// cifar_10_loader-0.2.0: "data_batch_[1-5].bin" +consistent!(cifar_10_loader_0, "data_batch_[1-5].bin"); + +// cifar_10_loader-0.2.0: "test_batch.bin" +consistent!(cifar_10_loader_1, "test_batch.bin"); + +// circadian-0.6.0: r"^\d+.\d+s$" +consistent!(circadian_0, r"^\d+.\d+s$"); + +// circadian-0.6.0: r"^\d+:\d+$" +consistent!(circadian_1, r"^\d+:\d+$"); + +// circadian-0.6.0: r"^\d+:\d+m$" +consistent!(circadian_2, r"^\d+:\d+m$"); + +// cicada-0.8.1: r"!!" +consistent!(cicada_0, r"!!"); + +// cicada-0.8.1: r"^([^`]*)`([^`]+)`(.*)$" +consistent!(cicada_1, r"^([^`]*)`([^`]+)`(.*)$"); + +// cicada-0.8.1: r"\*+" +consistent!(cicada_2, r"\*+"); + +// cicada-0.8.1: r"([^\$]*)\$\{?([A-Za-z0-9\?\$_]+)\}?(.*)" +consistent!(cicada_3, r"([^\$]*)\$\{?([A-Za-z0-9\?\$_]+)\}?(.*)"); + +// cicada-0.8.1: r"^ *alias +([a-zA-Z0-9_\.-]+)=(.*)$" +consistent!(cicada_4, r"^ *alias +([a-zA-Z0-9_\.-]+)=(.*)$"); + +// vterm-sys-0.1.0: r"hi" +consistent!(vterm_sys_0, r"hi"); + +// skim-0.5.0: r".*?\t" +consistent!(skim_0, r".*?\t"); + +// skim-0.5.0: r".*?[\t ]" +consistent!(skim_1, r".*?[\t ]"); + +// skim-0.5.0: r"(\{-?[0-9.,q]*?})" +consistent!(skim_2, r"(\{-?[0-9.,q]*?})"); + +// skim-0.5.0: r"[ \t\n]+" +consistent!(skim_3, r"[ \t\n]+"); + +// skim-0.5.0: r"[ \t\n]+" +consistent!(skim_4, r"[ \t\n]+"); + +// skim-0.5.0: r"([^ |]+( +\| +[^ |]*)+)|( +)" +consistent!(skim_5, r"([^ |]+( +\| +[^ |]*)+)|( +)"); + +// skim-0.5.0: r" +\| +" +consistent!(skim_6, r" +\| +"); + +// skim-0.5.0: r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$" +consistent!(skim_7, r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$"); + +// skim-0.5.0: "," +consistent!(skim_8, ","); + +// skim-0.5.0: ".*?," +consistent!(skim_9, ".*?,"); + +// skim-0.5.0: ".*?," +consistent!(skim_10, ".*?,"); + +// skim-0.5.0: "," +consistent!(skim_11, ","); + +// skim-0.5.0: r"\x1B\[(?:([0-9]+;[0-9]+[Hf])|([0-9]+[ABCD])|(s|u|2J|K)|([0-9;]*m)|(=[0-9]+[hI]))" +consistent!(skim_12, r"\x1B\[(?:([0-9]+;[0-9]+[Hf])|([0-9]+[ABCD])|(s|u|2J|K)|([0-9;]*m)|(=[0-9]+[hI]))"); + +// egg-mode-text-1.14.7: r"[-_./]\z" +consistent!(egg_mode_text_0, r"[-_./]\z"); + +// java-properties-1.1.1: "^[ \t\r\n\x0c]*[#!]" +consistent!(java_properties_0, "^[ \t\r\n\x0c]*[#!]"); + +// java-properties-1.1.1: r"^[ \t\x0c]*[#!][^\r\n]*$" +consistent!(java_properties_1, r"^[ \t\x0c]*[#!][^\r\n]*$"); + +// java-properties-1.1.1: r"^([ \t\x0c]*[:=][ \t\x0c]*|[ \t\x0c]+)$" +consistent!(java_properties_2, r"^([ \t\x0c]*[:=][ \t\x0c]*|[ \t\x0c]+)$"); + +// ipaddress-0.1.2: r":.+\." +consistent!(ipaddress_0, r":.+\."); + +// ipaddress-0.1.2: r"\." +consistent!(ipaddress_1, r"\."); + +// ipaddress-0.1.2: r":" +consistent!(ipaddress_2, r":"); + +// iptables-0.2.2: r"v(\d+)\.(\d+)\.(\d+)" +consistent!(iptables_0, r"v(\d+)\.(\d+)\.(\d+)"); + +// rsure-0.8.1: r"^([^-]+)-(.*)\.dat\.gz$" +consistent!(rsure_0, r"^([^-]+)-(.*)\.dat\.gz$"); + +// rs-jsonpath-0.1.0: "^(.*?)(<=|<|==|>=|>)(.*?)$" +consistent!(rs_jsonpath_0, "^(.*?)(<=|<|==|>=|>)(.*?)$"); + +// oatie-0.3.0: r"(\n|^)(\w+):([\n\w\W]+?)(\n(?:\w)|(\n\]))" +consistent!(oatie_0, r"(\n|^)(\w+):([\n\w\W]+?)(\n(?:\w)|(\n\]))"); + +// weld-0.2.0: "#.*$" +consistent!(weld_0, "#.*$"); + +// weld-0.2.0: r"^[A-Za-z$_][A-Za-z0-9$_]*$" +consistent!(weld_1, r"^[A-Za-z$_][A-Za-z0-9$_]*$"); + +// weld-0.2.0: r"^[0-9]+[cC]$" +consistent!(weld_2, r"^[0-9]+[cC]$"); + +// weld-0.2.0: r"^0b[0-1]+[cC]$" +consistent!(weld_3, r"^0b[0-1]+[cC]$"); + +// weld-0.2.0: r"^0x[0-9a-fA-F]+[cC]$" +consistent!(weld_4, r"^0x[0-9a-fA-F]+[cC]$"); + +// weld-0.2.0: r"^[0-9]+$" +consistent!(weld_5, r"^[0-9]+$"); + +// weld-0.2.0: r"^0b[0-1]+$" +consistent!(weld_6, r"^0b[0-1]+$"); + +// weld-0.2.0: r"^0x[0-9a-fA-F]+$" +consistent!(weld_7, r"^0x[0-9a-fA-F]+$"); + +// weld-0.2.0: r"^[0-9]+[lL]$" +consistent!(weld_8, r"^[0-9]+[lL]$"); + +// weld-0.2.0: r"^0b[0-1]+[lL]$" +consistent!(weld_9, r"^0b[0-1]+[lL]$"); + +// weld-0.2.0: r"^0x[0-9a-fA-F]+[lL]$" +consistent!(weld_10, r"^0x[0-9a-fA-F]+[lL]$"); + +// webgl_generator-0.1.0: "([(, ])enum\\b" +consistent!(webgl_generator_0, "([(, ])enum\\b"); + +// webgl_generator-0.1.0: "\\bAcquireResourcesCallback\\b" +consistent!(webgl_generator_1, "\\bAcquireResourcesCallback\\b"); + +// weave-0.2.0: r"^(\d+)(,(\d+))?([acd]).*$" +consistent!(weave_0, r"^(\d+)(,(\d+))?([acd]).*$"); + +// wemo-0.0.12: r"<BinaryState>(\d)(\|-?\d+)*</BinaryState>" +consistent!(wemo_0, r"<BinaryState>(\d)(\|-?\d+)*</BinaryState>"); + +// webscale-0.9.4: r"(http[s]?://[^\s]+)" +consistent!(webscale_0, r"(http[s]?://[^\s]+)"); + +// svgrep-1.1.0: r"^\d+.*$" +consistent!(svgrep_0, r"^\d+.*$"); + +// ignore-0.4.2: r"^[\pL\pN]+$" +consistent!(ignore_0, r"^[\pL\pN]+$"); + +// ommui_string_patterns-0.1.2: r"^([A-Za-z][0-9A-Za-z_]*)?$" +consistent!(ommui_string_patterns_0, r"^([A-Za-z][0-9A-Za-z_]*)?$"); + +// ommui_string_patterns-0.1.2: r"^(\S+(?:.*\S)?)?$" +consistent!(ommui_string_patterns_1, r"^(\S+(?:.*\S)?)?$"); + +// opcua-types-0.3.0: "^(?P<min>[0-9]{1,10})(:(?P<max>[0-9]{1,10}))?$" +consistent!(opcua_types_0, "^(?P<min>[0-9]{1,10})(:(?P<max>[0-9]{1,10}))?$"); + +// opcua-types-0.3.0: r"^(ns=(?P<ns>[0-9]+);)?(?P<t>[isgb])=(?P<v>.+)$" +consistent!(opcua_types_1, r"^(ns=(?P<ns>[0-9]+);)?(?P<t>[isgb])=(?P<v>.+)$"); + +// open_read_later-1.1.1: r"^(.+?)\s*:\s*(.+)$" +consistent!(open_read_later_0, r"^(.+?)\s*:\s*(.+)$"); + +// youtube-downloader-0.1.0: r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*" +consistent!(youtube_downloader_0, r"^.*(?:(?:youtu\.be/|v/|vi/|u/w/|embed/)|(?:(?:watch)?\?v(?:i)?=|\&v(?:i)?=))([^#\&\?]*).*"); + +// yobot-0.1.1: "." +consistent!(yobot_0, "."); + +// yobot-0.1.1: r"." +consistent!(yobot_1, r"."); + +// yobot-0.1.1: r".+" +consistent!(yobot_2, r".+"); + +// yobot-0.1.1: r"." +consistent!(yobot_3, r"."); + +// ubiquity-0.1.5: r"foo" +consistent!(ubiquity_0, r"foo"); + +// ubiquity-0.1.5: r"/target/" +consistent!(ubiquity_1, r"/target/"); + +// ubiquity-0.1.5: r".DS_Store" +consistent!(ubiquity_2, r".DS_Store"); + +// qasm-1.0.0: r"//.*" +consistent!(qasm_0, r"//.*"); + +// drill-0.3.5: r"\{\{ *([a-z\._]+) *\}\}" +consistent!(drill_0, r"\{\{ *([a-z\._]+) *\}\}"); + +// queryst-2.0.0: r"^([^\]\[]+)" +consistent!(queryst_0, r"^([^\]\[]+)"); + +// queryst-2.0.0: r"(\[[^\]\[]*\])" +consistent!(queryst_1, r"(\[[^\]\[]*\])"); + +// qui-vive-0.1.0: r"^/(\w+)$" +consistent!(qui_vive_0, r"^/(\w+)$"); + +// qui-vive-0.1.0: r"^/key$" +consistent!(qui_vive_1, r"^/key$"); + +// qui-vive-0.1.0: r"^/key/(\w+)$" +consistent!(qui_vive_2, r"^/key/(\w+)$"); + +// qui-vive-0.1.0: r"^/url$" +consistent!(qui_vive_3, r"^/url$"); + +// qui-vive-0.1.0: r"^/url/(\w+)$" +consistent!(qui_vive_4, r"^/url/(\w+)$"); + +// qui-vive-0.1.0: r"^/inv$" +consistent!(qui_vive_5, r"^/inv$"); + +// qui-vive-0.1.0: r"^/inv/(\w+)$" +consistent!(qui_vive_6, r"^/inv/(\w+)$"); + +// subdiff-0.1.0: r"\b" +// consistent!(subdiff_0, r"\b"); + +// substudy-0.4.5: r"^(\d+)/(\d+)$" +consistent!(substudy_0, r"^(\d+)/(\d+)$"); + +// substudy-0.4.5: r"\s+" +consistent!(substudy_1, r"\s+"); + +// substudy-0.4.5: r"<[a-z/][^>]*>" +consistent!(substudy_2, r"<[a-z/][^>]*>"); + +// substudy-0.4.5: r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)" +consistent!(substudy_3, r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)"); + +// substudy-0.4.5: r"\s+" +consistent!(substudy_4, r"\s+"); + +// isbnid-0.1.3: r"^(\d(-| )?){9}(x|X|\d|(\d(-| )?){3}\d)$" +consistent!(isbnid_0, r"^(\d(-| )?){9}(x|X|\d|(\d(-| )?){3}\d)$"); + +// isbnid-0.1.3: r"[^0-9X]" +consistent!(isbnid_1, r"[^0-9X]"); + +// ispc-0.3.5: r"Intel\(r\) SPMD Program Compiler \(ispc\), (\d+\.\d+\.\d+)" +consistent!( + ispc_0, + r"Intel\(r\) SPMD Program Compiler \(ispc\), (\d+\.\d+\.\d+)" +); diff --git a/vendor/regex/tests/crazy.rs b/vendor/regex/tests/crazy.rs new file mode 100644 index 0000000000..293ac1ae72 --- /dev/null +++ b/vendor/regex/tests/crazy.rs @@ -0,0 +1,459 @@ +mat!(ascii_literal, r"a", "a", Some((0, 1))); + +// Some crazy expressions from regular-expressions.info. +mat!( + match_ranges, + r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 255", + Some((5, 8)) +); +mat!( + match_ranges_not, + r"(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", + "num: 256", + None +); +mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))); +mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))); +mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))); +mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None); +mat!( + match_email, + r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail.com ", + Some((8, 26)) +); +mat!( + match_email_not, + r"(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", + "mine is jam.slam@gmail ", + None +); +mat!( + match_email_big, + r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", + "mine is jam.slam@gmail.com ", + Some((8, 26)) +); +mat!( + match_date1, + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-01-01", + Some((0, 10)) +); +mat!( + match_date2, + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-00-01", + None +); +mat!( + match_date3, + r"(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", + "1900-13-01", + None +); + +// Do some crazy dancing with the start/end assertions. +matiter!(match_start_end_empty, r"^$", "", (0, 0)); +matiter!(match_start_end_empty_many_1, r"^$^$^$", "", (0, 0)); +matiter!(match_start_end_empty_many_2, r"^^^$$$", "", (0, 0)); +matiter!(match_start_end_empty_rev, r"$^", "", (0, 0)); +matiter!( + match_start_end_empty_rep, + r"(?:^$)*", + "a\nb\nc", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); +matiter!( + match_start_end_empty_rep_rev, + r"(?:$^)*", + "a\nb\nc", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); + +// Test negated character classes. +mat!(negclass_letters, r"[^ac]", "acx", Some((2, 3))); +mat!(negclass_letter_comma, r"[^a,]", "a,x", Some((2, 3))); +mat!(negclass_letter_space, r"[^a[:space:]]", "a x", Some((2, 3))); +mat!(negclass_comma, r"[^,]", ",,x", Some((2, 3))); +mat!(negclass_space, r"[^[:space:]]", " a", Some((1, 2))); +mat!(negclass_space_comma, r"[^,[:space:]]", ", a", Some((2, 3))); +mat!(negclass_comma_space, r"[^[:space:],]", " ,a", Some((2, 3))); +mat!(negclass_ascii, r"[^[:alpha:]Z]", "A1", Some((1, 2))); + +// Test that repeated empty expressions don't loop forever. +mat!(lazy_many_many, r"((?:.*)*?)=", "a=b", Some((0, 2))); +mat!(lazy_many_optional, r"((?:.?)*?)=", "a=b", Some((0, 2))); +mat!(lazy_one_many_many, r"((?:.*)+?)=", "a=b", Some((0, 2))); +mat!(lazy_one_many_optional, r"((?:.?)+?)=", "a=b", Some((0, 2))); +mat!(lazy_range_min_many, r"((?:.*){1,}?)=", "a=b", Some((0, 2))); +mat!(lazy_range_many, r"((?:.*){1,2}?)=", "a=b", Some((0, 2))); +mat!(greedy_many_many, r"((?:.*)*)=", "a=b", Some((0, 2))); +mat!(greedy_many_optional, r"((?:.?)*)=", "a=b", Some((0, 2))); +mat!(greedy_one_many_many, r"((?:.*)+)=", "a=b", Some((0, 2))); +mat!(greedy_one_many_optional, r"((?:.?)+)=", "a=b", Some((0, 2))); +mat!(greedy_range_min_many, r"((?:.*){1,})=", "a=b", Some((0, 2))); +mat!(greedy_range_many, r"((?:.*){1,2})=", "a=b", Some((0, 2))); + +// Test that we handle various flavors of empty expressions. +matiter!(match_empty1, r"", "", (0, 0)); +matiter!(match_empty2, r"", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty3, r"()", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty4, r"()*", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty5, r"()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty6, r"()?", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty7, r"()()", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty8, r"()+|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty9, r"z|()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty10, r"()+|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty12, r"|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty13, r"b|", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty14, r"|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty15, r"z|", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty16, r"|", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty17, r"||", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty18, r"||z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty19, r"(?:)|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty20, r"b|(?:)", "abc", (0, 0), (1, 2), (3, 3)); +matiter!(match_empty21, r"(?:|)", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty22, r"(?:|)|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3)); +matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2)); + +// Test that the DFA can handle pathological cases. +// (This should result in the DFA's cache being flushed too frequently, which +// should cause it to quit and fall back to the NFA algorithm.) +#[test] +fn dfa_handles_pathological_case() { + fn ones_and_zeroes(count: usize) -> String { + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + + let mut rng = SmallRng::from_entropy(); + let mut s = String::new(); + for _ in 0..count { + if rng.gen() { + s.push('1'); + } else { + s.push('0'); + } + } + s + } + + let re = regex!(r"[01]*1[01]{20}$"); + let text = { + let mut pieces = ones_and_zeroes(100_000); + pieces.push('1'); + pieces.push_str(&ones_and_zeroes(20)); + pieces + }; + assert!(re.is_match(text!(&*text))); +} + +#[test] +fn nest_limit_makes_it_parse() { + use regex::RegexBuilder; + + RegexBuilder::new( + r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? + ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? + )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#, + ) + .build() + .unwrap(); +} diff --git a/vendor/regex/tests/flags.rs b/vendor/regex/tests/flags.rs new file mode 100644 index 0000000000..c33b82d434 --- /dev/null +++ b/vendor/regex/tests/flags.rs @@ -0,0 +1,31 @@ +mat!(match_flag_case, "(?-u)(?i)abc", "ABC", Some((0, 3))); +mat!(match_flag_weird_case, "(?-u)(?i)a(?-i)bc", "Abc", Some((0, 3))); +mat!(match_flag_weird_case_not, "(?-u)(?i)a(?-i)bc", "ABC", None); +mat!(match_flag_case_dotnl, "(?-u)(?is)a(?u:.)", "A\n", Some((0, 2))); +mat!( + match_flag_case_dotnl_toggle, + "(?-u)(?is)a(?u:.)(?-is)a(?u:.)", + "A\nab", + Some((0, 4)) +); +mat!( + match_flag_case_dotnl_toggle_not, + "(?-u)(?is)a(?u:.)(?-is)a(?u:.)", + "A\na\n", + None +); +mat!( + match_flag_case_dotnl_toggle_ok, + "(?-u)(?is)a(?u:.)(?-is:a(?u:.))?", + "A\na\n", + Some((0, 2)) +); +mat!( + match_flag_multi, + r"(?-u)(?m)(?:^\d+$\n?)+", + "123\n456\n789", + Some((0, 11)) +); +mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))); +mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))); +mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))); diff --git a/vendor/regex/tests/fowler.rs b/vendor/regex/tests/fowler.rs new file mode 100644 index 0000000000..7f56a758d3 --- /dev/null +++ b/vendor/regex/tests/fowler.rs @@ -0,0 +1,1588 @@ +// DO NOT EDIT. Automatically generated by 'scripts/regex-match-tests.py' +// on 2019-09-02 11:07:37.849994. + +// Tests from basic.dat +mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))); +mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7))); +mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8))); +mat!(match_basic_6, r"\)", r"()", Some((1, 2))); +mat!(match_basic_7, r"a]", r"a]a", Some((0, 2))); +mat!(match_basic_9, r"\}", r"}", Some((0, 1))); +mat!(match_basic_10, r"\]", r"]", Some((0, 1))); +mat!(match_basic_12, r"]", r"]", Some((0, 1))); +mat!(match_basic_15, r"^a", r"ax", Some((0, 1))); +mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3))); +mat!(match_basic_17, r"a\^", r"a^", Some((0, 2))); +mat!(match_basic_18, r"a$", r"aa", Some((1, 2))); +mat!(match_basic_19, r"a\$", r"a$", Some((0, 2))); +mat!(match_basic_20, r"^$", r"", Some((0, 0))); +mat!(match_basic_21, r"$^", r"", Some((0, 0))); +mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2))); +mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1))); +mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0))); +mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4))); +mat!( + match_basic_26, + r"(ab|a)(bc|c)", + r"abc", + Some((0, 3)), + Some((0, 2)), + Some((2, 3)) +); +mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2))); +mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2))); +mat!( + match_basic_29, + r"(a*)(b?)(b+)b{3}", + r"aaabbbbbbb", + Some((0, 10)), + Some((0, 3)), + Some((3, 4)), + Some((4, 7)) +); +mat!( + match_basic_30, + r"(a*)(b{0,1})(b{1,})b{3}", + r"aaabbbbbbb", + Some((0, 10)), + Some((0, 3)), + Some((3, 4)), + Some((4, 7)) +); +mat!( + match_basic_32, + r"((a|a)|a)", + r"a", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_33, + r"(a*)(a|aa)", + r"aaaa", + Some((0, 4)), + Some((0, 3)), + Some((3, 4)) +); +mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4))); +mat!( + match_basic_35, + r"a(b)|c(d)|a(e)f", + r"aef", + Some((0, 3)), + None, + None, + Some((1, 2)) +); +mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1))); +mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1))); +mat!( + match_basic_38, + r"(a|b)c|a(b|c)", + r"ab", + Some((0, 2)), + None, + Some((1, 2)) +); +mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2))); +mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2))); +mat!( + match_basic_41, + r"(.a|.b).*|.*(.a|.b)", + r"xa", + Some((0, 2)), + Some((0, 2)) +); +mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2))); +mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2))); +mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2))); +mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8))); +mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9))); +mat!( + match_basic_47, + r"(aa|aaa)*|(a|aaaaa)", + r"aa", + Some((0, 2)), + Some((0, 2)) +); +mat!( + match_basic_48, + r"(a.|.a.)*|(a|.a...)", + r"aa", + Some((0, 2)), + Some((0, 2)) +); +mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3))); +mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4))); +mat!( + match_basic_51, + r"(?i)(?-u)(Ab|cD)*", + r"aBcD", + Some((0, 4)), + Some((2, 4)) +); +mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3))); +mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3))); +mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))); +mat!( + match_basic_55, + r":::1:::0:|:::1:1:0:", + r":::0:::1:::1:::0:", + Some((8, 17)) +); +mat!( + match_basic_56, + r":::1:::0:|:::1:1:1:", + r":::0:::1:::1:::0:", + Some((8, 17)) +); +mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1))); +mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3))); +mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3))); +mat!( + match_basic_65, + r" +", + r" +", + Some((0, 1)) +); +mat!( + match_basic_66, + r" +", + r" +", + Some((0, 1)) +); +mat!( + match_basic_67, + r"[^a]", + r" +", + Some((0, 1)) +); +mat!( + match_basic_68, + r" +a", + r" +a", + Some((0, 2)) +); +mat!( + match_basic_69, + r"(a)(b)(c)", + r"abc", + Some((0, 3)), + Some((0, 1)), + Some((1, 2)), + Some((2, 3)) +); +mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3))); +mat!( + match_basic_71, + r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", + r"feb 6,", + Some((0, 6)) +); +mat!( + match_basic_72, + r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", + r"2/7", + Some((0, 3)) +); +mat!( + match_basic_73, + r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", + r"feb 1,Feb 6", + Some((5, 11)) +); +mat!( + match_basic_74, + r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", + r"x", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_75, + r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", + r"xx", + Some((0, 2)), + Some((1, 2)), + Some((1, 2)) +); +mat!( + match_basic_76, + r"a?(ab|ba)*", + r"ababababababababababababababababababababababababababababababababababababababababa", + Some((0, 81)), + Some((79, 81)) +); +mat!( + match_basic_77, + r"abaa|abbaa|abbbaa|abbbbaa", + r"ababbabbbabbbabbbbabbbbaa", + Some((18, 25)) +); +mat!( + match_basic_78, + r"abaa|abbaa|abbbaa|abbbbaa", + r"ababbabbbabbbabbbbabaa", + Some((18, 22)) +); +mat!( + match_basic_79, + r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", + r"baaabbbabac", + Some((7, 11)) +); +mat!(match_basic_80, r".*", r"", Some((0, 2))); +mat!( + match_basic_81, + r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", + r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", + Some((53, 57)) +); +mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10))); +mat!(match_basic_84, r"^", r"", Some((0, 0))); +mat!(match_basic_85, r"$", r"", Some((0, 0))); +mat!(match_basic_86, r"^$", r"", Some((0, 0))); +mat!(match_basic_87, r"^a$", r"a", Some((0, 1))); +mat!(match_basic_88, r"abc", r"abc", Some((0, 3))); +mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4))); +mat!(match_basic_90, r"abc", r"ababc", Some((2, 5))); +mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3))); +mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3))); +mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4))); +mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6))); +mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4))); +mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6))); +mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4))); +mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3))); +mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3))); +mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3))); +mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3))); +mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4))); +mat!(match_basic_103, r"^", r"abc", Some((0, 0))); +mat!(match_basic_104, r"$", r"abc", Some((3, 3))); +mat!(match_basic_105, r"a.c", r"abc", Some((0, 3))); +mat!(match_basic_106, r"a.c", r"axc", Some((0, 3))); +mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5))); +mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3))); +mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3))); +mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 3))); +mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2))); +mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2))); +mat!(match_basic_113, r"a]", r"a]", Some((0, 2))); +mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3))); +mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3))); +mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3))); +mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3))); +mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2))); +mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2))); +mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3))); +mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2))); +mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4))); +mat!( + match_basic_123, + r"((a))", + r"abc", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_124, + r"(a)b(c)", + r"abc", + Some((0, 3)), + Some((0, 1)), + Some((2, 3)) +); +mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7))); +mat!(match_basic_126, r"a*", r"aaa", Some((0, 3))); +mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None); +mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0))); +mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None); +mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2))); +mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2))); +mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1))); +mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3))); +mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None); +mat!(match_basic_138, r"a*", r"", Some((0, 0))); +mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5))); +mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1))); +mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1))); +mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1))); +mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None); +mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7))); +mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3))); +mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2))); +mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4))); +mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3))); +mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2))); +mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1))); +mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3))); +mat!( + match_basic_153, + r"a([bc]*)(c*d)", + r"abcd", + Some((0, 4)), + Some((1, 3)), + Some((3, 4)) +); +mat!( + match_basic_154, + r"a([bc]+)(c*d)", + r"abcd", + Some((0, 4)), + Some((1, 3)), + Some((3, 4)) +); +mat!( + match_basic_155, + r"a([bc]*)(c+d)", + r"abcd", + Some((0, 4)), + Some((1, 2)), + Some((2, 4)) +); +mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7))); +mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2))); +mat!( + match_basic_158, + r"((a)(b)c)(d)", + r"abcd", + Some((0, 4)), + Some((0, 3)), + Some((0, 1)), + Some((1, 2)), + Some((3, 4)) +); +mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5))); +mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3))); +mat!( + match_basic_161, + r"(bc+d$|ef*g.|h?i(j|k))", + r"effgz", + Some((0, 5)), + Some((0, 5)) +); +mat!( + match_basic_162, + r"(bc+d$|ef*g.|h?i(j|k))", + r"ij", + Some((0, 2)), + Some((0, 2)), + Some((1, 2)) +); +mat!( + match_basic_163, + r"(bc+d$|ef*g.|h?i(j|k))", + r"reffgz", + Some((1, 6)), + Some((1, 6)) +); +mat!( + match_basic_164, + r"(((((((((a)))))))))", + r"a", + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)), + Some((0, 1)) +); +mat!( + match_basic_165, + r"multiple words", + r"multiple words yeah", + Some((0, 14)) +); +mat!( + match_basic_166, + r"(.*)c(.*)", + r"abcde", + Some((0, 5)), + Some((0, 2)), + Some((3, 5)) +); +mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4))); +mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3))); +mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3))); +mat!( + match_basic_170, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Qaddafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!( + match_basic_171, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Mo'ammar Gadhafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_172, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Kaddafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!( + match_basic_173, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Qadhafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!( + match_basic_174, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Gadafi", + Some((0, 14)), + None, + Some((10, 11)) +); +mat!( + match_basic_175, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Mu'ammar Qadafi", + Some((0, 15)), + None, + Some((11, 12)) +); +mat!( + match_basic_176, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Moamar Gaddafi", + Some((0, 14)), + None, + Some((9, 11)) +); +mat!( + match_basic_177, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Mu'ammar Qadhdhafi", + Some((0, 18)), + None, + Some((13, 15)) +); +mat!( + match_basic_178, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Khaddafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_179, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Ghaddafy", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_180, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Ghadafi", + Some((0, 15)), + None, + Some((11, 12)) +); +mat!( + match_basic_181, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Ghaddafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_182, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muamar Kaddafi", + Some((0, 14)), + None, + Some((9, 11)) +); +mat!( + match_basic_183, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Quathafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_184, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Muammar Gheddafi", + Some((0, 16)), + None, + Some((11, 13)) +); +mat!( + match_basic_185, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Moammar Khadafy", + Some((0, 15)), + None, + Some((11, 12)) +); +mat!( + match_basic_186, + r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", + r"Moammar Qudhafi", + Some((0, 15)), + None, + Some((10, 12)) +); +mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4))); +mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4))); +mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4))); +mat!( + match_basic_190, + r"^([^!.]+).att.com!(.+)$", + r"gryphon.att.com!eby", + Some((0, 19)), + Some((0, 7)), + Some((16, 19)) +); +mat!( + match_basic_191, + r"^([^!]+!)?([^!]+)$", + r"bas", + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_192, + r"^([^!]+!)?([^!]+)$", + r"bar!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_193, + r"^([^!]+!)?([^!]+)$", + r"foo!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_194, + r"^.+!([^!]+!)([^!]+)$", + r"foo!bar!bas", + Some((0, 11)), + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_195, + r"((foo)|(bar))!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_196, + r"((foo)|(bar))!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)), + None, + Some((4, 7)) +); +mat!( + match_basic_197, + r"((foo)|(bar))!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)), + Some((0, 3)) +); +mat!( + match_basic_198, + r"((foo)|bar)!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_199, + r"((foo)|bar)!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)) +); +mat!( + match_basic_200, + r"((foo)|bar)!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)), + Some((0, 3)) +); +mat!( + match_basic_201, + r"(foo|(bar))!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)), + Some((0, 3)) +); +mat!( + match_basic_202, + r"(foo|(bar))!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)), + Some((4, 7)) +); +mat!( + match_basic_203, + r"(foo|(bar))!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_204, + r"(foo|bar)!bas", + r"bar!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_205, + r"(foo|bar)!bas", + r"foo!bar!bas", + Some((4, 11)), + Some((4, 7)) +); +mat!( + match_basic_206, + r"(foo|bar)!bas", + r"foo!bas", + Some((0, 7)), + Some((0, 3)) +); +mat!( + match_basic_207, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"foo!bar!bas", + Some((0, 11)), + Some((0, 11)), + None, + None, + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_208, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"bas", + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_209, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"bar!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_210, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"foo!bar!bas", + Some((0, 11)), + None, + None, + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_211, + r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", + r"foo!bas", + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_212, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"bas", + Some((0, 3)), + Some((0, 3)), + None, + Some((0, 3)) +); +mat!( + match_basic_213, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"bar!bas", + Some((0, 7)), + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!( + match_basic_214, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"foo!bar!bas", + Some((0, 11)), + Some((0, 11)), + None, + None, + Some((4, 8)), + Some((8, 11)) +); +mat!( + match_basic_215, + r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", + r"foo!bas", + Some((0, 7)), + Some((0, 7)), + Some((0, 4)), + Some((4, 7)) +); +mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4))); +mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4))); +mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4))); +mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4))); +mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4))); +mat!(match_basic_221, r"\\000", r"\000", Some((0, 4))); + +// Tests from nullsubexpr.dat +mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None); +mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0))); +mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0))); +mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_17, r"(a+)+", r"x", None); +mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None); +mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0))); +mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None); +mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!( + match_nullsubexpr_34, + r"([^b]*)*", + r"aaaaaab", + Some((0, 6)), + Some((0, 6)) +); +mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))); +mat!( + match_nullsubexpr_41, + r"([ab]*)*", + r"aaaabcde", + Some((0, 5)), + Some((0, 5)) +); +mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1))); +mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))); +mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None); +mat!( + match_nullsubexpr_46, + r"([^ab]*)*", + r"ccccxx", + Some((0, 6)), + Some((0, 6)) +); +mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None); +mat!( + match_nullsubexpr_50, + r"((z)+|a)*", + r"zabcde", + Some((0, 2)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_69, + r"(a*)*(x)", + r"x", + Some((0, 1)), + None, + Some((0, 1)) +); +mat!( + match_nullsubexpr_70, + r"(a*)*(x)", + r"ax", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_71, + r"(a*)*(x)", + r"axa", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_73, + r"(a*)+(x)", + r"x", + Some((0, 1)), + Some((0, 0)), + Some((0, 1)) +); +mat!( + match_nullsubexpr_74, + r"(a*)+(x)", + r"ax", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_75, + r"(a*)+(x)", + r"axa", + Some((0, 2)), + Some((0, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_77, + r"(a*){2}(x)", + r"x", + Some((0, 1)), + Some((0, 0)), + Some((0, 1)) +); +mat!( + match_nullsubexpr_78, + r"(a*){2}(x)", + r"ax", + Some((0, 2)), + Some((1, 1)), + Some((1, 2)) +); +mat!( + match_nullsubexpr_79, + r"(a*){2}(x)", + r"axa", + Some((0, 2)), + Some((1, 1)), + Some((1, 2)) +); + +// Tests from repetition.dat +mat!(match_repetition_10, r"((..)|(.))", r"", None); +mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None); +mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None); +mat!(match_repetition_14, r"((..)|(.)){1}", r"", None); +mat!(match_repetition_15, r"((..)|(.)){2}", r"", None); +mat!(match_repetition_16, r"((..)|(.)){3}", r"", None); +mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0))); +mat!( + match_repetition_20, + r"((..)|(.))", + r"a", + Some((0, 1)), + Some((0, 1)), + None, + Some((0, 1)) +); +mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None); +mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None); +mat!( + match_repetition_24, + r"((..)|(.)){1}", + r"a", + Some((0, 1)), + Some((0, 1)), + None, + Some((0, 1)) +); +mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None); +mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None); +mat!( + match_repetition_28, + r"((..)|(.))*", + r"a", + Some((0, 1)), + Some((0, 1)), + None, + Some((0, 1)) +); +mat!( + match_repetition_30, + r"((..)|(.))", + r"aa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_31, + r"((..)|(.))((..)|(.))", + r"aa", + Some((0, 2)), + Some((0, 1)), + None, + Some((0, 1)), + Some((1, 2)), + None, + Some((1, 2)) +); +mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None); +mat!( + match_repetition_34, + r"((..)|(.)){1}", + r"aa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_35, + r"((..)|(.)){2}", + r"aa", + Some((0, 2)), + Some((1, 2)), + None, + Some((1, 2)) +); +mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None); +mat!( + match_repetition_38, + r"((..)|(.))*", + r"aa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_40, + r"((..)|(.))", + r"aaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_41, + r"((..)|(.))((..)|(.))", + r"aaa", + Some((0, 3)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 3)), + None, + Some((2, 3)) +); +mat!( + match_repetition_42, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaa", + Some((0, 3)), + Some((0, 1)), + None, + Some((0, 1)), + Some((1, 2)), + None, + Some((1, 2)), + Some((2, 3)), + None, + Some((2, 3)) +); +mat!( + match_repetition_44, + r"((..)|(.)){1}", + r"aaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_46, + r"((..)|(.)){2}", + r"aaa", + Some((0, 3)), + Some((2, 3)), + Some((0, 2)), + Some((2, 3)) +); +mat!( + match_repetition_47, + r"((..)|(.)){3}", + r"aaa", + Some((0, 3)), + Some((2, 3)), + None, + Some((2, 3)) +); +mat!( + match_repetition_50, + r"((..)|(.))*", + r"aaa", + Some((0, 3)), + Some((2, 3)), + Some((0, 2)), + Some((2, 3)) +); +mat!( + match_repetition_52, + r"((..)|(.))", + r"aaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_53, + r"((..)|(.))((..)|(.))", + r"aaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_54, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 3)), + None, + Some((2, 3)), + Some((3, 4)), + None, + Some((3, 4)) +); +mat!( + match_repetition_56, + r"((..)|(.)){1}", + r"aaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_57, + r"((..)|(.)){2}", + r"aaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_59, + r"((..)|(.)){3}", + r"aaaa", + Some((0, 4)), + Some((3, 4)), + Some((0, 2)), + Some((3, 4)) +); +mat!( + match_repetition_61, + r"((..)|(.))*", + r"aaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_63, + r"((..)|(.))", + r"aaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_64, + r"((..)|(.))((..)|(.))", + r"aaaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_65, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaaaa", + Some((0, 5)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None, + Some((4, 5)), + None, + Some((4, 5)) +); +mat!( + match_repetition_67, + r"((..)|(.)){1}", + r"aaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_68, + r"((..)|(.)){2}", + r"aaaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_70, + r"((..)|(.)){3}", + r"aaaaa", + Some((0, 5)), + Some((4, 5)), + Some((2, 4)), + Some((4, 5)) +); +mat!( + match_repetition_73, + r"((..)|(.))*", + r"aaaaa", + Some((0, 5)), + Some((4, 5)), + Some((2, 4)), + Some((4, 5)) +); +mat!( + match_repetition_75, + r"((..)|(.))", + r"aaaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_76, + r"((..)|(.))((..)|(.))", + r"aaaaaa", + Some((0, 4)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_77, + r"((..)|(.))((..)|(.))((..)|(.))", + r"aaaaaa", + Some((0, 6)), + Some((0, 2)), + Some((0, 2)), + None, + Some((2, 4)), + Some((2, 4)), + None, + Some((4, 6)), + Some((4, 6)), + None +); +mat!( + match_repetition_79, + r"((..)|(.)){1}", + r"aaaaaa", + Some((0, 2)), + Some((0, 2)), + Some((0, 2)), + None +); +mat!( + match_repetition_80, + r"((..)|(.)){2}", + r"aaaaaa", + Some((0, 4)), + Some((2, 4)), + Some((2, 4)), + None +); +mat!( + match_repetition_81, + r"((..)|(.)){3}", + r"aaaaaa", + Some((0, 6)), + Some((4, 6)), + Some((4, 6)), + None +); +mat!( + match_repetition_83, + r"((..)|(.))*", + r"aaaaaa", + Some((0, 6)), + Some((4, 6)), + Some((4, 6)), + None +); +mat!( + match_repetition_90, + r"X(.?){0,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_91, + r"X(.?){1,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_92, + r"X(.?){2,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_93, + r"X(.?){3,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_94, + r"X(.?){4,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_95, + r"X(.?){5,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_96, + r"X(.?){6,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_97, + r"X(.?){7,}Y", + r"X1234567Y", + Some((0, 9)), + Some((7, 8)) +); +mat!( + match_repetition_98, + r"X(.?){8,}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_100, + r"X(.?){0,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_102, + r"X(.?){1,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_104, + r"X(.?){2,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_106, + r"X(.?){3,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_108, + r"X(.?){4,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_110, + r"X(.?){5,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_112, + r"X(.?){6,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_114, + r"X(.?){7,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_115, + r"X(.?){8,8}Y", + r"X1234567Y", + Some((0, 9)), + Some((8, 8)) +); +mat!( + match_repetition_126, + r"(a|ab|c|bcd){0,}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_127, + r"(a|ab|c|bcd){1,}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_128, + r"(a|ab|c|bcd){2,}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!( + match_repetition_129, + r"(a|ab|c|bcd){3,}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None); +mat!( + match_repetition_131, + r"(a|ab|c|bcd){0,10}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_132, + r"(a|ab|c|bcd){1,10}(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_133, + r"(a|ab|c|bcd){2,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!( + match_repetition_134, + r"(a|ab|c|bcd){3,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((3, 6)), + Some((6, 6)) +); +mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None); +mat!( + match_repetition_136, + r"(a|ab|c|bcd)*(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_137, + r"(a|ab|c|bcd)+(d*)", + r"ababcd", + Some((0, 1)), + Some((0, 1)), + Some((1, 1)) +); +mat!( + match_repetition_143, + r"(ab|a|c|bcd){0,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_145, + r"(ab|a|c|bcd){1,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_147, + r"(ab|a|c|bcd){2,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_149, + r"(ab|a|c|bcd){3,}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None); +mat!( + match_repetition_152, + r"(ab|a|c|bcd){0,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_154, + r"(ab|a|c|bcd){1,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_156, + r"(ab|a|c|bcd){2,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_158, + r"(ab|a|c|bcd){3,10}(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None); +mat!( + match_repetition_161, + r"(ab|a|c|bcd)*(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); +mat!( + match_repetition_163, + r"(ab|a|c|bcd)+(d*)", + r"ababcd", + Some((0, 6)), + Some((4, 5)), + Some((5, 6)) +); diff --git a/vendor/regex/tests/macros.rs b/vendor/regex/tests/macros.rs new file mode 100644 index 0000000000..e70e9489fd --- /dev/null +++ b/vendor/regex/tests/macros.rs @@ -0,0 +1,160 @@ +// Convenience macros. + +macro_rules! findall { + ($re:expr, $text:expr) => {{ + $re.find_iter(text!($text)) + .map(|m| (m.start(), m.end())).collect::<Vec<_>>() + }} +} + +// Macros for automatically producing tests. + +macro_rules! ismatch { + ($name:ident, $re:expr, $text:expr, $ismatch:expr) => { + #[test] + fn $name() { + let re = regex!($re); + assert_eq!($ismatch, re.is_match(text!($text))); + } + }; +} + +macro_rules! mat( + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<Option<_>> = vec![$($loc)+]; + let r = regex!($re); + let got: Vec<Option<_>> = match r.captures(text) { + Some(c) => { + assert!(r.is_match(text)); + assert!(r.shortest_match(text).is_some()); + r.capture_names() + .enumerate() + .map(|(i, _)| c.get(i).map(|m| (m.start(), m.end()))) + .collect() + } + None => vec![None], + }; + // The test set sometimes leave out capture groups, so truncate + // actual capture groups to match test set. + let mut sgot = &got[..]; + if sgot.len() > expected.len() { + sgot = &sgot[0..expected.len()] + } + if expected != sgot { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, sgot); + } + } + ); +); + +macro_rules! matiter( + ($name:ident, $re:expr, $text:expr) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<(usize, usize)> = vec![]; + let r = regex!($re); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); + if expected != got { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, got); + } + let captures_got: Vec<_> = + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + if captures_got != got { + panic!("For RE '{}' against '{:?}', \ + got '{:?}' using find_iter but got '{:?}' \ + using captures_iter", + $re, text, got, captures_got); + } + } + ); + ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( + #[test] + fn $name() { + let text = text!($text); + let expected: Vec<_> = vec![$($loc)+]; + let r = regex!($re); + let got: Vec<_> = + r.find_iter(text).map(|m| (m.start(), m.end())).collect(); + if expected != got { + panic!("For RE '{}' against '{:?}', \ + expected '{:?}' but got '{:?}'", + $re, text, expected, got); + } + let captures_got: Vec<_> = + r.captures_iter(text) + .map(|c| c.get(0).unwrap()) + .map(|m| (m.start(), m.end())) + .collect(); + if captures_got != got { + panic!("For RE '{}' against '{:?}', \ + got '{:?}' using find_iter but got '{:?}' \ + using captures_iter", + $re, text, got, captures_got); + } + } + ); +); + +macro_rules! matset { + ($name:ident, $res:expr, $text:expr, $($match_index:expr),*) => { + #[test] + fn $name() { + let text = text!($text); + let set = regex_set!($res); + assert!(set.is_match(text)); + let expected = vec![$($match_index),*]; + let matches = set.matches(text); + assert!(matches.matched_any()); + let got: Vec<_> = matches.into_iter().collect(); + assert_eq!(expected, got); + } + } +} + +macro_rules! nomatset { + ($name:ident, $res:expr, $text:expr) => { + #[test] + fn $name() { + let text = text!($text); + let set = regex_set!($res); + assert!(!set.is_match(text)); + let matches = set.matches(text); + assert!(!matches.matched_any()); + assert_eq!(0, matches.into_iter().count()); + } + } +} + +macro_rules! split { + ($name:ident, $re:expr, $text:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let splitted: Vec<_> = re.split(t!($text)).collect(); + assert_eq!($expected, &*splitted); + } + } +} + +macro_rules! splitn { + ($name:ident, $re:expr, $text:expr, $limit:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let splitted: Vec<_> = re.splitn(t!($text), $limit).collect(); + assert_eq!($expected, &*splitted); + } + } +} diff --git a/vendor/regex/tests/macros_bytes.rs b/vendor/regex/tests/macros_bytes.rs new file mode 100644 index 0000000000..3d6c8c3ac8 --- /dev/null +++ b/vendor/regex/tests/macros_bytes.rs @@ -0,0 +1,39 @@ +// Macros for use in writing tests generic over &str/&[u8]. +macro_rules! text { ($text:expr) => { $text.as_bytes() } } +macro_rules! t { ($re:expr) => { text!($re) } } +macro_rules! match_text { ($text:expr) => { $text.as_bytes() } } +macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } } +macro_rules! empty_vec { () => { <Vec<&[u8]>>::new() } } +macro_rules! bytes { ($text:expr) => { $text } } + +macro_rules! no_expand { + ($text:expr) => {{ + use regex::bytes::NoExpand; + NoExpand(text!($text)) + }} +} + +macro_rules! show { + ($text:expr) => {{ + use std::ascii::escape_default; + let mut s = vec![]; + for &b in bytes!($text) { + s.extend(escape_default(b)); + } + String::from_utf8(s).unwrap() + }} +} + +macro_rules! expand { + ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let cap = re.captures(t!($text)).unwrap(); + + let mut got = vec![]; + cap.expand(t!($expand), &mut got); + assert_eq!(show!(t!($expected)), show!(&*got)); + } + } +} diff --git a/vendor/regex/tests/macros_str.rs b/vendor/regex/tests/macros_str.rs new file mode 100644 index 0000000000..7b7eb110c2 --- /dev/null +++ b/vendor/regex/tests/macros_str.rs @@ -0,0 +1,38 @@ +// Macros for use in writing tests generic over &str/&[u8]. +macro_rules! text { ($text:expr) => { $text } } +macro_rules! t { ($text:expr) => { text!($text) } } +macro_rules! match_text { ($text:expr) => { $text.as_str() } } +macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } } +macro_rules! empty_vec { () => { <Vec<&str>>::new() } } +macro_rules! bytes { ($text:expr) => { std::str::from_utf8($text.as_ref()).unwrap() } } + +macro_rules! no_expand { + ($text:expr) => {{ + use regex::NoExpand; + NoExpand(text!($text)) + }} +} + +macro_rules! show { ($text:expr) => { $text } } + +// N.B. The expansion API for &str and &[u8] APIs differs slightly for now, +// but they should be unified in 1.0. Then we can move this macro back into +// tests/api.rs where it is used. ---AG +macro_rules! expand { + ($name:ident, $re:expr, $text:expr, $expand:expr, $expected:expr) => { + #[test] + fn $name() { + let re = regex!($re); + let cap = re.captures(t!($text)).unwrap(); + + let mut got = String::new(); + cap.expand(t!($expand), &mut got); + assert_eq!(show!(t!($expected)), show!(&*got)); + } + } +} + +#[cfg(feature = "pattern")] +macro_rules! searcher_expr { ($e:expr) => ($e) } +#[cfg(not(feature = "pattern"))] +macro_rules! searcher_expr { ($e:expr) => ({}) } diff --git a/vendor/regex/tests/misc.rs b/vendor/regex/tests/misc.rs new file mode 100644 index 0000000000..314811e252 --- /dev/null +++ b/vendor/regex/tests/misc.rs @@ -0,0 +1,4 @@ +mat!(prefix_literal_match, r"^abc", r"abc", Some((0, 3))); +mat!(prefix_literal_nomatch, r"^abc", r"zabc", None); +mat!(one_literal_edge, r"abc", r"xxxxxab", None); +matiter!(terminates, r"a$", r"a", (0, 1)); diff --git a/vendor/regex/tests/multiline.rs b/vendor/regex/tests/multiline.rs new file mode 100644 index 0000000000..62ee47b62b --- /dev/null +++ b/vendor/regex/tests/multiline.rs @@ -0,0 +1,144 @@ +matiter!( + match_multi_1, + r"(?m)^[a-z]+$", + "abc\ndef\nxyz", + (0, 3), + (4, 7), + (8, 11) +); +matiter!(match_multi_2, r"(?m)^$", "abc\ndef\nxyz"); +matiter!(match_multi_3, r"(?m)^", "abc\ndef\nxyz", (0, 0), (4, 4), (8, 8)); +matiter!(match_multi_4, r"(?m)$", "abc\ndef\nxyz", (3, 3), (7, 7), (11, 11)); +matiter!( + match_multi_5, + r"(?m)^[a-z]", + "abc\ndef\nxyz", + (0, 1), + (4, 5), + (8, 9) +); +matiter!(match_multi_6, r"(?m)[a-z]^", "abc\ndef\nxyz"); +matiter!( + match_multi_7, + r"(?m)[a-z]$", + "abc\ndef\nxyz", + (2, 3), + (6, 7), + (10, 11) +); +matiter!(match_multi_8, r"(?m)$[a-z]", "abc\ndef\nxyz"); +matiter!(match_multi_9, r"(?m)^$", "", (0, 0)); + +matiter!( + match_multi_rep_1, + r"(?m)(?:^$)*", + "a\nb\nc", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); +matiter!( + match_multi_rep_2, + r"(?m)(?:^|a)+", + "a\naaa\n", + (0, 0), + (2, 2), + (3, 5), + (6, 6) +); +matiter!( + match_multi_rep_3, + r"(?m)(?:^|a)*", + "a\naaa\n", + (0, 1), + (2, 5), + (6, 6) +); +matiter!( + match_multi_rep_4, + r"(?m)(?:^[a-z])+", + "abc\ndef\nxyz", + (0, 1), + (4, 5), + (8, 9) +); +matiter!( + match_multi_rep_5, + r"(?m)(?:^[a-z]{3}\n?)+", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_6, + r"(?m)(?:^[a-z]{3}\n?)*", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_7, + r"(?m)(?:\n?[a-z]{3}$)+", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_8, + r"(?m)(?:\n?[a-z]{3}$)*", + "abc\ndef\nxyz", + (0, 11) +); +matiter!( + match_multi_rep_9, + r"(?m)^*", + "\naa\n", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4) +); +matiter!(match_multi_rep_10, r"(?m)^+", "\naa\n", (0, 0), (1, 1), (4, 4)); +matiter!( + match_multi_rep_11, + r"(?m)$*", + "\naa\n", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4) +); +matiter!(match_multi_rep_12, r"(?m)$+", "\naa\n", (0, 0), (3, 3), (4, 4)); +matiter!(match_multi_rep_13, r"(?m)(?:$\n)+", "\n\naaa\n\n", (0, 2), (5, 7)); +matiter!( + match_multi_rep_14, + r"(?m)(?:$\n)*", + "\n\naaa\n\n", + (0, 2), + (3, 3), + (4, 4), + (5, 7) +); +matiter!(match_multi_rep_15, r"(?m)(?:$\n^)+", "\n\naaa\n\n", (0, 2), (5, 7)); +matiter!( + match_multi_rep_16, + r"(?m)(?:^|$)+", + "\n\naaa\n\n", + (0, 0), + (1, 1), + (2, 2), + (5, 5), + (6, 6), + (7, 7) +); +matiter!( + match_multi_rep_17, + r"(?m)(?:$\n)*", + "\n\naaa\n\n", + (0, 2), + (3, 3), + (4, 4), + (5, 7) +); diff --git a/vendor/regex/tests/noparse.rs b/vendor/regex/tests/noparse.rs new file mode 100644 index 0000000000..8ded1dce7b --- /dev/null +++ b/vendor/regex/tests/noparse.rs @@ -0,0 +1,45 @@ +macro_rules! noparse( + ($name:ident, $re:expr) => ( + #[test] + fn $name() { + let re = $re; + match regex_new!(re) { + Err(_) => {}, + Ok(_) => panic!("Regex '{}' should cause a parse error.", re), + } + } + ); +); + +noparse!(fail_no_repeat_arg, "*"); +noparse!(fail_incomplete_escape, "\\"); +noparse!(fail_class_incomplete, "[A-"); +noparse!(fail_class_not_closed, "[A"); +noparse!(fail_class_no_begin, r"[\A]"); +noparse!(fail_class_no_end, r"[\z]"); +noparse!(fail_class_no_boundary, r"[\b]"); +noparse!(fail_open_paren, "("); +noparse!(fail_close_paren, ")"); +noparse!(fail_invalid_range, "[a-Z]"); +noparse!(fail_empty_capture_name, "(?P<>a)"); +noparse!(fail_bad_capture_name, "(?P<na-me>)"); +noparse!(fail_bad_flag, "(?a)a"); +noparse!(fail_too_big, "a{10000000}"); +noparse!(fail_counted_no_close, "a{1001"); +noparse!(fail_counted_decreasing, "a{2,1}"); +noparse!(fail_counted_nonnegative, "a{-1,1}"); +noparse!(fail_unfinished_cap, "(?"); +noparse!(fail_unfinished_escape, "\\"); +noparse!(fail_octal_digit, r"\8"); +noparse!(fail_hex_digit, r"\xG0"); +noparse!(fail_hex_short, r"\xF"); +noparse!(fail_hex_long_digits, r"\x{fffg}"); +noparse!(fail_flag_bad, "(?a)"); +noparse!(fail_flag_empty, "(?)"); +noparse!(fail_double_neg, "(?-i-i)"); +noparse!(fail_neg_empty, "(?i-)"); +noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)"); +noparse!(fail_range_end_no_class, "[a-[:lower:]]"); +noparse!(fail_range_end_no_begin, r"[a-\A]"); +noparse!(fail_range_end_no_end, r"[a-\z]"); +noparse!(fail_range_end_no_boundary, r"[a-\b]"); diff --git a/vendor/regex/tests/regression.rs b/vendor/regex/tests/regression.rs new file mode 100644 index 0000000000..e8b2525385 --- /dev/null +++ b/vendor/regex/tests/regression.rs @@ -0,0 +1,222 @@ +// See: https://github.com/rust-lang/regex/issues/48 +#[test] +fn invalid_regexes_no_crash() { + assert!(regex_new!("(*)").is_err()); + assert!(regex_new!("(?:?)").is_err()); + assert!(regex_new!("(?)").is_err()); + assert!(regex_new!("*").is_err()); +} + +// See: https://github.com/rust-lang/regex/issues/98 +#[test] +fn regression_many_repeat_stack_overflow() { + let re = regex!("^.{1,2500}"); + assert_eq!(vec![(0, 1)], findall!(re, "a")); +} + +// See: https://github.com/rust-lang/regex/issues/555 +#[test] +fn regression_invalid_repetition_expr() { + assert!(regex_new!("(?m){1,1}").is_err()); +} + +// See: https://github.com/rust-lang/regex/issues/527 +#[test] +fn regression_invalid_flags_expression() { + assert!(regex_new!("(((?x)))").is_ok()); +} + +// See: https://github.com/rust-lang/regex/issues/75 +mat!(regression_unsorted_binary_search_1, r"(?i-u)[a_]+", "A_", Some((0, 2))); +mat!(regression_unsorted_binary_search_2, r"(?i-u)[A_]+", "a_", Some((0, 2))); + +// See: https://github.com/rust-lang/regex/issues/99 +#[cfg(feature = "unicode-case")] +mat!(regression_negated_char_class_1, r"(?i)[^x]", "x", None); +#[cfg(feature = "unicode-case")] +mat!(regression_negated_char_class_2, r"(?i)[^x]", "X", None); + +// See: https://github.com/rust-lang/regex/issues/101 +mat!(regression_ascii_word_underscore, r"[[:word:]]", "_", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/129 +#[test] +fn regression_captures_rep() { + let re = regex!(r"([a-f]){2}(?P<foo>[x-z])"); + let caps = re.captures(text!("abx")).unwrap(); + assert_eq!(match_text!(caps.name("foo").unwrap()), text!("x")); +} + +// See: https://github.com/rust-lang/regex/issues/153 +mat!(regression_alt_in_alt1, r"ab?|$", "az", Some((0, 1))); +mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3))); + +// See: https://github.com/rust-lang/regex/issues/169 +mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3))); + +// See: https://github.com/rust-lang/regex/issues/76 +#[cfg(all(feature = "unicode-case", feature = "unicode-gencat"))] +mat!(uni_case_lower_nocase_flag, r"(?i)\p{Ll}+", "ΛΘΓΔα", Some((0, 10))); + +// See: https://github.com/rust-lang/regex/issues/191 +mat!(many_alternates, r"1|2|3|4|5|6|7|8|9|10|int", "int", Some((0, 3))); + +// burntsushi was bad and didn't create an issue for this bug. +mat!(anchored_prefix1, r"^a[[:^space:]]", "a ", None); +mat!(anchored_prefix2, r"^a[[:^space:]]", "foo boo a ", None); +mat!(anchored_prefix3, r"^-[a-z]", "r-f", None); + +// See: https://github.com/rust-lang/regex/issues/204 +#[cfg(feature = "unicode-perl")] +split!( + split_on_word_boundary, + r"\b", + r"Should this (work?)", + &[ + t!(""), + t!("Should"), + t!(" "), + t!("this"), + t!(" ("), + t!("work"), + t!("?)") + ] +); +#[cfg(feature = "unicode-perl")] +matiter!( + word_boundary_dfa, + r"\b", + "a b c", + (0, 0), + (1, 1), + (2, 2), + (3, 3), + (4, 4), + (5, 5) +); + +// See: https://github.com/rust-lang/regex/issues/268 +matiter!(partial_anchor, r"^a|b", "ba", (0, 1)); + +// See: https://github.com/rust-lang/regex/issues/280 +ismatch!(partial_anchor_alternate_begin, r"^a|z", "yyyyya", false); +ismatch!(partial_anchor_alternate_end, r"a$|z", "ayyyyy", false); + +// See: https://github.com/rust-lang/regex/issues/289 +mat!(lits_unambiguous1, r"(ABC|CDA|BC)X", "CDAX", Some((0, 4))); + +// See: https://github.com/rust-lang/regex/issues/291 +mat!( + lits_unambiguous2, + r"((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$", + "CIMG2341", + Some((0, 8)), + Some((0, 4)), + None, + Some((0, 4)), + Some((4, 8)) +); + +// See: https://github.com/rust-lang/regex/issues/271 +mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4))); +mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4))); +mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4))); +#[cfg(feature = "unicode-perl")] +mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/321 +ismatch!(strange_anchor_non_complete_prefix, r"a^{2}", "", false); +ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false); + +// See: https://github.com/BurntSushi/ripgrep/issues/1203 +ismatch!(reverse_suffix1, r"[0-4][0-4][0-4]000", "153.230000", true); +ismatch!(reverse_suffix2, r"[0-9][0-9][0-9]000", "153.230000\n", true); +matiter!(reverse_suffix3, r"[0-9][0-9][0-9]000", "153.230000\n", (4, 10)); + +// See: https://github.com/rust-lang/regex/issues/334 +// See: https://github.com/rust-lang/regex/issues/557 +mat!( + captures_after_dfa_premature_end1, + r"a(b*(X|$))?", + "abcbX", + Some((0, 1)), + None, + None +); +mat!( + captures_after_dfa_premature_end2, + r"a(bc*(X|$))?", + "abcbX", + Some((0, 1)), + None, + None +); +mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz", Some((0, 0))); + +// See: https://github.com/rust-lang/regex/issues/437 +ismatch!( + literal_panic, + r"typename type\-parameter\-[0-9]+\-[0-9]+::.+", + "test", + false +); + +// See: https://github.com/rust-lang/regex/issues/533 +ismatch!( + blank_matches_nothing_between_space_and_tab, + r"[[:blank:]]", + "\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\ + \u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\ + \u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}", + false +); + +ismatch!( + inverted_blank_matches_everything_between_space_and_tab, + r"^[[:^blank:]]+$", + "\u{a}\u{b}\u{c}\u{d}\u{e}\u{f}\ + \u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\ + \u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}", + true +); + +// Tests that our Aho-Corasick optimization works correctly. It only +// kicks in when we have >32 literals. By "works correctly," we mean that +// leftmost-first match semantics are properly respected. That is, samwise +// should match, not sam. +mat!( + ahocorasick1, + "samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|\ + A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z", + "samwise", + Some((0, 7)) +); + +// See: https://github.com/BurntSushi/ripgrep/issues/1247 +#[test] +#[cfg(feature = "unicode-perl")] +fn regression_nfa_stops1() { + let re = ::regex::bytes::Regex::new(r"\bs(?:[ab])").unwrap(); + assert_eq!(0, re.find_iter(b"s\xE4").count()); +} + +// See: https://github.com/rust-lang/regex/issues/640 +#[cfg(feature = "unicode-case")] +matiter!( + flags_are_unset, + r"((?i)foo)|Bar", + "foo Foo bar Bar", + (0, 3), + (4, 7), + (12, 15) +); + +// See: https://github.com/rust-lang/regex/issues/659 +// +// Note that 'Ј' is not 'j', but cyrillic Je +// https://en.wikipedia.org/wiki/Je_(Cyrillic) +ismatch!(empty_group_match, r"()Ј01", "zЈ01", true); +matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); + +// See: https://github.com/rust-lang/regex/issues/862 +mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1))); diff --git a/vendor/regex/tests/regression_fuzz.rs b/vendor/regex/tests/regression_fuzz.rs new file mode 100644 index 0000000000..4e76704d2a --- /dev/null +++ b/vendor/regex/tests/regression_fuzz.rs @@ -0,0 +1,31 @@ +// These tests are only run for the "default" test target because some of them +// can take quite a long time. Some of them take long enough that it's not +// practical to run them in debug mode. :-/ + +// See: https://oss-fuzz.com/testcase-detail/5673225499181056 +// +// Ignored by default since it takes too long in debug mode (almost a minute). +#[test] +#[ignore] +fn fuzz1() { + regex!(r"1}{55}{0}*{1}{55}{55}{5}*{1}{55}+{56}|;**"); +} + +// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26505 +// See: https://github.com/rust-lang/regex/issues/722 +#[test] +fn empty_any_errors_no_panic() { + assert!(regex_new!(r"\P{any}").is_err()); +} + +// This tests that a very large regex errors during compilation instead of +// using gratuitous amounts of memory. The specific problem is that the +// compiler wasn't accounting for the memory used by Unicode character classes +// correctly. +// +// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579 +#[test] +fn big_regex_fails_to_compile() { + let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}"; + assert!(regex_new!(pat).is_err()); +} diff --git a/vendor/regex/tests/replace.rs b/vendor/regex/tests/replace.rs new file mode 100644 index 0000000000..d65be072ff --- /dev/null +++ b/vendor/regex/tests/replace.rs @@ -0,0 +1,248 @@ +macro_rules! replace( + ($name:ident, $which:ident, $re:expr, + $search:expr, $replace:expr, $result:expr) => ( + #[test] + fn $name() { + let re = regex!($re); + assert_eq!(re.$which(text!($search), $replace), text!($result)); + } + ); +); + +replace!(first, replace, r"[0-9]", "age: 26", t!("Z"), "age: Z6"); +replace!(plus, replace, r"[0-9]+", "age: 26", t!("Z"), "age: Z"); +replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); +replace!( + groups, + replace, + r"(?-u)(\S+)\s+(\S+)", + "w1 w2", + t!("$2 $1"), + "w2 w1" +); +replace!( + double_dollar, + replace, + r"(?-u)(\S+)\s+(\S+)", + "w1 w2", + t!("$2 $$1"), + "w2 $1" +); +// replace!(adjacent_index, replace, +// r"([^aeiouy])ies$", "skies", t!("$1y"), "sky"); +replace!( + named, + replace_all, + r"(?-u)(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)", + "w1 w2 w3 w4", + t!("$last $first$space"), + "w2 w1 w4 w3" +); +replace!( + trim, + replace_all, + "^[ \t]+|[ \t]+$", + " \t trim me\t \t", + t!(""), + "trim me" +); +replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); +// replace!(number_underscore, replace, r"(.)(.)", "ab", t!("$1_$2"), "a_b"); +replace!( + simple_expand, + replace_all, + r"(?-u)(\w) (\w)", + "a b", + t!("$2 $1"), + "b a" +); +replace!( + literal_dollar1, + replace_all, + r"(?-u)(\w+) (\w+)", + "a b", + t!("$$1"), + "$1" +); +replace!( + literal_dollar2, + replace_all, + r"(?-u)(\w+) (\w+)", + "a b", + t!("$2 $$c $1"), + "b $c a" +); +replace!( + no_expand1, + replace, + r"(?-u)(\S+)\s+(\S+)", + "w1 w2", + no_expand!("$2 $1"), + "$2 $1" +); +replace!( + no_expand2, + replace, + r"(?-u)(\S+)\s+(\S+)", + "w1 w2", + no_expand!("$$1"), + "$$1" +); +use_!(Captures); +replace!( + closure_returning_reference, + replace, + r"([0-9]+)", + "age: 26", + |captures: &Captures<'_>| { + match_text!(captures.get(1).unwrap())[0..1].to_owned() + }, + "age: 2" +); +replace!( + closure_returning_value, + replace, + r"[0-9]+", + "age: 26", + |_captures: &Captures<'_>| t!("Z").to_owned(), + "age: Z" +); + +// See https://github.com/rust-lang/regex/issues/314 +replace!( + match_at_start_replace_with_empty, + replace_all, + r"foo", + "foobar", + t!(""), + "bar" +); + +// See https://github.com/rust-lang/regex/issues/393 +replace!(single_empty_match, replace, r"^", "bar", t!("foo"), "foobar"); + +// See https://github.com/rust-lang/regex/issues/399 +replace!( + capture_longest_possible_name, + replace_all, + r"(.)", + "b", + t!("${1}a $1a"), + "ba " +); + +replace!( + impl_string, + replace, + r"[0-9]", + "age: 26", + t!("Z".to_string()), + "age: Z6" +); +replace!( + impl_string_ref, + replace, + r"[0-9]", + "age: 26", + t!(&"Z".to_string()), + "age: Z6" +); +replace!( + impl_cow_str_borrowed, + replace, + r"[0-9]", + "age: 26", + t!(std::borrow::Cow::<'_, str>::Borrowed("Z")), + "age: Z6" +); +replace!( + impl_cow_str_borrowed_ref, + replace, + r"[0-9]", + "age: 26", + t!(&std::borrow::Cow::<'_, str>::Borrowed("Z")), + "age: Z6" +); +replace!( + impl_cow_str_owned, + replace, + r"[0-9]", + "age: 26", + t!(std::borrow::Cow::<'_, str>::Owned("Z".to_string())), + "age: Z6" +); +replace!( + impl_cow_str_owned_ref, + replace, + r"[0-9]", + "age: 26", + t!(&std::borrow::Cow::<'_, str>::Owned("Z".to_string())), + "age: Z6" +); + +replace!( + impl_vec_u8, + replace, + r"[0-9]", + "age: 26", + bytes!(vec![b'Z']), + "age: Z6" +); +replace!( + impl_vec_u8_ref, + replace, + r"[0-9]", + "age: 26", + bytes!(&vec![b'Z']), + "age: Z6" +); +replace!( + impl_cow_slice_borrowed, + replace, + r"[0-9]", + "age: 26", + bytes!(std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])), + "age: Z6" +); +replace!( + impl_cow_slice_borrowed_ref, + replace, + r"[0-9]", + "age: 26", + bytes!(&std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])), + "age: Z6" +); +replace!( + impl_cow_slice_owned, + replace, + r"[0-9]", + "age: 26", + bytes!(std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])), + "age: Z6" +); +replace!( + impl_cow_slice_owned_ref, + replace, + r"[0-9]", + "age: 26", + bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])), + "age: Z6" +); + +#[test] +fn replacen_no_captures() { + let re = regex!(r"[0-9]"); + assert_eq!( + re.replacen(text!("age: 1234"), 2, t!("Z")), + text!("age: ZZ34") + ); +} + +#[test] +fn replacen_with_captures() { + let re = regex!(r"([0-9])"); + assert_eq!( + re.replacen(text!("age: 1234"), 2, t!("${1}Z")), + text!("age: 1Z2Z34") + ); +} diff --git a/vendor/regex/tests/searcher.rs b/vendor/regex/tests/searcher.rs new file mode 100644 index 0000000000..3779f54c31 --- /dev/null +++ b/vendor/regex/tests/searcher.rs @@ -0,0 +1,95 @@ +macro_rules! searcher { + ($name:ident, $re:expr, $haystack:expr) => ( + searcher!($name, $re, $haystack, vec vec![]); + ); + ($name:ident, $re:expr, $haystack:expr, $($steps:expr,)*) => ( + searcher!($name, $re, $haystack, vec vec![$($steps),*]); + ); + ($name:ident, $re:expr, $haystack:expr, $($steps:expr),*) => ( + searcher!($name, $re, $haystack, vec vec![$($steps),*]); + ); + ($name:ident, $re:expr, $haystack:expr, vec $expect_steps:expr) => ( + #[test] + #[allow(unused_imports)] + fn $name() { + searcher_expr! {{ + use std::str::pattern::{Pattern, Searcher}; + use std::str::pattern::SearchStep::{Match, Reject, Done}; + let re = regex!($re); + let mut se = re.into_searcher($haystack); + let mut got_steps = vec![]; + loop { + match se.next() { + Done => break, + step => { got_steps.push(step); } + } + } + assert_eq!(got_steps, $expect_steps); + }} + } + ); +} + +searcher!(searcher_empty_regex_empty_haystack, r"", "", Match(0, 0)); +searcher!( + searcher_empty_regex, + r"", + "ab", + Match(0, 0), + Reject(0, 1), + Match(1, 1), + Reject(1, 2), + Match(2, 2) +); +searcher!(searcher_empty_haystack, r"\d", ""); +searcher!(searcher_one_match, r"\d", "5", Match(0, 1)); +searcher!(searcher_no_match, r"\d", "a", Reject(0, 1)); +searcher!( + searcher_two_adjacent_matches, + r"\d", + "56", + Match(0, 1), + Match(1, 2) +); +searcher!( + searcher_two_non_adjacent_matches, + r"\d", + "5a6", + Match(0, 1), + Reject(1, 2), + Match(2, 3) +); +searcher!(searcher_reject_first, r"\d", "a6", Reject(0, 1), Match(1, 2)); +searcher!( + searcher_one_zero_length_matches, + r"\d*", + "a1b2", + Match(0, 0), // ^ + Reject(0, 1), // a + Match(1, 2), // a1 + Reject(2, 3), // a1b + Match(3, 4), // a1b2 +); +searcher!( + searcher_many_zero_length_matches, + r"\d*", + "a1bbb2", + Match(0, 0), // ^ + Reject(0, 1), // a + Match(1, 2), // a1 + Reject(2, 3), // a1b + Match(3, 3), // a1bb + Reject(3, 4), // a1bb + Match(4, 4), // a1bbb + Reject(4, 5), // a1bbb + Match(5, 6), // a1bbba +); +searcher!( + searcher_unicode, + r".+?", + "Ⅰ1Ⅱ2", + Match(0, 3), + Match(3, 4), + Match(4, 7), + Match(7, 8) +); diff --git a/vendor/regex/tests/set.rs b/vendor/regex/tests/set.rs new file mode 100644 index 0000000000..37fcf8700c --- /dev/null +++ b/vendor/regex/tests/set.rs @@ -0,0 +1,67 @@ +matset!(set1, &["a", "a"], "a", 0, 1); +matset!(set2, &["a", "a"], "ba", 0, 1); +matset!(set3, &["a", "b"], "a", 0); +matset!(set4, &["a", "b"], "b", 1); +matset!(set5, &["a|b", "b|a"], "b", 0, 1); +matset!(set6, &["foo", "oo"], "foo", 0, 1); +matset!(set7, &["^foo", "bar$"], "foo", 0); +matset!(set8, &["^foo", "bar$"], "foo bar", 0, 1); +matset!(set9, &["^foo", "bar$"], "bar", 1); +matset!(set10, &[r"[a-z]+$", "foo"], "01234 foo", 0, 1); +matset!(set11, &[r"[a-z]+$", "foo"], "foo 01234", 1); +matset!(set12, &[r".*?", "a"], "zzzzzza", 0, 1); +matset!(set13, &[r".*", "a"], "zzzzzza", 0, 1); +matset!(set14, &[r".*", "a"], "zzzzzz", 0); +matset!(set15, &[r"(?-u)\ba\b"], "hello a bye", 0); +matset!(set16, &["a"], "a", 0); +matset!(set17, &[".*a"], "a", 0); +matset!(set18, &["a", "β"], "β", 1); + +// regexes that match the empty string +matset!(setempty1, &["", "a"], "abc", 0, 1); +matset!(setempty2, &["", "b"], "abc", 0, 1); +matset!(setempty3, &["", "z"], "abc", 0); +matset!(setempty4, &["a", ""], "abc", 0, 1); +matset!(setempty5, &["b", ""], "abc", 0, 1); +matset!(setempty6, &["z", ""], "abc", 1); +matset!(setempty7, &["b", "(?:)"], "abc", 0, 1); +matset!(setempty8, &["(?:)", "b"], "abc", 0, 1); +matset!(setempty9, &["c(?:)", "b"], "abc", 0, 1); + +nomatset!(nset1, &["a", "a"], "b"); +nomatset!(nset2, &["^foo", "bar$"], "bar foo"); +nomatset!( + nset3, + { + let xs: &[&str] = &[]; + xs + }, + "a" +); +nomatset!(nset4, &[r"^rooted$", r"\.log$"], "notrooted"); + +// See: https://github.com/rust-lang/regex/issues/187 +#[test] +fn regression_subsequent_matches() { + let set = regex_set!(&["ab", "b"]); + let text = text!("ba"); + assert!(set.matches(text).matched(1)); + assert!(set.matches(text).matched(1)); +} + +#[test] +fn get_set_patterns() { + let set = regex_set!(&["a", "b"]); + assert_eq!(vec!["a", "b"], set.patterns()); +} + +#[test] +fn len_and_empty() { + let empty = regex_set!(&[""; 0]); + assert_eq!(empty.len(), 0); + assert!(empty.is_empty()); + + let not_empty = regex_set!(&["ab", "b"]); + assert_eq!(not_empty.len(), 2); + assert!(!not_empty.is_empty()); +} diff --git a/vendor/regex/tests/shortest_match.rs b/vendor/regex/tests/shortest_match.rs new file mode 100644 index 0000000000..f8b4fed156 --- /dev/null +++ b/vendor/regex/tests/shortest_match.rs @@ -0,0 +1,14 @@ +macro_rules! shortmat { + ($name:ident, $re:expr, $text:expr, $shortest_match:expr) => { + #[test] + fn $name() { + let text = text!($text); + let re = regex!($re); + assert_eq!($shortest_match, re.shortest_match(text)); + } + }; +} + +shortmat!(t01, r"a+", r"aa", Some(1)); +// Test that the reverse suffix optimization gets it right. +shortmat!(t02, r".*(?:abcd)+", r"abcdabcd", Some(4)); diff --git a/vendor/regex/tests/suffix_reverse.rs b/vendor/regex/tests/suffix_reverse.rs new file mode 100644 index 0000000000..774c9e85f0 --- /dev/null +++ b/vendor/regex/tests/suffix_reverse.rs @@ -0,0 +1,6 @@ +mat!(t01, r".*abcd", r"abcd", Some((0, 4))); +mat!(t02, r".*(?:abcd)+", r"abcd", Some((0, 4))); +mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8))); +mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9))); +mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9))); +mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9))); diff --git a/vendor/regex/tests/test_backtrack.rs b/vendor/regex/tests/test_backtrack.rs new file mode 100644 index 0000000000..fb934e2d8f --- /dev/null +++ b/vendor/regex/tests/test_backtrack.rs @@ -0,0 +1,56 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .build() + .map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .build() + .map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/vendor/regex/tests/test_backtrack_bytes.rs b/vendor/regex/tests/test_backtrack_bytes.rs new file mode 100644 index 0000000000..a59426c949 --- /dev/null +++ b/vendor/regex/tests/test_backtrack_bytes.rs @@ -0,0 +1,55 @@ +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_ascii; diff --git a/vendor/regex/tests/test_backtrack_utf8bytes.rs b/vendor/regex/tests/test_backtrack_utf8bytes.rs new file mode 100644 index 0000000000..6d308e9e1c --- /dev/null +++ b/vendor/regex/tests/test_backtrack_utf8bytes.rs @@ -0,0 +1,58 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .bounded_backtracking() + .bytes(true) + .build() + .map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/vendor/regex/tests/test_crates_regex.rs b/vendor/regex/tests/test_crates_regex.rs new file mode 100644 index 0000000000..a681604727 --- /dev/null +++ b/vendor/regex/tests/test_crates_regex.rs @@ -0,0 +1,54 @@ +/* + * This test is a minimal version of <rofl_0> and <subdiff_0> + * + * Once this bug gets fixed, uncomment rofl_0 and subdiff_0 + * (in `tests/crates_regex.rs`). +#[test] +fn word_boundary_backtracking_default_mismatch() { + use regex::internal::ExecBuilder; + + let backtrack_re = ExecBuilder::new(r"\b") + .bounded_backtracking() + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_re = ExecBuilder::new(r"\b") + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = "䅅\\u{a0}"; + + let fi1 = backtrack_re.find_iter(input); + let fi2 = default_re.find_iter(input); + for (m1, m2) in fi1.zip(fi2) { + assert_eq!(m1, m2); + } +} +*/ + +mod consistent; + +mod crates_regex { + + macro_rules! consistent { + ($test_name:ident, $regex_src:expr) => { + #[test] + fn $test_name() { + use super::consistent::backends_are_consistent; + + if option_env!("RUST_REGEX_RANDOM_TEST").is_some() { + match backends_are_consistent($regex_src) { + Ok(_) => {} + Err(err) => panic!("{}", err), + } + } + } + }; + } + + include!("crates_regex.rs"); +} diff --git a/vendor/regex/tests/test_default.rs b/vendor/regex/tests/test_default.rs new file mode 100644 index 0000000000..19a319af11 --- /dev/null +++ b/vendor/regex/tests/test_default.rs @@ -0,0 +1,232 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +use regex; + +// Due to macro scoping rules, this definition only applies for the modules +// defined below. Effectively, it allows us to use the same tests for both +// native and dynamic regexes. +// +// This is also used to test the various matching engines. This one exercises +// the normal code path which automatically chooses the engine based on the +// regex and the input. Other dynamic tests explicitly set the engine to use. +macro_rules! regex_new { + ($re:expr) => {{ + use regex::Regex; + Regex::new($re) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::RegexSet; + RegexSet::new($re) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod misc; +mod multiline; +mod noparse; +mod regression; +mod regression_fuzz; +mod replace; +mod searcher; +mod set; +mod shortest_match; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; + +#[test] +fn disallow_non_utf8() { + assert!(regex::Regex::new(r"(?-u)\xFF").is_err()); + assert!(regex::Regex::new(r"(?-u).").is_err()); + assert!(regex::Regex::new(r"(?-u)[\xFF]").is_err()); + assert!(regex::Regex::new(r"(?-u)☃").is_err()); +} + +#[test] +fn disallow_octal() { + assert!(regex::Regex::new(r"\0").is_err()); +} + +#[test] +fn allow_octal() { + assert!(regex::RegexBuilder::new(r"\0").octal(true).build().is_ok()); +} + +#[test] +fn oibits() { + use regex::bytes; + use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder}; + use std::panic::{RefUnwindSafe, UnwindSafe}; + + fn assert_send<T: Send>() {} + fn assert_sync<T: Sync>() {} + fn assert_unwind_safe<T: UnwindSafe>() {} + fn assert_ref_unwind_safe<T: RefUnwindSafe>() {} + + assert_send::<Regex>(); + assert_sync::<Regex>(); + assert_unwind_safe::<Regex>(); + assert_ref_unwind_safe::<Regex>(); + assert_send::<RegexBuilder>(); + assert_sync::<RegexBuilder>(); + assert_unwind_safe::<RegexBuilder>(); + assert_ref_unwind_safe::<RegexBuilder>(); + + assert_send::<bytes::Regex>(); + assert_sync::<bytes::Regex>(); + assert_unwind_safe::<bytes::Regex>(); + assert_ref_unwind_safe::<bytes::Regex>(); + assert_send::<bytes::RegexBuilder>(); + assert_sync::<bytes::RegexBuilder>(); + assert_unwind_safe::<bytes::RegexBuilder>(); + assert_ref_unwind_safe::<bytes::RegexBuilder>(); + + assert_send::<RegexSet>(); + assert_sync::<RegexSet>(); + assert_unwind_safe::<RegexSet>(); + assert_ref_unwind_safe::<RegexSet>(); + assert_send::<RegexSetBuilder>(); + assert_sync::<RegexSetBuilder>(); + assert_unwind_safe::<RegexSetBuilder>(); + assert_ref_unwind_safe::<RegexSetBuilder>(); + + assert_send::<bytes::RegexSet>(); + assert_sync::<bytes::RegexSet>(); + assert_unwind_safe::<bytes::RegexSet>(); + assert_ref_unwind_safe::<bytes::RegexSet>(); + assert_send::<bytes::RegexSetBuilder>(); + assert_sync::<bytes::RegexSetBuilder>(); + assert_unwind_safe::<bytes::RegexSetBuilder>(); + assert_ref_unwind_safe::<bytes::RegexSetBuilder>(); +} + +// See: https://github.com/rust-lang/regex/issues/568 +#[test] +fn oibits_regression() { + use regex::Regex; + use std::panic; + + let _ = panic::catch_unwind(|| Regex::new("a").unwrap()); +} + +// See: https://github.com/rust-lang/regex/issues/750 +#[test] +#[cfg(target_pointer_width = "64")] +fn regex_is_reasonably_small() { + use std::mem::size_of; + + use regex::bytes; + use regex::{Regex, RegexSet}; + + assert_eq!(16, size_of::<Regex>()); + assert_eq!(16, size_of::<RegexSet>()); + assert_eq!(16, size_of::<bytes::Regex>()); + assert_eq!(16, size_of::<bytes::RegexSet>()); +} + +// See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 +// See: CVE-2022-24713 +// +// We test that our regex compiler will correctly return a "too big" error when +// we try to use a very large repetition on an *empty* sub-expression. +// +// At the time this test was written, the regex compiler does not represent +// empty sub-expressions with any bytecode instructions. In effect, it's an +// "optimization" to leave them out, since they would otherwise correspond +// to an unconditional JUMP in the regex bytecode (i.e., an unconditional +// epsilon transition in the NFA graph). Therefore, an empty sub-expression +// represents an interesting case for the compiler's size limits. Since it +// doesn't actually contribute any additional memory to the compiled regex +// instructions, the size limit machinery never detects it. Instead, it just +// dumbly tries to compile the empty sub-expression N times, where N is the +// repetition size. +// +// When N is very large, this will cause the compiler to essentially spin and +// do nothing for a decently large amount of time. It causes the regex to take +// quite a bit of time to compile, despite the concrete syntax of the regex +// being quite small. +// +// The degree to which this is actually a problem is somewhat of a judgment +// call. Some regexes simply take a long time to compile. But in general, you +// should be able to reasonably control this by setting lower or higher size +// limits on the compiled object size. But this mitigation doesn't work at all +// for this case. +// +// This particular test is somewhat narrow. It merely checks that regex +// compilation will, at some point, return a "too big" error. Before the +// fix landed, this test would eventually fail because the regex would be +// successfully compiled (after enough time elapsed). So while this test +// doesn't check that we exit in a reasonable amount of time, it does at least +// check that we are properly returning an error at some point. +#[test] +fn big_empty_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){4294967295}"); + assert!(result.is_err()); +} + +// Below is a "billion laughs" variant of the previous test case. +#[test] +fn big_empty_reps_chain_regex_fails() { + use regex::Regex; + + let result = Regex::new("(?:){64}{64}{64}{64}{64}{64}"); + assert!(result.is_err()); +} + +// Below is another situation where a zero-length sub-expression can be +// introduced. +#[test] +fn big_zero_reps_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"x{0}{4294967295}"); + assert!(result.is_err()); +} + +// Testing another case for completeness. +#[test] +fn empty_alt_regex_fails() { + use regex::Regex; + + let result = Regex::new(r"(?:|){4294967295}"); + assert!(result.is_err()); +} + +// Regression test for: https://github.com/rust-lang/regex/issues/969 +#[test] +fn regression_i969() { + use regex::Regex; + + let re = Regex::new(r"c.*d\z").unwrap(); + assert_eq!(Some(6), re.shortest_match_at("ababcd", 4)); + assert_eq!(Some(6), re.find_at("ababcd", 4).map(|m| m.end())); +} diff --git a/vendor/regex/tests/test_default_bytes.rs b/vendor/regex/tests/test_default_bytes.rs new file mode 100644 index 0000000000..f200596ba1 --- /dev/null +++ b/vendor/regex/tests/test_default_bytes.rs @@ -0,0 +1,75 @@ +macro_rules! regex_new { + ($re:expr) => {{ + use regex::bytes::Regex; + Regex::new($re) + }}; +} + +macro_rules! regex_set_new { + ($res:expr) => {{ + use regex::bytes::RegexSet; + RegexSet::new($res) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +// A silly wrapper to make it possible to write and match raw bytes. +struct R<'a>(&'a [u8]); +impl<'a> R<'a> { + fn as_bytes(&self) -> &'a [u8] { + self.0 + } +} + +// See: https://github.com/rust-lang/regex/issues/321 +// +// These tests are here because they do not have the same behavior in every +// regex engine. +mat!(invalid_utf8_nfa1, r".", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), Some((2, 3))); +mat!(invalid_utf8_nfa2, r"${2}ä", R(b"\xD4\xC2\x65\x2B\x0E\xFE"), None); +mat!( + invalid_utf8_nfa3, + r".", + R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"), + Some((1, 3)) +); +mat!( + invalid_utf8_nfa4, + r"${2}ä", + R(b"\x0A\xDB\x82\x6E\x33\x01\xDD\x33\xCD"), + None +); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod shortest_match; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/vendor/regex/tests/test_nfa.rs b/vendor/regex/tests/test_nfa.rs new file mode 100644 index 0000000000..e5a67d180a --- /dev/null +++ b/vendor/regex/tests/test_nfa.rs @@ -0,0 +1,50 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).nfa().build().map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re).nfa().build().map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/vendor/regex/tests/test_nfa_bytes.rs b/vendor/regex/tests/test_nfa_bytes.rs new file mode 100644 index 0000000000..0a10e032a2 --- /dev/null +++ b/vendor/regex/tests/test_nfa_bytes.rs @@ -0,0 +1,55 @@ +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re) + .nfa() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .nfa() + .only_utf8(false) + .build() + .map(|e| e.into_byte_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_bytes.rs"); +include!("macros.rs"); + +mod api; +mod bytes; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/vendor/regex/tests/test_nfa_utf8bytes.rs b/vendor/regex/tests/test_nfa_utf8bytes.rs new file mode 100644 index 0000000000..36a572b5fc --- /dev/null +++ b/vendor/regex/tests/test_nfa_utf8bytes.rs @@ -0,0 +1,54 @@ +#![cfg_attr(feature = "pattern", feature(pattern))] + +macro_rules! regex_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new($re).nfa().bytes(true).build().map(|e| e.into_regex()) + }}; +} + +macro_rules! regex { + ($re:expr) => { + regex_new!($re).unwrap() + }; +} + +macro_rules! regex_set_new { + ($re:expr) => {{ + use regex::internal::ExecBuilder; + ExecBuilder::new_many($re) + .nfa() + .bytes(true) + .build() + .map(|e| e.into_regex_set()) + }}; +} + +macro_rules! regex_set { + ($res:expr) => { + regex_set_new!($res).unwrap() + }; +} + +// Must come before other module definitions. +include!("macros_str.rs"); +include!("macros.rs"); + +mod api; +mod api_str; +mod crazy; +mod flags; +mod fowler; +mod multiline; +mod noparse; +mod regression; +mod replace; +mod searcher; +mod set; +mod suffix_reverse; +#[cfg(feature = "unicode")] +mod unicode; +#[cfg(feature = "unicode-perl")] +mod word_boundary; +#[cfg(feature = "unicode-perl")] +mod word_boundary_unicode; diff --git a/vendor/regex/tests/unicode.rs b/vendor/regex/tests/unicode.rs new file mode 100644 index 0000000000..9b32286247 --- /dev/null +++ b/vendor/regex/tests/unicode.rs @@ -0,0 +1,251 @@ +mat!(uni_literal, r"☃", "☃", Some((0, 3))); +mat!(uni_literal_plus, r"☃+", "☃", Some((0, 3))); +mat!(uni_literal_casei_plus, r"(?i)☃+", "☃", Some((0, 3))); +mat!(uni_class_plus, r"[☃Ⅰ]+", "☃", Some((0, 3))); +mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))); +mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))); +mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))); +mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))); +mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))); +mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))); +mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))); +mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))); +mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))); + +// Test the Unicode friendliness of Perl character classes. +mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))); +mat!(uni_perl_w_not, r"\w+", "⥡", None); +mat!(uni_perl_w_neg, r"\W+", "⥡", Some((0, 3))); +mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))); +mat!(uni_perl_d_not, r"\d+", "Ⅱ", None); +mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))); +mat!(uni_perl_s, r"\s+", " ", Some((0, 3))); +mat!(uni_perl_s_not, r"\s+", "☃", None); +mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))); + +// And do the same for word boundaries. +mat!(uni_boundary_none, r"\d\b", "6δ", None); +mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))); +mat!(uni_not_boundary_none, r"\d\B", "6δ", Some((0, 1))); +mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); + +// Test general categories. +// +// We should test more, but there's a lot. Write a script to generate more of +// these tests. +mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); +mat!( + uni_class_gencat_close_punctuation, + r"\p{Close_Punctuation}", + "❯", + Some((0, 3)) +); +mat!( + uni_class_gencat_connector_punctuation, + r"\p{Connector_Punctuation}", + "⁀", + Some((0, 3)) +); +mat!(uni_class_gencat_control, r"\p{Control}", "\u{9f}", Some((0, 2))); +mat!( + uni_class_gencat_currency_symbol, + r"\p{Currency_Symbol}", + "£", + Some((0, 3)) +); +mat!( + uni_class_gencat_dash_punctuation, + r"\p{Dash_Punctuation}", + "〰", + Some((0, 3)) +); +mat!(uni_class_gencat_decimal_numer, r"\p{Decimal_Number}", "𑓙", Some((0, 4))); +mat!( + uni_class_gencat_enclosing_mark, + r"\p{Enclosing_Mark}", + "\u{A672}", + Some((0, 3)) +); +mat!( + uni_class_gencat_final_punctuation, + r"\p{Final_Punctuation}", + "⸡", + Some((0, 3)) +); +mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); +// See: https://github.com/rust-lang/regex/issues/719 +mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); +mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); +mat!( + uni_class_gencat_initial_punctuation, + r"\p{Initial_Punctuation}", + "⸜", + Some((0, 3)) +); +mat!(uni_class_gencat_letter, r"\p{Letter}", "Έ", Some((0, 2))); +mat!(uni_class_gencat_letter_number, r"\p{Letter_Number}", "ↂ", Some((0, 3))); +mat!( + uni_class_gencat_line_separator, + r"\p{Line_Separator}", + "\u{2028}", + Some((0, 3)) +); +mat!( + uni_class_gencat_lowercase_letter, + r"\p{Lowercase_Letter}", + "ϛ", + Some((0, 2)) +); +mat!(uni_class_gencat_mark, r"\p{Mark}", "\u{E01EF}", Some((0, 4))); +mat!(uni_class_gencat_math, r"\p{Math}", "⋿", Some((0, 3))); +mat!( + uni_class_gencat_modifier_letter, + r"\p{Modifier_Letter}", + "𖭃", + Some((0, 4)) +); +mat!( + uni_class_gencat_modifier_symbol, + r"\p{Modifier_Symbol}", + "🏿", + Some((0, 4)) +); +mat!( + uni_class_gencat_nonspacing_mark, + r"\p{Nonspacing_Mark}", + "\u{1E94A}", + Some((0, 4)) +); +mat!(uni_class_gencat_number, r"\p{Number}", "⓿", Some((0, 3))); +mat!( + uni_class_gencat_open_punctuation, + r"\p{Open_Punctuation}", + "⦅", + Some((0, 3)) +); +mat!(uni_class_gencat_other, r"\p{Other}", "\u{bc9}", Some((0, 3))); +mat!(uni_class_gencat_other_letter, r"\p{Other_Letter}", "ꓷ", Some((0, 3))); +mat!(uni_class_gencat_other_number, r"\p{Other_Number}", "㉏", Some((0, 3))); +mat!( + uni_class_gencat_other_punctuation, + r"\p{Other_Punctuation}", + "𞥞", + Some((0, 4)) +); +mat!(uni_class_gencat_other_symbol, r"\p{Other_Symbol}", "⅌", Some((0, 3))); +mat!( + uni_class_gencat_paragraph_separator, + r"\p{Paragraph_Separator}", + "\u{2029}", + Some((0, 3)) +); +mat!( + uni_class_gencat_private_use, + r"\p{Private_Use}", + "\u{10FFFD}", + Some((0, 4)) +); +mat!(uni_class_gencat_punctuation, r"\p{Punctuation}", "𑁍", Some((0, 4))); +mat!(uni_class_gencat_separator, r"\p{Separator}", "\u{3000}", Some((0, 3))); +mat!( + uni_class_gencat_space_separator, + r"\p{Space_Separator}", + "\u{205F}", + Some((0, 3)) +); +mat!( + uni_class_gencat_spacing_mark, + r"\p{Spacing_Mark}", + "\u{16F7E}", + Some((0, 4)) +); +mat!(uni_class_gencat_symbol, r"\p{Symbol}", "⯈", Some((0, 3))); +mat!( + uni_class_gencat_titlecase_letter, + r"\p{Titlecase_Letter}", + "ῼ", + Some((0, 3)) +); +mat!( + uni_class_gencat_unassigned, + r"\p{Unassigned}", + "\u{10FFFF}", + Some((0, 4)) +); +mat!( + uni_class_gencat_uppercase_letter, + r"\p{Uppercase_Letter}", + "Ꝋ", + Some((0, 3)) +); + +// Test a smattering of properties. +mat!(uni_class_prop_emoji1, r"\p{Emoji}", "\u{23E9}", Some((0, 3))); +mat!(uni_class_prop_emoji2, r"\p{emoji}", "\u{1F21A}", Some((0, 4))); +mat!( + uni_class_prop_picto1, + r"\p{extendedpictographic}", + "\u{1FA6E}", + Some((0, 4)) +); +mat!( + uni_class_prop_picto2, + r"\p{extendedpictographic}", + "\u{1FFFD}", + Some((0, 4)) +); + +// grapheme_cluster_break +mat!( + uni_class_gcb_prepend, + r"\p{grapheme_cluster_break=prepend}", + "\u{11D46}", + Some((0, 4)) +); +mat!( + uni_class_gcb_ri1, + r"\p{gcb=regional_indicator}", + "\u{1F1E6}", + Some((0, 4)) +); +mat!(uni_class_gcb_ri2, r"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4))); +mat!( + uni_class_gcb_ri3, + r"\p{gcb=regionalindicator}", + "\u{1F1FF}", + Some((0, 4)) +); +mat!(uni_class_gcb_lvt, r"\p{gcb=lvt}", "\u{C989}", Some((0, 3))); +mat!(uni_class_gcb_zwj, r"\p{gcb=zwj}", "\u{200D}", Some((0, 3))); + +// word_break +mat!(uni_class_wb1, r"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3))); +mat!(uni_class_wb2, r"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3))); +mat!(uni_class_wb3, r"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3))); +mat!(uni_class_wb4, r"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3))); +mat!(uni_class_wb5, r"\p{wb=numeric}", "\u{1E950}", Some((0, 4))); + +// sentence_break +mat!(uni_class_sb1, r"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2))); +mat!(uni_class_sb2, r"\p{sb=lower}", "\u{0469}", Some((0, 2))); +mat!(uni_class_sb3, r"\p{sb=Close}", "\u{FF60}", Some((0, 3))); +mat!(uni_class_sb4, r"\p{sb=Close}", "\u{1F677}", Some((0, 4))); +mat!(uni_class_sb5, r"\p{sb=SContinue}", "\u{FF64}", Some((0, 3))); + +// Test 'Vithkuqi' support, which was added in Unicode 14. +// See: https://github.com/rust-lang/regex/issues/877 +mat!( + uni_vithkuqi_literal_upper, + r"(?i)^\u{10570}$", + "\u{10570}", + Some((0, 4)) +); +mat!( + uni_vithkuqi_literal_lower, + r"(?i)^\u{10570}$", + "\u{10597}", + Some((0, 4)) +); +mat!(uni_vithkuqi_word_upper, r"^\w$", "\u{10570}", Some((0, 4))); +mat!(uni_vithkuqi_word_lower, r"^\w$", "\u{10597}", Some((0, 4))); diff --git a/vendor/regex/tests/word_boundary.rs b/vendor/regex/tests/word_boundary.rs new file mode 100644 index 0000000000..7fe97a2974 --- /dev/null +++ b/vendor/regex/tests/word_boundary.rs @@ -0,0 +1,89 @@ +// Many of these are cribbed from RE2's test suite. + +matiter!(wb1, r"\b", ""); +matiter!(wb2, r"\b", "a", (0, 0), (1, 1)); +matiter!(wb3, r"\b", "ab", (0, 0), (2, 2)); +matiter!(wb4, r"^\b", "ab", (0, 0)); +matiter!(wb5, r"\b$", "ab", (2, 2)); +matiter!(wb6, r"^\b$", "ab"); +matiter!(wb7, r"\bbar\b", "nobar bar foo bar", (6, 9), (14, 17)); +matiter!(wb8, r"a\b", "faoa x", (3, 4)); +matiter!(wb9, r"\bbar", "bar x", (0, 3)); +matiter!(wb10, r"\bbar", "foo\nbar x", (4, 7)); +matiter!(wb11, r"bar\b", "foobar", (3, 6)); +matiter!(wb12, r"bar\b", "foobar\nxxx", (3, 6)); +matiter!(wb13, r"(foo|bar|[A-Z])\b", "foo", (0, 3)); +matiter!(wb14, r"(foo|bar|[A-Z])\b", "foo\n", (0, 3)); +matiter!(wb15, r"\b(foo|bar|[A-Z])", "foo", (0, 3)); +matiter!(wb16, r"\b(foo|bar|[A-Z])\b", "X", (0, 1)); +matiter!(wb17, r"\b(foo|bar|[A-Z])\b", "XY"); +matiter!(wb18, r"\b(foo|bar|[A-Z])\b", "bar", (0, 3)); +matiter!(wb19, r"\b(foo|bar|[A-Z])\b", "foo", (0, 3)); +matiter!(wb20, r"\b(foo|bar|[A-Z])\b", "foo\n", (0, 3)); +matiter!(wb21, r"\b(foo|bar|[A-Z])\b", "ffoo bbar N x", (10, 11)); +matiter!(wb22, r"\b(fo|foo)\b", "fo", (0, 2)); +matiter!(wb23, r"\b(fo|foo)\b", "foo", (0, 3)); +matiter!(wb24, r"\b\b", ""); +matiter!(wb25, r"\b\b", "a", (0, 0), (1, 1)); +matiter!(wb26, r"\b$", ""); +matiter!(wb27, r"\b$", "x", (1, 1)); +matiter!(wb28, r"\b$", "y x", (3, 3)); +matiter!(wb29, r"\b.$", "x", (0, 1)); +matiter!(wb30, r"^\b(fo|foo)\b", "fo", (0, 2)); +matiter!(wb31, r"^\b(fo|foo)\b", "foo", (0, 3)); +matiter!(wb32, r"^\b$", ""); +matiter!(wb33, r"^\b$", "x"); +matiter!(wb34, r"^\b.$", "x", (0, 1)); +matiter!(wb35, r"^\b.\b$", "x", (0, 1)); +matiter!(wb36, r"^^^^^\b$$$$$", ""); +matiter!(wb37, r"^^^^^\b.$$$$$", "x", (0, 1)); +matiter!(wb38, r"^^^^^\b$$$$$", "x"); +matiter!(wb39, r"^^^^^\b\b\b.\b\b\b$$$$$", "x", (0, 1)); +matiter!(wb40, r"\b.+\b", "$$abc$$", (2, 5)); +matiter!(wb41, r"\b", "a b c", (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)); + +matiter!(nb1, r"\Bfoo\B", "n foo xfoox that", (7, 10)); +matiter!(nb2, r"a\B", "faoa x", (1, 2)); +matiter!(nb3, r"\Bbar", "bar x"); +matiter!(nb4, r"\Bbar", "foo\nbar x"); +matiter!(nb5, r"bar\B", "foobar"); +matiter!(nb6, r"bar\B", "foobar\nxxx"); +matiter!(nb7, r"(foo|bar|[A-Z])\B", "foox", (0, 3)); +matiter!(nb8, r"(foo|bar|[A-Z])\B", "foo\n"); +matiter!(nb9, r"\B", "", (0, 0)); +matiter!(nb10, r"\B", "x"); +matiter!(nb11, r"\B(foo|bar|[A-Z])", "foo"); +matiter!(nb12, r"\B(foo|bar|[A-Z])\B", "xXy", (1, 2)); +matiter!(nb13, r"\B(foo|bar|[A-Z])\B", "XY"); +matiter!(nb14, r"\B(foo|bar|[A-Z])\B", "XYZ", (1, 2)); +matiter!(nb15, r"\B(foo|bar|[A-Z])\B", "abara", (1, 4)); +matiter!(nb16, r"\B(foo|bar|[A-Z])\B", "xfoo_", (1, 4)); +matiter!(nb17, r"\B(foo|bar|[A-Z])\B", "xfoo\n"); +matiter!(nb18, r"\B(foo|bar|[A-Z])\B", "foo bar vNX", (9, 10)); +matiter!(nb19, r"\B(fo|foo)\B", "xfoo", (1, 3)); +matiter!(nb20, r"\B(foo|fo)\B", "xfooo", (1, 4)); +matiter!(nb21, r"\B\B", "", (0, 0)); +matiter!(nb22, r"\B\B", "x"); +matiter!(nb23, r"\B$", "", (0, 0)); +matiter!(nb24, r"\B$", "x"); +matiter!(nb25, r"\B$", "y x"); +matiter!(nb26, r"\B.$", "x"); +matiter!(nb27, r"^\B(fo|foo)\B", "fo"); +matiter!(nb28, r"^\B(fo|foo)\B", "foo"); +matiter!(nb29, r"^\B", "", (0, 0)); +matiter!(nb30, r"^\B", "x"); +matiter!(nb31, r"^\B\B", "", (0, 0)); +matiter!(nb32, r"^\B\B", "x"); +matiter!(nb33, r"^\B$", "", (0, 0)); +matiter!(nb34, r"^\B$", "x"); +matiter!(nb35, r"^\B.$", "x"); +matiter!(nb36, r"^\B.\B$", "x"); +matiter!(nb37, r"^^^^^\B$$$$$", "", (0, 0)); +matiter!(nb38, r"^^^^^\B.$$$$$", "x"); +matiter!(nb39, r"^^^^^\B$$$$$", "x"); + +// These work for both Unicode and ASCII because all matches are reported as +// byte offsets, and « and » do not correspond to word boundaries at either +// the character or byte level. +matiter!(unicode1, r"\bx\b", "«x", (2, 3)); +matiter!(unicode2, r"\bx\b", "x»", (0, 1)); diff --git a/vendor/regex/tests/word_boundary_ascii.rs b/vendor/regex/tests/word_boundary_ascii.rs new file mode 100644 index 0000000000..5a3cf1166c --- /dev/null +++ b/vendor/regex/tests/word_boundary_ascii.rs @@ -0,0 +1,9 @@ +// ASCII word boundaries are completely oblivious to Unicode characters. +// For Unicode word boundaries, the tests are precisely inverted. +matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); +matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); + +// We still get Unicode word boundaries by default in byte regexes. +matiter!(unicode1, r"\bx\b", "áxβ"); +matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); diff --git a/vendor/regex/tests/word_boundary_unicode.rs b/vendor/regex/tests/word_boundary_unicode.rs new file mode 100644 index 0000000000..c41355ffc4 --- /dev/null +++ b/vendor/regex/tests/word_boundary_unicode.rs @@ -0,0 +1,6 @@ +// Unicode word boundaries know about Unicode characters. +// For ASCII word boundaries, the tests are precisely inverted. +matiter!(unicode1, r"\bx\b", "áxβ"); +matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); + +matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));