From 309e5432adb0156f8e23d07b27ecb2197eeed59b Mon Sep 17 00:00:00 2001 From: Jack Wrenn Date: Sat, 14 Mar 2026 14:25:27 +0000 Subject: [PATCH 1/3] [codegen] Break build dependency between tests and lib Since codegen test artifacts are rendered in crate documentation, changes to those artifacts place the crate source in a dirty state. This effectively serializes codegen blessing, since cargo must rebuild `zerocopy` between each test. We correct this by stubbing out `codegen_section!` when `cfg(not(doc))`. Makes progress towards #3079. gherrit-pr-id: Gcf771ba083fd307e788ceb72bc09085177a48ca8 --- src/util/macros.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/util/macros.rs b/src/util/macros.rs index 59fd2549d9..7dca5410c8 100644 --- a/src/util/macros.rs +++ b/src/util/macros.rs @@ -982,8 +982,37 @@ macro_rules! codegen_preamble { } } +/// Stub for rendering codegen documentation; used to break build dependency +/// between benches and zerocopy when re-blessing codegen tests. +#[allow(unused)] +#[cfg(not(doc))] +macro_rules! codegen_section { + ( + header = $level:expr, + bench = $bench:expr, + format = $format:expr, + arity = $arity:literal, + $([ + $($open:ident)? + @index $index:literal + @title $title:literal + @variant $variant:literal + ]),* + ) => { + "" + }; + ( + header = $level:expr, + bench = $bench:expr, + format = $format:expr, + ) => { + "" + }; +} + /// Generates the HTML for code generation documentation. #[allow(unused)] +#[cfg(doc)] macro_rules! codegen_section { ( header = $level:expr, From bfb7dc2a8ac66c093a63e890aed0c4f98496d2f5 Mon Sep 17 00:00:00 2001 From: Jack Wrenn Date: Sat, 14 Mar 2026 16:06:04 +0000 Subject: [PATCH 2/3] Optimize bounds checking by validating metadata against maximum Introduces `KnownLayout::is_valid_metadata` which produces `true` iff the given metadata can describe a valid allocation of `Self` by comparing the given metadata against the theoretical maximum for `Self`'s layout. 
The theoretical maximum metadata can be (and is, in practice) computed at compile time. We insert this check in the two critical places that bounds-check user-provided metadata: `new_box` and `validate_cast_and_convert_metadata`. For sized and simple dynamically sized types, this does not meaningfully impact codegen, as the optimizer was already able to deduce the maximum valid metadata. However, for dynamically padded types, this permits the compiler to use unchecked arithmetic, resulting in MCA cycle count reductions of as much as 44%. Makes progress towards #3079. gherrit-pr-id: Gcf66958135e905f7cd7d2fac87d9f881e5e5185f --- ...x_zeroed_with_elems_dynamic_padding.x86-64 | 32 ++-- ...roed_with_elems_dynamic_padding.x86-64.mca | 86 ++++------- ..._box_zeroed_with_elems_dynamic_size.x86-64 | 10 +- ..._zeroed_with_elems_dynamic_size.x86-64.mca | 32 ++-- benches/new_vec_zeroed.x86-64 | 62 ++++---- benches/new_vec_zeroed.x86-64.mca | 142 +++++++++--------- ...om_bytes_with_elems_dynamic_padding.x86-64 | 41 ++--- ...ytes_with_elems_dynamic_padding.x86-64.mca | 92 +++++------- ..._from_bytes_with_elems_dynamic_size.x86-64 | 2 +- ...m_bytes_with_elems_dynamic_size.x86-64.mca | 4 +- ...m_prefix_with_elems_dynamic_padding.x86-64 | 47 +++--- ...efix_with_elems_dynamic_padding.x86-64.mca | 106 ++++++------- ...from_prefix_with_elems_dynamic_size.x86-64 | 2 +- ..._prefix_with_elems_dynamic_size.x86-64.mca | 4 +- ...m_suffix_with_elems_dynamic_padding.x86-64 | 41 +++-- ...ffix_with_elems_dynamic_padding.x86-64.mca | 98 ++++++------ ...from_suffix_with_elems_dynamic_size.x86-64 | 2 +- ..._suffix_with_elems_dynamic_size.x86-64.mca | 4 +- ...om_bytes_with_elems_dynamic_padding.x86-64 | 49 +++--- ...ytes_with_elems_dynamic_padding.x86-64.mca | 106 +++++-------- ..._from_bytes_with_elems_dynamic_size.x86-64 | 2 +- ...m_bytes_with_elems_dynamic_size.x86-64.mca | 4 +- ...m_prefix_with_elems_dynamic_padding.x86-64 | 39 +++-- ...efix_with_elems_dynamic_padding.x86-64.mca | 96 ++++++------ 
...from_prefix_with_elems_dynamic_size.x86-64 | 2 +- ..._prefix_with_elems_dynamic_size.x86-64.mca | 4 +- ...m_suffix_with_elems_dynamic_padding.x86-64 | 49 +++--- ...ffix_with_elems_dynamic_padding.x86-64.mca | 114 ++++++-------- ...from_suffix_with_elems_dynamic_size.x86-64 | 2 +- ..._suffix_with_elems_dynamic_size.x86-64.mca | 4 +- src/byte_slice.rs | 2 +- src/layout.rs | 20 ++- src/lib.rs | 45 +++++- src/util/mod.rs | 58 +++---- 34 files changed, 631 insertions(+), 772 deletions(-) diff --git a/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 index 0ab9f379a2..22a8d048ce 100644 --- a/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 +++ b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64 @@ -3,32 +3,20 @@ bench_new_box_zeroed_with_elems_dynamic_padding: push rbx push rax mov rbx, rdi - mov ecx, 3 - mov rax, rdi - mul rcx - jo .LBB5_6 - mov r14, rax - cmp rax, -10 - ja .LBB5_6 - lea rax, [r14 + 9] - not r14d - and r14d, 3 - add r14, rax - setb al - movabs rcx, 9223372036854775803 - cmp r14, rcx - seta cl - or cl, al - je .LBB5_4 -.LBB5_6: - xor eax, eax - jmp .LBB5_5 -.LBB5_4: + movabs rax, 3074457345618258598 + cmp rdi, rax + ja .LBB5_1 + lea r14, [rbx + 2*rbx] + or r14, 3 + add r14, 9 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] mov esi, 4 mov rdi, r14 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] -.LBB5_5: + jmp .LBB5_3 +.LBB5_1: + xor eax, eax +.LBB5_3: mov rdx, rbx add rsp, 8 pop rbx diff --git a/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca index f666a03ce9..e6efaeded4 100644 --- a/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca +++ b/benches/new_box_zeroed_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3200 -Total Cycles: 2989 -Total uOps: 4300 +Instructions: 2100 +Total Cycles: 2990 +Total uOps: 3000 
Dispatch Width: 4 -uOps Per Cycle: 1.44 -IPC: 1.07 -Block RThroughput: 10.8 +uOps Per Cycle: 1.00 +IPC: 0.70 +Block RThroughput: 7.5 Instruction Info: @@ -22,29 +22,18 @@ Instruction Info: 2 5 1.00 * push rbx 2 5 1.00 * push rax 1 1 0.33 mov rbx, rdi - 1 1 0.33 mov ecx, 3 - 1 1 0.33 mov rax, rdi - 2 4 1.00 mul rcx - 1 1 1.00 jo .LBB5_6 - 1 1 0.33 mov r14, rax - 1 1 0.33 cmp rax, -10 - 1 1 1.00 ja .LBB5_6 - 1 1 0.50 lea rax, [r14 + 9] - 1 1 0.33 not r14d - 1 1 0.33 and r14d, 3 - 1 1 0.33 add r14, rax - 1 1 0.50 setb al - 1 1 0.33 movabs rcx, 9223372036854775803 - 1 1 0.33 cmp r14, rcx - 2 2 1.00 seta cl - 1 1 0.33 or cl, al - 1 1 1.00 je .LBB5_4 - 1 0 0.25 xor eax, eax - 1 1 1.00 jmp .LBB5_5 + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rdi, rax + 1 1 1.00 ja .LBB5_1 + 1 1 0.50 lea r14, [rbx + 2*rbx] + 1 1 0.33 or r14, 3 + 1 1 0.33 add r14, 9 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] 1 1 0.33 mov esi, 4 1 1 0.33 mov rdi, r14 4 7 1.00 * call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + 1 1 1.00 jmp .LBB5_3 + 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rbx 1 1 0.33 add rsp, 8 1 6 0.50 * pop rbx @@ -65,39 +54,28 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 8.99 8.98 5.00 10.03 4.49 4.51 + - - 4.49 4.50 5.00 6.01 4.50 4.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - 1.00 - 0.49 0.51 push r14 - - - - - 1.00 - 0.51 0.49 push rbx - - - - - 1.00 - 0.49 0.51 push rax - - - 0.95 0.04 - 0.01 - - mov rbx, rdi - - - - 0.97 - 0.03 - - mov ecx, 3 - - - 0.02 0.02 - 0.96 - - mov rax, rdi - - - 1.00 1.00 - - - - mul rcx - - - - - - 1.00 - - jo .LBB5_6 - - - 0.02 0.97 - 0.01 - - mov r14, rax - - - 0.97 0.03 - - - - cmp rax, -10 - - - - - - 1.00 - - ja .LBB5_6 - - - 0.99 0.01 - - - - lea rax, [r14 + 9] - - - 0.01 0.99 - - - - not r14d - - - 0.97 0.03 - - - - and r14d, 3 - - - 0.01 0.98 - 0.01 - - add r14, rax - - - 1.00 - - - - 
- setb al - - - 0.02 - - 0.98 - - movabs rcx, 9223372036854775803 - - - - 0.97 - 0.03 - - cmp r14, rcx - - - 2.00 - - - - - seta cl - - - 0.03 0.03 - 0.94 - - or cl, al - - - - - - 1.00 - - je .LBB5_4 + - - - - 1.00 - 0.50 0.50 push r14 + - - - - 1.00 - 0.50 0.50 push rbx + - - - - 1.00 - 0.50 0.50 push rax + - - 0.49 0.50 - 0.01 - - mov rbx, rdi + - - 0.50 0.50 - - - - movabs rax, 3074457345618258598 + - - 0.50 0.50 - - - - cmp rdi, rax + - - - - - 1.00 - - ja .LBB5_1 + - - 0.50 0.50 - - - - lea r14, [rbx + 2*rbx] + - - 0.50 0.50 - - - - or r14, 3 + - - 0.50 - - 0.50 - - add r14, 9 + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + - - - 0.50 - 0.50 - - mov esi, 4 + - - 0.50 0.50 - - - - mov rdi, r14 + - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + - - - - - 1.00 - - jmp .LBB5_3 - - - - - - - - xor eax, eax - - - - - - 1.00 - - jmp .LBB5_5 - - - - - 1.00 1.00 1.02 0.98 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] - - - 0.03 0.97 - - - - mov esi, 4 - - - 0.96 0.01 - 0.03 - - mov rdi, r14 - - - - - 1.00 1.00 0.98 1.02 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] - - - - 0.97 - 0.03 - - mov rdx, rbx - - - 0.01 0.99 - - - - add rsp, 8 + - - 0.51 0.49 - - - - mov rdx, rbx + - - 0.49 0.51 - - - - add rsp, 8 - - - - - - 0.50 0.50 pop rbx - - - - - - 0.50 0.50 pop r14 - - - - - 1.00 - - ret diff --git a/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 index 175fff0fd3..bff15e55ad 100644 --- a/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 +++ b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64 @@ -3,17 +3,17 @@ bench_new_box_zeroed_with_elems_dynamic_size: push rbx push rax mov rbx, rdi - movabs rax, 4611686018427387900 + movabs rax, 4611686018427387901 cmp rdi, rax - jbe .LBB5_2 - xor eax, eax - jmp .LBB5_3 -.LBB5_2: + ja .LBB5_1 lea r14, [2*rbx + 4] 
call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] mov esi, 2 mov rdi, r14 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + jmp .LBB5_3 +.LBB5_1: + xor eax, eax .LBB5_3: mov rdx, rbx add rsp, 8 diff --git a/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca index 88b5f84b98..153d36c01c 100644 --- a/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca +++ b/benches/new_box_zeroed_with_elems_dynamic_size.x86-64.mca @@ -22,16 +22,16 @@ Instruction Info: 2 5 1.00 * push rbx 2 5 1.00 * push rax 1 1 0.33 mov rbx, rdi - 1 1 0.33 movabs rax, 4611686018427387900 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdi, rax - 1 1 1.00 jbe .LBB5_2 - 1 0 0.25 xor eax, eax - 1 1 1.00 jmp .LBB5_3 + 1 1 1.00 ja .LBB5_1 1 1 0.50 lea r14, [2*rbx + 4] 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] 1 1 0.33 mov esi, 2 1 1 0.33 mov rdi, r14 4 7 1.00 * call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + 1 1 1.00 jmp .LBB5_3 + 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rbx 1 1 0.33 add rsp, 8 1 6 0.50 * pop rbx @@ -52,26 +52,26 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 3.97 3.97 5.00 5.06 4.50 4.50 + - - 3.97 3.98 5.00 5.05 4.50 4.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 1.00 - 0.50 0.50 push r14 - - - - 1.00 - 0.50 0.50 push rbx - - - - 1.00 - 0.50 0.50 push rax - - - 0.94 0.05 - 0.01 - - mov rbx, rdi - - - 0.05 0.95 - - - - movabs rax, 4611686018427387900 - - - 0.95 - - 0.05 - - cmp rdi, rax - - - - - - 1.00 - - jbe .LBB5_2 - - - - - - - - - xor eax, eax - - - - - - 1.00 - - jmp .LBB5_3 - - - - 1.00 - - - - lea r14, [2*rbx + 4] + - - 0.05 0.94 - 0.01 - - mov rbx, rdi + - - 0.94 0.06 - - - - movabs rax, 4611686018427387901 + - - 0.06 0.94 - - - - cmp rdi, rax + - - - - - 1.00 - - ja .LBB5_1 + - - 0.94 0.06 - - - - lea r14, 
[2*rbx + 4] - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] - - - 0.06 0.94 - - - - mov esi, 2 - - - 0.94 0.06 - - - - mov rdi, r14 + - - 0.98 0.02 - - - - mov esi, 2 + - - 0.02 0.94 - 0.04 - - mov rdi, r14 - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] - - - 0.05 0.95 - - - - mov rdx, rbx - - - 0.98 0.02 - - - - add rsp, 8 + - - - - - 1.00 - - jmp .LBB5_3 + - - - - - - - - xor eax, eax + - - 0.94 0.06 - - - - mov rdx, rbx + - - 0.04 0.96 - - - - add rsp, 8 - - - - - - 0.50 0.50 pop rbx - - - - - - 0.50 0.50 pop r14 - - - - - 1.00 - - ret diff --git a/benches/new_vec_zeroed.x86-64 b/benches/new_vec_zeroed.x86-64 index 264fa4a852..b5c083aa0d 100644 --- a/benches/new_vec_zeroed.x86-64 +++ b/benches/new_vec_zeroed.x86-64 @@ -1,44 +1,40 @@ bench_new_vec_zeroed: + mov rax, rdi + movabs rcx, 1537228672809129301 + cmp rsi, rcx + ja .LBB5_5 + test rsi, rsi + je .LBB5_2 push r15 push r14 - push r12 push rbx - push rax - mov rbx, rdi - movabs r12, 9223372036854775805 - mov ecx, 6 - mov rax, rsi - mul rcx - jo .LBB5_6 - cmp rax, r12 - jbe .LBB5_2 -.LBB5_6: - add r12, 3 - mov qword ptr [rbx], r12 - jmp .LBB5_7 -.LBB5_2: - mov r14, rsi - test rax, rax - je .LBB5_3 - mov r15, rax + lea rcx, [rsi + rsi] + lea rbx, [rcx + 2*rcx] + mov r14, rax + mov r15, rsi call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] mov esi, 2 - mov rdi, r15 + mov rdi, rbx call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] - test rax, rax - jne .LBB5_5 - jmp .LBB5_6 -.LBB5_3: - mov eax, 2 -.LBB5_5: - mov qword ptr [rbx], r14 - mov qword ptr [rbx + 8], rax - mov qword ptr [rbx + 16], r14 -.LBB5_7: - mov rax, rbx - add rsp, 8 + mov rsi, r15 + mov rcx, rax + mov rax, r14 + test rcx, rcx pop rbx - pop r12 pop r14 pop r15 + je .LBB5_5 + mov qword ptr [rax], rsi + mov qword ptr [rax + 8], rcx + mov qword ptr [rax + 16], rsi + ret +.LBB5_5: + movabs rcx, -9223372036854775808 + 
mov qword ptr [rax], rcx + ret +.LBB5_2: + mov ecx, 2 + mov qword ptr [rax], rsi + mov qword ptr [rax + 8], rcx + mov qword ptr [rax + 16], rsi ret diff --git a/benches/new_vec_zeroed.x86-64.mca b/benches/new_vec_zeroed.x86-64.mca index 093bbde096..b4fb4544ec 100644 --- a/benches/new_vec_zeroed.x86-64.mca +++ b/benches/new_vec_zeroed.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3800 -Total Cycles: 5277 -Total uOps: 5000 +Instructions: 3700 +Total Cycles: 3486 +Total uOps: 4600 Dispatch Width: 4 -uOps Per Cycle: 0.95 -IPC: 0.72 -Block RThroughput: 12.5 +uOps Per Cycle: 1.32 +IPC: 1.06 +Block RThroughput: 12.0 Instruction Info: @@ -18,43 +18,42 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.33 mov rax, rdi + 1 1 0.33 movabs rcx, 1537228672809129301 + 1 1 0.33 cmp rsi, rcx + 1 1 1.00 ja .LBB5_5 + 1 1 0.33 test rsi, rsi + 1 1 1.00 je .LBB5_2 2 5 1.00 * push r15 2 5 1.00 * push r14 - 2 5 1.00 * push r12 2 5 1.00 * push rbx - 2 5 1.00 * push rax - 1 1 0.33 mov rbx, rdi - 1 1 0.33 movabs r12, 9223372036854775805 - 1 1 0.33 mov ecx, 6 - 1 1 0.33 mov rax, rsi - 2 4 1.00 mul rcx - 1 1 1.00 jo .LBB5_6 - 1 1 0.33 cmp rax, r12 - 1 1 1.00 jbe .LBB5_2 - 1 1 0.33 add r12, 3 - 1 1 1.00 * mov qword ptr [rbx], r12 - 1 1 1.00 jmp .LBB5_7 - 1 1 0.33 mov r14, rsi - 1 1 0.33 test rax, rax - 1 1 1.00 je .LBB5_3 - 1 1 0.33 mov r15, rax + 1 1 0.50 lea rcx, [rsi + rsi] + 1 1 0.50 lea rbx, [rcx + 2*rcx] + 1 1 0.33 mov r14, rax + 1 1 0.33 mov r15, rsi 4 7 1.00 * call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] 1 1 0.33 mov esi, 2 - 1 1 0.33 mov rdi, r15 + 1 1 0.33 mov rdi, rbx 4 7 1.00 * call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] - 1 1 0.33 test rax, rax - 1 1 1.00 jne .LBB5_5 - 1 1 1.00 jmp .LBB5_6 - 1 1 0.33 mov eax, 2 - 1 1 1.00 * mov qword ptr [rbx], r14 - 1 1 1.00 * mov qword ptr [rbx + 8], rax - 1 1 1.00 * mov qword ptr [rbx + 16], r14 - 1 1 0.33 mov rax, rbx - 1 1 0.33 add rsp, 8 + 1 
1 0.33 mov rsi, r15 + 1 1 0.33 mov rcx, rax + 1 1 0.33 mov rax, r14 + 1 1 0.33 test rcx, rcx 1 6 0.50 * pop rbx - 1 6 0.50 * pop r12 1 6 0.50 * pop r14 1 6 0.50 * pop r15 + 1 1 1.00 je .LBB5_5 + 1 1 1.00 * mov qword ptr [rax], rsi + 1 1 1.00 * mov qword ptr [rax + 8], rcx + 1 1 1.00 * mov qword ptr [rax + 16], rsi + 1 1 1.00 U ret + 1 1 0.33 movabs rcx, -9223372036854775808 + 1 1 1.00 * mov qword ptr [rax], rcx + 1 1 1.00 U ret + 1 1 0.33 mov ecx, 2 + 1 1 1.00 * mov qword ptr [rax], rsi + 1 1 1.00 * mov qword ptr [rax + 8], rcx + 1 1 1.00 * mov qword ptr [rax + 16], rsi 1 1 1.00 U ret @@ -71,45 +70,44 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 7.01 7.98 11.00 11.01 8.50 8.50 + - - 6.99 6.99 12.00 10.02 8.00 9.00 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - 1.00 - 0.50 0.50 push r15 - - - - - 1.00 - 0.50 0.50 push r14 - - - - - 1.00 - 0.50 0.50 push r12 - - - - - 1.00 - 0.50 0.50 push rbx - - - - - 1.00 - 0.50 0.50 push rax - - - 0.98 0.01 - 0.01 - - mov rbx, rdi - - - 0.01 0.99 - - - - movabs r12, 9223372036854775805 - - - 0.02 - - 0.98 - - mov ecx, 6 - - - - 0.98 - 0.02 - - mov rax, rsi - - - 1.00 1.00 - - - - mul rcx - - - - - - 1.00 - - jo .LBB5_6 - - - 0.99 0.01 - - - - cmp rax, r12 - - - - - - 1.00 - - jbe .LBB5_2 - - - - - - 1.00 - - add r12, 3 - - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx], r12 - - - - - - 1.00 - - jmp .LBB5_7 - - - 0.98 0.02 - - - - mov r14, rsi - - - 0.01 0.99 - - - - test rax, rax - - - - - - 1.00 - - je .LBB5_3 - - - 0.99 0.01 - - - - mov r15, rax - - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] - - - 0.01 0.99 - - - - mov esi, 2 - - - 0.99 0.01 - - - - mov rdi, r15 - - - - - 1.00 1.00 1.00 1.00 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] - - - 0.01 0.99 - - - - test rax, rax - - - - - - 1.00 - - jne .LBB5_5 - - - - - - 1.00 - - jmp .LBB5_6 - - - 0.02 0.98 - - - - mov 
eax, 2 - - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx], r14 - - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx + 8], rax - - - - - 1.00 - 0.50 0.50 mov qword ptr [rbx + 16], r14 - - - 0.97 0.03 - - - - mov rax, rbx - - - 0.03 0.97 - - - - add rsp, 8 - - - - - - - 0.50 0.50 pop rbx - - - - - - - 0.50 0.50 pop r12 - - - - - - - 0.50 0.50 pop r14 - - - - - - - 0.50 0.50 pop r15 + - - 0.01 0.98 - 0.01 - - mov rax, rdi + - - 0.98 0.02 - - - - movabs rcx, 1537228672809129301 + - - 0.02 0.98 - - - - cmp rsi, rcx + - - - - - 1.00 - - ja .LBB5_5 + - - 0.98 - - 0.02 - - test rsi, rsi + - - - - - 1.00 - - je .LBB5_2 + - - - - 1.00 - - 1.00 push r15 + - - - - 1.00 - 1.00 - push r14 + - - - - 1.00 - - 1.00 push rbx + - - - 1.00 - - - - lea rcx, [rsi + rsi] + - - - 1.00 - - - - lea rbx, [rcx + 2*rcx] + - - 1.00 - - - - - mov r14, rax + - - 1.00 - - - - - mov r15, rsi + - - - - 1.00 1.00 2.00 - call qword ptr [rip + __rustc::__rust_no_alloc_shim_is_unstable_v2@GOTPCREL] + - - - 0.01 - 0.99 - - mov esi, 2 + - - 0.01 0.99 - - - - mov rdi, rbx + - - - - 1.00 1.00 - 2.00 call qword ptr [rip + __rustc::__rust_alloc_zeroed@GOTPCREL] + - - 0.01 - - 0.99 - - mov rsi, r15 + - - 0.99 0.01 - - - - mov rcx, rax + - - - 0.99 - 0.01 - - mov rax, r14 + - - 0.99 0.01 - - - - test rcx, rcx + - - - - - - - 1.00 pop rbx + - - - - - - 1.00 - pop r14 + - - - - - - - 1.00 pop r15 + - - - - - 1.00 - - je .LBB5_5 + - - - - 1.00 - 1.00 - mov qword ptr [rax], rsi + - - - - 1.00 - - 1.00 mov qword ptr [rax + 8], rcx + - - - - 1.00 - 1.00 - mov qword ptr [rax + 16], rsi + - - - - - 1.00 - - ret + - - 0.01 0.99 - - - - movabs rcx, -9223372036854775808 + - - - - 1.00 - - 1.00 mov qword ptr [rax], rcx + - - - - - 1.00 - - ret + - - 0.99 0.01 - - - - mov ecx, 2 + - - - - 1.00 - 1.00 - mov qword ptr [rax], rsi + - - - - 1.00 - - 1.00 mov qword ptr [rax + 8], rcx + - - - - 1.00 - 1.00 - mov qword ptr [rax + 16], rsi - - - - - 1.00 - - ret diff --git a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 
b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 index ba9e1a2c78..d579b3faef 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 +++ b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 @@ -1,30 +1,19 @@ bench_ref_from_bytes_with_elems_dynamic_padding: - mov rcx, rdx - mov edx, 3 - mov rax, rcx - mul rdx - jo .LBB5_5 - cmp rax, -10 - ja .LBB5_5 - mov edx, eax - not edx - and edx, 3 - add rdx, rax - add rdx, 9 - cmp rsi, rdx - jne .LBB5_5 - mov r8d, edi - and r8d, 3 - jne .LBB5_5 - add rax, 9 + movabs rax, 3074457345618258598 cmp rdx, rax - jb .LBB5_5 + seta cl mov rax, rdi - mov rdx, rcx - ret -.LBB5_5: - xor edi, edi - mov rcx, rsi - mov rax, rdi - mov rdx, rcx + test al, 3 + setne dil + or dil, cl + jne .LBB5_2 + lea rcx, [rdx + 2*rdx] + or rcx, 3 + add rcx, 9 + cmp rsi, rcx + je .LBB5_3 +.LBB5_2: + xor eax, eax + mov rdx, rsi +.LBB5_3: ret diff --git a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca index 93696305cb..ea2d83dbd1 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca +++ b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2800 -Total Cycles: 944 -Total uOps: 2900 +Instructions: 1600 +Total Cycles: 539 +Total uOps: 1700 Dispatch Width: 4 -uOps Per Cycle: 3.07 +uOps Per Cycle: 3.15 IPC: 2.97 -Block RThroughput: 7.3 +Block RThroughput: 4.3 Instruction Info: @@ -18,33 +18,21 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov edx, 3 - 1 1 0.33 mov rax, rcx - 2 4 1.00 mul rdx - 1 1 1.00 jo .LBB5_5 - 1 1 0.33 cmp rax, -10 - 1 1 1.00 ja .LBB5_5 - 1 1 0.33 mov edx, eax - 1 1 0.33 not edx - 1 1 0.33 and edx, 3 - 1 1 0.33 add rdx, rax - 1 1 0.33 add rdx, 9 - 1 1 0.33 cmp rsi, rdx - 1 1 1.00 jne .LBB5_5 - 1 1 0.33 mov r8d, edi - 1 1 0.33 and r8d, 3 - 1 1 1.00 jne .LBB5_5 - 1 1 0.33 add rax, 9 + 1 1 0.33 
movabs rax, 3074457345618258598 1 1 0.33 cmp rdx, rax - 1 1 1.00 jb .LBB5_5 + 2 2 1.00 seta cl 1 1 0.33 mov rax, rdi - 1 1 0.33 mov rdx, rcx - 1 1 1.00 U ret - 1 0 0.25 xor edi, edi - 1 1 0.33 mov rcx, rsi - 1 1 0.33 mov rax, rdi - 1 1 0.33 mov rdx, rcx + 1 1 0.33 test al, 3 + 1 1 0.50 setne dil + 1 1 0.33 or dil, cl + 1 1 1.00 jne .LBB5_2 + 1 1 0.50 lea rcx, [rdx + 2*rdx] + 1 1 0.33 or rcx, 3 + 1 1 0.33 add rcx, 9 + 1 1 0.33 cmp rsi, rcx + 1 1 1.00 je .LBB5_3 + 1 0 0.25 xor eax, eax + 1 1 0.33 mov rdx, rsi 1 1 1.00 U ret @@ -61,35 +49,23 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 9.32 9.32 - 9.36 - - + - - 5.33 5.32 - 5.35 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.64 0.18 - 0.18 - - mov rcx, rdx - - - 0.17 0.83 - - - - mov edx, 3 - - - 0.50 0.49 - 0.01 - - mov rax, rcx - - - 1.00 1.00 - - - - mul rdx - - - - - - 1.00 - - jo .LBB5_5 - - - 0.82 0.18 - - - - cmp rax, -10 - - - - - - 1.00 - - ja .LBB5_5 - - - 0.02 0.98 - - - - mov edx, eax - - - 0.82 0.02 - 0.16 - - not edx - - - 0.82 0.17 - 0.01 - - and edx, 3 - - - 0.99 - - 0.01 - - add rdx, rax - - - 0.98 0.01 - 0.01 - - add rdx, 9 - - - 1.00 - - - - - cmp rsi, rdx - - - - - - 1.00 - - jne .LBB5_5 - - - 0.16 0.83 - 0.01 - - mov r8d, edi - - - 0.17 0.17 - 0.66 - - and r8d, 3 - - - - - - 1.00 - - jne .LBB5_5 - - - 0.02 0.98 - - - - add rax, 9 - - - - 0.17 - 0.83 - - cmp rdx, rax - - - - - - 1.00 - - jb .LBB5_5 - - - 0.01 0.67 - 0.32 - - mov rax, rdi - - - 0.02 0.98 - - - - mov rdx, rcx - - - - - - 1.00 - - ret - - - - - - - - - xor edi, edi - - - 0.34 0.66 - - - - mov rcx, rsi - - - 0.34 0.50 - 0.16 - - mov rax, rdi - - - 0.50 0.50 - - - - mov rdx, rcx + - - 0.01 0.98 - 0.01 - - movabs rax, 3074457345618258598 + - - - 1.00 - - - - cmp rdx, rax + - - 1.98 - - 0.02 - - seta cl + - - 0.02 0.98 - - - - mov rax, rdi + - - - 0.67 - 0.33 - - test al, 3 + - - 0.67 - - 0.33 - - setne dil + - - 0.99 - - 0.01 - - or dil, cl + - - 
- - - 1.00 - - jne .LBB5_2 + - - 0.01 0.99 - - - - lea rcx, [rdx + 2*rdx] + - - - 0.01 - 0.99 - - or rcx, 3 + - - 0.65 0.02 - 0.33 - - add rcx, 9 + - - 0.99 0.01 - - - - cmp rsi, rcx + - - - - - 1.00 - - je .LBB5_3 + - - - - - - - - xor eax, eax + - - 0.01 0.66 - 0.33 - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 index 6aaff6d066..3d8d15b7f6 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 +++ b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 @@ -1,5 +1,5 @@ bench_ref_from_bytes_with_elems_dynamic_size: - movabs rax, 9223372036854775805 + movabs rax, 4611686018427387901 cmp rdx, rax seta cl mov rax, rdi diff --git a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca index 4a67974f1a..602179f3c9 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca +++ b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca @@ -18,7 +18,7 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 9223372036854775805 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdx, rax 2 2 1.00 seta cl 1 1 0.33 mov rax, rdi @@ -50,7 +50,7 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 9223372036854775805 + - - - 0.99 - 0.01 - - movabs rax, 4611686018427387901 - - 0.33 0.67 - - - - cmp rdx, rax - - 1.98 - - 0.02 - - seta cl - - 0.01 0.99 - - - - mov rax, rdi diff --git a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 index c03811bdbe..5b31277bde 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 +++ b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 @@ -1,35 +1,26 @@ 
bench_ref_from_prefix_with_elems_dynamic_padding: - mov rcx, rdx - mov edx, 3 - mov rax, rcx - mul rdx + movabs rax, 3074457345618258598 + cmp rdx, rax + ja .LBB5_1 + xor ecx, ecx + mov eax, 0 + test dil, 3 + je .LBB5_3 + mov rdx, rcx + ret +.LBB5_1: mov edx, 1 - jo .LBB5_5 - cmp rax, -10 - ja .LBB5_5 - lea r8, [rax + 9] - not eax - and eax, 3 - add rax, r8 - jae .LBB5_3 -.LBB5_5: - xor r8d, r8d - mov rax, r8 + xor eax, eax ret .LBB5_3: - xor edx, edx - mov r8d, 0 - test dil, 3 - je .LBB5_4 - mov rax, r8 - ret -.LBB5_4: - xor edx, edx + lea rax, [rdx + 2*rdx] + or rax, 3 + add rax, 9 + xor r8d, r8d cmp rax, rsi - mov eax, 1 - cmova rcx, rax - cmova rdi, rdx + mov ecx, 1 + cmovbe rcx, rdx + cmova rdi, r8 + mov rax, rdi mov rdx, rcx - mov r8, rdi - mov rax, r8 ret diff --git a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca index 6a3968fe9e..2f212ec6d0 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3100 -Total Cycles: 1110 -Total uOps: 3600 +Instructions: 2300 +Total Cycles: 807 +Total uOps: 2700 Dispatch Width: 4 -uOps Per Cycle: 3.24 -IPC: 2.79 -Block RThroughput: 9.0 +uOps Per Cycle: 3.35 +IPC: 2.85 +Block RThroughput: 6.8 Instruction Info: @@ -18,36 +18,28 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov edx, 3 - 1 1 0.33 mov rax, rcx - 2 4 1.00 mul rdx - 1 1 0.33 mov edx, 1 - 1 1 1.00 jo .LBB5_5 - 1 1 0.33 cmp rax, -10 - 1 1 1.00 ja .LBB5_5 - 1 1 0.50 lea r8, [rax + 9] - 1 1 0.33 not eax - 1 1 0.33 and eax, 3 - 1 1 0.33 add rax, r8 - 1 1 1.00 jae .LBB5_3 - 1 0 0.25 xor r8d, r8d - 1 1 0.33 mov rax, r8 - 1 1 1.00 U ret - 1 0 0.25 xor edx, edx - 1 1 0.33 mov r8d, 0 + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rdx, rax + 1 1 1.00 ja .LBB5_1 + 1 0 0.25 xor ecx, 
ecx + 1 1 0.33 mov eax, 0 1 1 0.33 test dil, 3 - 1 1 1.00 je .LBB5_4 - 1 1 0.33 mov rax, r8 + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov rdx, rcx 1 1 1.00 U ret - 1 0 0.25 xor edx, edx + 1 1 0.33 mov edx, 1 + 1 0 0.25 xor eax, eax + 1 1 1.00 U ret + 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 or rax, 3 + 1 1 0.33 add rax, 9 + 1 0 0.25 xor r8d, r8d 1 1 0.33 cmp rax, rsi - 1 1 0.33 mov eax, 1 - 3 3 1.00 cmova rcx, rax - 3 3 1.00 cmova rdi, rdx + 1 1 0.33 mov ecx, 1 + 3 3 1.00 cmovbe rcx, rdx + 3 3 1.00 cmova rdi, r8 + 1 1 0.33 mov rax, rdi 1 1 0.33 mov rdx, rcx - 1 1 0.33 mov r8, rdi - 1 1 0.33 mov rax, r8 1 1 1.00 U ret @@ -64,38 +56,30 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 11.01 10.98 - 11.01 - - + - - 7.99 7.99 - 8.02 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.48 0.50 - 0.02 - - mov rcx, rdx - - - 0.02 0.98 - - - - mov edx, 3 - - - 0.51 0.48 - 0.01 - - mov rax, rcx - - - 1.00 1.00 - - - - mul rdx - - - 0.49 0.50 - 0.01 - - mov edx, 1 - - - - - - 1.00 - - jo .LBB5_5 - - - 0.98 0.02 - - - - cmp rax, -10 - - - - - - 1.00 - - ja .LBB5_5 - - - 0.02 0.98 - - - - lea r8, [rax + 9] - - - 0.98 0.02 - - - - not eax - - - 0.99 0.01 - - - - and eax, 3 - - - 0.98 0.01 - 0.01 - - add rax, r8 - - - - - - 1.00 - - jae .LBB5_3 - - - - - - - - - xor r8d, r8d - - - 0.01 0.98 - 0.01 - - mov rax, r8 + - - 0.47 0.52 - 0.01 - - movabs rax, 3074457345618258598 + - - 0.94 0.01 - 0.05 - - cmp rdx, rax + - - - - - 1.00 - - ja .LBB5_1 + - - - - - - - - xor ecx, ecx + - - 0.03 0.97 - - - - mov eax, 0 + - - 0.01 0.52 - 0.47 - - test dil, 3 + - - - - - 1.00 - - je .LBB5_3 + - - 0.03 0.51 - 0.46 - - mov rdx, rcx - - - - - 1.00 - - ret - - - - - - - - - xor edx, edx - - - 0.48 0.52 - - - - mov r8d, 0 - - - 0.02 0.97 - 0.01 - - test dil, 3 - - - - - - 1.00 - - je .LBB5_4 - - - 0.49 0.50 - 0.01 - - mov rax, r8 + - - 0.04 0.96 - - - - mov edx, 1 + - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - - 
- - - - - xor edx, edx - - - 0.51 0.49 - - - - cmp rax, rsi - - - - 1.00 - - - - mov eax, 1 - - - 1.04 0.97 - 0.99 - - cmova rcx, rax - - - 0.98 0.53 - 1.49 - - cmova rdi, rdx - - - 0.50 0.50 - - - - mov rdx, rcx - - - 0.51 0.01 - 0.48 - - mov r8, rdi - - - 0.02 0.01 - 0.97 - - mov rax, r8 + - - 0.01 0.99 - - - - lea rax, [rdx + 2*rdx] + - - 0.52 0.48 - - - - or rax, 3 + - - 0.51 0.49 - - - - add rax, 9 + - - - - - - - - xor r8d, r8d + - - 0.97 0.03 - - - - cmp rax, rsi + - - 0.01 0.99 - - - - mov ecx, 1 + - - 1.04 0.97 - 0.99 - - cmovbe rcx, rdx + - - 1.44 0.54 - 1.02 - - cmova rdi, r8 + - - 0.97 0.01 - 0.02 - - mov rax, rdi + - - 1.00 - - - - - mov rdx, rcx - - - - - 1.00 - - ret diff --git a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 index 05818b0633..069fd4859c 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 +++ b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 @@ -1,5 +1,5 @@ bench_ref_from_prefix_with_elems_dynamic_size: - movabs rax, 9223372036854775805 + movabs rax, 4611686018427387901 cmp rdx, rax ja .LBB5_1 mov rcx, rdx diff --git a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca index 94c718e22c..6f22726406 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca +++ b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca @@ -18,7 +18,7 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 9223372036854775805 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdx, rax 1 1 1.00 ja .LBB5_1 1 1 0.33 mov rcx, rdx @@ -56,7 +56,7 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 9223372036854775805 + - - - 0.99 - 0.01 - - movabs rax, 4611686018427387901 - - 0.37 0.63 - - - - cmp rdx, rax - - - - - 1.00 - - ja 
.LBB5_1 - - 0.63 0.37 - - - - mov rcx, rdx diff --git a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 index b3e239cb75..c3d10b5fc6 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 +++ b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 @@ -1,34 +1,27 @@ bench_ref_from_suffix_with_elems_dynamic_padding: - mov rcx, rdx - mov edx, 3 - mov rax, rcx - mul rdx - jo .LBB5_1 - cmp rax, -10 + movabs rax, 3074457345618258598 + cmp rdx, rax ja .LBB5_1 - lea rdx, [rax + 9] - not eax - and eax, 3 - add rax, rdx + lea r8d, [rsi + rdi] + xor ecx, ecx + mov eax, 0 + test r8b, 3 + je .LBB5_3 + mov rdx, rcx + ret +.LBB5_3: + lea rax, [rdx + 2*rdx] + or rax, 3 + add rax, 9 + sub rsi, rax jae .LBB5_4 .LBB5_1: - xor r8d, r8d + xor eax, eax mov edx, 1 - mov rax, r8 ret .LBB5_4: - lea r9d, [rsi + rdi] - xor edx, edx - mov r8d, 0 - test r9b, 3 - je .LBB5_5 - mov rax, r8 - ret -.LBB5_5: - sub rsi, rax - jb .LBB5_1 add rdi, rsi + mov rcx, rdx + mov rax, rdi mov rdx, rcx - mov r8, rdi - mov rax, r8 ret diff --git a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca index c7c3c7ec2b..92e6280bb4 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3000 -Total Cycles: 973 -Total uOps: 3100 +Instructions: 2300 +Total Cycles: 706 +Total uOps: 2300 Dispatch Width: 4 -uOps Per Cycle: 3.19 -IPC: 3.08 -Block RThroughput: 8.0 +uOps Per Cycle: 3.26 +IPC: 3.26 +Block RThroughput: 6.0 Instruction Info: @@ -18,35 +18,28 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov edx, 3 - 1 1 0.33 mov rax, rcx - 2 4 1.00 mul rdx - 1 1 1.00 jo .LBB5_1 - 1 1 0.33 cmp rax, -10 + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rdx, 
rax 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea rdx, [rax + 9] - 1 1 0.33 not eax - 1 1 0.33 and eax, 3 - 1 1 0.33 add rax, rdx + 1 1 0.50 lea r8d, [rsi + rdi] + 1 0 0.25 xor ecx, ecx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test r8b, 3 + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov rdx, rcx + 1 1 1.00 U ret + 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 or rax, 3 + 1 1 0.33 add rax, 9 + 1 1 0.33 sub rsi, rax 1 1 1.00 jae .LBB5_4 - 1 0 0.25 xor r8d, r8d + 1 0 0.25 xor eax, eax 1 1 0.33 mov edx, 1 - 1 1 0.33 mov rax, r8 - 1 1 1.00 U ret - 1 1 0.50 lea r9d, [rsi + rdi] - 1 0 0.25 xor edx, edx - 1 1 0.33 mov r8d, 0 - 1 1 0.33 test r9b, 3 - 1 1 1.00 je .LBB5_5 - 1 1 0.33 mov rax, r8 1 1 1.00 U ret - 1 1 0.33 sub rsi, rax - 1 1 1.00 jb .LBB5_1 1 1 0.33 add rdi, rsi + 1 1 0.33 mov rcx, rdx + 1 1 0.33 mov rax, rdi 1 1 0.33 mov rdx, rcx - 1 1 0.33 mov r8, rdi - 1 1 0.33 mov rax, r8 1 1 1.00 U ret @@ -63,37 +56,30 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 9.66 9.66 - 9.68 - - + - - 6.99 7.00 - 7.01 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - mov rcx, rdx - - - 0.66 0.34 - - - - mov edx, 3 - - - 0.34 0.66 - - - - mov rax, rcx - - - 1.00 1.00 - - - - mul rdx - - - - - - 1.00 - - jo .LBB5_1 - - - 1.00 - - - - - cmp rax, -10 + - - - 0.99 - 0.01 - - movabs rax, 3074457345618258598 + - - 0.01 0.50 - 0.49 - - cmp rdx, rax - - - - - 1.00 - - ja .LBB5_1 - - - - 1.00 - - - - lea rdx, [rax + 9] - - - 1.00 - - - - - not eax - - - 1.00 - - - - - and eax, 3 - - - 1.00 - - - - - add rax, rdx - - - - - - 1.00 - - jae .LBB5_4 - - - - - - - - - xor r8d, r8d - - - 0.33 0.33 - 0.34 - - mov edx, 1 - - - 0.33 - - 0.67 - - mov rax, r8 + - - - 1.00 - - - - lea r8d, [rsi + rdi] + - - - - - - - - xor ecx, ecx + - - 0.50 0.49 - 0.01 - - mov eax, 0 + - - 0.49 0.51 - - - - test r8b, 3 + - - - - - 1.00 - - je .LBB5_3 + - - 0.51 0.49 - - - - mov rdx, rcx - - - - - 1.00 - - ret - - - 0.33 0.67 - - - - lea r9d, [rsi + 
rdi] - - - - - - - - - xor edx, edx - - - 0.67 0.33 - - - - mov r8d, 0 - - - 0.33 0.34 - 0.33 - - test r9b, 3 - - - - - - 1.00 - - je .LBB5_5 - - - 0.66 0.01 - 0.33 - - mov rax, r8 + - - 0.50 0.50 - - - - lea rax, [rdx + 2*rdx] + - - 1.00 - - - - - or rax, 3 + - - 1.00 - - - - - add rax, 9 + - - 0.99 0.01 - - - - sub rsi, rax + - - - - - 1.00 - - jae .LBB5_4 + - - - - - - - - xor eax, eax + - - - 1.00 - - - - mov edx, 1 - - - - - 1.00 - - ret - - - 0.33 0.67 - - - - sub rsi, rax - - - - - - 1.00 - - jb .LBB5_1 - - - - 1.00 - - - - add rdi, rsi - - - 0.01 0.99 - - - - mov rdx, rcx - - - - 1.00 - - - - mov r8, rdi - - - 0.67 0.33 - - - - mov rax, r8 + - - 1.00 - - - - - add rdi, rsi + - - - 1.00 - - - - mov rcx, rdx + - - 0.99 0.01 - - - - mov rax, rdi + - - - 0.50 - 0.50 - - mov rdx, rcx - - - - - 1.00 - - ret diff --git a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 index 5b401e7ca1..bdca571924 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 +++ b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 @@ -1,5 +1,5 @@ bench_ref_from_suffix_with_elems_dynamic_size: - movabs rax, 9223372036854775805 + movabs rax, 4611686018427387901 cmp rdx, rax ja .LBB5_1 lea r8d, [rsi + rdi] diff --git a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca index eac400f3f4..6d9de0b3eb 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca +++ b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca @@ -18,7 +18,7 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 9223372036854775805 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdx, rax 1 1 1.00 ja .LBB5_1 1 1 0.50 lea r8d, [rsi + rdi] @@ -56,7 +56,7 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.66 0.33 - 0.01 - - 
movabs rax, 9223372036854775805 + - - 0.66 0.33 - 0.01 - - movabs rax, 4611686018427387901 - - 0.01 0.99 - - - - cmp rdx, rax - - - - - 1.00 - - ja .LBB5_1 - - 0.99 0.01 - - - - lea r8d, [rsi + rdi] diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 index f8a719dd10..3ef8d1448a 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 +++ b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 @@ -1,36 +1,21 @@ bench_try_ref_from_bytes_with_elems_dynamic_padding: - mov rcx, rdx - mov edx, 3 - mov rax, rcx - mul rdx - jo .LBB5_8 - mov rdx, rax - cmp rax, -10 - ja .LBB5_8 - mov eax, edx - not eax - and eax, 3 - lea r8, [rax + rdx] - add r8, 9 - xor eax, eax - cmp rsi, r8 - jne .LBB5_6 - mov r9d, edi - and r9d, 3 - jne .LBB5_6 - add rdx, 9 - cmp r8, rdx - jb .LBB5_6 - movzx edx, word ptr [rdi] - cmp dx, -16192 - cmove rsi, rcx - xor eax, eax - cmp edx, 49344 - cmove rax, rdi -.LBB5_6: - mov rdx, rsi - ret -.LBB5_8: + movabs rax, 3074457345618258598 + cmp rdx, rax + seta cl + mov rax, rdi + test al, 3 + setne dil + or dil, cl + jne .LBB5_3 + lea rcx, [rdx + 2*rdx] + or rcx, 3 + add rcx, 9 + cmp rsi, rcx + jne .LBB5_3 + cmp word ptr [rax], -16192 + je .LBB5_4 +.LBB5_3: xor eax, eax mov rdx, rsi +.LBB5_4: ret diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca index bc48088077..8131f3bd54 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3300 -Total Cycles: 1082 -Total uOps: 3600 +Instructions: 1800 +Total Cycles: 607 +Total uOps: 2000 Dispatch Width: 4 -uOps Per Cycle: 3.33 -IPC: 3.05 -Block RThroughput: 9.0 +uOps Per Cycle: 3.29 +IPC: 2.97 +Block RThroughput: 5.0 Instruction Info: @@ -18,36 +18,21 @@ Instruction Info: [6]: 
HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov edx, 3 - 1 1 0.33 mov rax, rcx - 2 4 1.00 mul rdx - 1 1 1.00 jo .LBB5_8 - 1 1 0.33 mov rdx, rax - 1 1 0.33 cmp rax, -10 - 1 1 1.00 ja .LBB5_8 - 1 1 0.33 mov eax, edx - 1 1 0.33 not eax - 1 1 0.33 and eax, 3 - 1 1 0.50 lea r8, [rax + rdx] - 1 1 0.33 add r8, 9 - 1 0 0.25 xor eax, eax - 1 1 0.33 cmp rsi, r8 - 1 1 1.00 jne .LBB5_6 - 1 1 0.33 mov r9d, edi - 1 1 0.33 and r9d, 3 - 1 1 1.00 jne .LBB5_6 - 1 1 0.33 add rdx, 9 - 1 1 0.33 cmp r8, rdx - 1 1 1.00 jb .LBB5_6 - 1 5 0.50 * movzx edx, word ptr [rdi] - 1 1 0.33 cmp dx, -16192 - 2 2 0.67 cmove rsi, rcx - 1 0 0.25 xor eax, eax - 1 1 0.33 cmp edx, 49344 - 2 2 0.67 cmove rax, rdi - 1 1 0.33 mov rdx, rsi - 1 1 1.00 U ret + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rdx, rax + 2 2 1.00 seta cl + 1 1 0.33 mov rax, rdi + 1 1 0.33 test al, 3 + 1 1 0.50 setne dil + 1 1 0.33 or dil, cl + 1 1 1.00 jne .LBB5_3 + 1 1 0.50 lea rcx, [rdx + 2*rdx] + 1 1 0.33 or rcx, 3 + 1 1 0.33 add rcx, 9 + 1 1 0.33 cmp rsi, rcx + 1 1 1.00 jne .LBB5_3 + 2 6 0.50 * cmp word ptr [rax], -16192 + 1 1 1.00 je .LBB5_4 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rsi 1 1 1.00 U ret @@ -66,40 +51,25 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 10.64 10.65 - 10.71 0.50 0.50 + - - 5.99 5.99 - 6.02 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.25 0.73 - 0.02 - - mov rcx, rdx - - - 0.74 0.03 - 0.23 - - mov edx, 3 - - - 0.60 0.40 - - - - mov rax, rcx - - - 1.00 1.00 - - - - mul rdx - - - - - - 1.00 - - jo .LBB5_8 - - - 0.11 0.89 - - - - mov rdx, rax - - - 0.99 0.01 - - - - cmp rax, -10 - - - - - - 1.00 - - ja .LBB5_8 - - - 0.01 0.89 - 0.10 - - mov eax, edx - - - 0.11 0.89 - - - - not eax - - - 0.01 0.88 - 0.11 - - and eax, 3 - - - 0.01 0.99 - - - - lea r8, [rax + rdx] - - - 0.01 0.99 - - - - add r8, 9 - - - - - - - - - xor eax, eax - - - - 0.99 - 0.01 - - cmp 
rsi, r8 - - - - - - 1.00 - - jne .LBB5_6 - - - 0.42 - - 0.58 - - mov r9d, edi - - - 0.53 0.01 - 0.46 - - and r9d, 3 - - - - - - 1.00 - - jne .LBB5_6 - - - 0.99 0.01 - - - - add rdx, 9 - - - 0.99 0.01 - - - - cmp r8, rdx - - - - - - 1.00 - - jb .LBB5_6 - - - - - - - 0.50 0.50 movzx edx, word ptr [rdi] - - - 0.45 0.01 - 0.54 - - cmp dx, -16192 - - - 1.00 0.35 - 0.65 - - cmove rsi, rcx - - - - - - - - - xor eax, eax - - - 0.75 0.02 - 0.23 - - cmp edx, 49344 - - - 1.00 0.68 - 0.32 - - cmove rax, rdi - - - 0.12 0.54 - 0.34 - - mov rdx, rsi - - - - - - 1.00 - - ret + - - - 0.99 - 0.01 - - movabs rax, 3074457345618258598 + - - - 1.00 - - - - cmp rdx, rax + - - - - - 2.00 - - seta cl + - - 1.00 - - - - - mov rax, rdi + - - 0.99 0.01 - - - - test al, 3 + - - 1.00 - - - - - setne dil + - - - 0.99 - 0.01 - - or dil, cl + - - - - - 1.00 - - jne .LBB5_3 + - - 0.01 0.99 - - - - lea rcx, [rdx + 2*rdx] + - - - 1.00 - - - - or rcx, 3 + - - 0.99 0.01 - - - - add rcx, 9 + - - - 1.00 - - - - cmp rsi, rcx + - - - - - 1.00 - - jne .LBB5_3 + - - 1.00 - - - 0.50 0.50 cmp word ptr [rax], -16192 + - - - - - 1.00 - - je .LBB5_4 - - - - - - - - xor eax, eax - - - 0.55 0.33 - 0.12 - - mov rdx, rsi + - - 1.00 - - - - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 index 791351a659..ba34b1855b 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 +++ b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 @@ -1,5 +1,5 @@ bench_try_ref_from_bytes_with_elems_dynamic_size: - movabs rax, 9223372036854775805 + movabs rax, 4611686018427387901 cmp rdx, rax seta cl mov rax, rdi diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca index 76a7caaecf..ae049c03df 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca +++ 
b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca @@ -18,7 +18,7 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 9223372036854775805 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdx, rax 2 2 1.00 seta cl 1 1 0.33 mov rax, rdi @@ -52,7 +52,7 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 9223372036854775805 + - - - 0.99 - 0.01 - - movabs rax, 4611686018427387901 - - 0.50 0.50 - - - - cmp rdx, rax - - 1.96 - - 0.04 - - seta cl - - 0.01 0.99 - - - - mov rax, rdi diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 index d7b2ca9ce2..80e66ba160 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 +++ b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 @@ -1,35 +1,30 @@ bench_try_ref_from_prefix_with_elems_dynamic_padding: - mov rcx, rdx - mov edx, 3 - mov rax, rcx - mul rdx - jo .LBB5_1 - cmp rax, -10 + movabs rax, 3074457345618258598 + cmp rdx, rax ja .LBB5_1 - lea rdx, [rax + 9] - not eax - and eax, 3 - add rax, rdx - jae .LBB5_4 + xor ecx, ecx + mov eax, 0 + test dil, 3 + je .LBB5_3 + mov rdx, rcx + ret +.LBB5_3: + lea rax, [rdx + 2*rdx] + or rax, 3 + add rax, 9 + cmp rax, rsi + jbe .LBB5_4 .LBB5_1: xor eax, eax mov edx, 1 ret .LBB5_4: - mov r8, rax - xor edx, edx - mov eax, 0 - test dil, 3 - je .LBB5_5 - ret -.LBB5_5: - cmp r8, rsi - ja .LBB5_1 movzx esi, word ptr [rdi] cmp si, -16192 - mov edx, 2 - cmove rdx, rcx + mov ecx, 2 + cmove rcx, rdx xor eax, eax cmp esi, 49344 cmove rax, rdi + mov rdx, rcx ret diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca index 9df1d29761..512e8ce643 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca +++ 
b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3100 -Total Cycles: 1008 -Total uOps: 3400 +Instructions: 2600 +Total Cycles: 806 +Total uOps: 2800 Dispatch Width: 4 -uOps Per Cycle: 3.37 -IPC: 3.08 -Block RThroughput: 8.5 +uOps Per Cycle: 3.47 +IPC: 3.23 +Block RThroughput: 7.0 Instruction Info: @@ -18,36 +18,31 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov edx, 3 - 1 1 0.33 mov rax, rcx - 2 4 1.00 mul rdx - 1 1 1.00 jo .LBB5_1 - 1 1 0.33 cmp rax, -10 + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rdx, rax 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea rdx, [rax + 9] - 1 1 0.33 not eax - 1 1 0.33 and eax, 3 - 1 1 0.33 add rax, rdx - 1 1 1.00 jae .LBB5_4 - 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 - 1 1 1.00 U ret - 1 1 0.33 mov r8, rax - 1 0 0.25 xor edx, edx + 1 0 0.25 xor ecx, ecx 1 1 0.33 mov eax, 0 1 1 0.33 test dil, 3 - 1 1 1.00 je .LBB5_5 + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov rdx, rcx + 1 1 1.00 U ret + 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 or rax, 3 + 1 1 0.33 add rax, 9 + 1 1 0.33 cmp rax, rsi + 1 1 1.00 jbe .LBB5_4 + 1 0 0.25 xor eax, eax + 1 1 0.33 mov edx, 1 1 1 1.00 U ret - 1 1 0.33 cmp r8, rsi - 1 1 1.00 ja .LBB5_1 1 5 0.50 * movzx esi, word ptr [rdi] 1 1 0.33 cmp si, -16192 - 1 1 0.33 mov edx, 2 - 2 2 0.67 cmove rdx, rcx + 1 1 0.33 mov ecx, 2 + 2 2 0.67 cmove rcx, rdx 1 0 0.25 xor eax, eax 1 1 0.33 cmp esi, 49344 2 2 0.67 cmove rax, rdi + 1 1 0.33 mov rdx, rcx 1 1 1.00 U ret @@ -64,38 +59,33 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 9.98 9.99 - 10.03 0.50 0.50 + - - 7.98 7.99 - 8.03 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.49 0.50 - 0.01 - - mov rcx, rdx - - - 0.01 0.99 - - - - mov edx, 3 - - - 0.99 0.01 - - - - mov rax, rcx - - - 1.00 1.00 - - - - mul rdx - - - - - - 1.00 - - jo .LBB5_1 - - - 
1.00 - - - - - cmp rax, -10 + - - 0.98 - - 0.02 - - movabs rax, 3074457345618258598 + - - - 1.00 - - - - cmp rdx, rax - - - - - 1.00 - - ja .LBB5_1 - - - - 1.00 - - - - lea rdx, [rax + 9] - - - 1.00 - - - - - not eax - - - 0.99 0.01 - - - - and eax, 3 - - - 0.99 0.01 - - - - add rax, rdx - - - - - - 1.00 - - jae .LBB5_4 - - - - - - - - - xor eax, eax - - - - 0.98 - 0.02 - - mov edx, 1 + - - - - - - - - xor ecx, ecx + - - 0.99 0.01 - - - - mov eax, 0 + - - 0.01 0.96 - 0.03 - - test dil, 3 + - - - - - 1.00 - - je .LBB5_3 + - - 0.97 0.01 - 0.02 - - mov rdx, rcx - - - - - 1.00 - - ret - - - 0.50 0.50 - - - - mov r8, rax - - - - - - - - - xor edx, edx - - - 0.02 0.49 - 0.49 - - mov eax, 0 - - - - 0.49 - 0.51 - - test dil, 3 - - - - - - 1.00 - - je .LBB5_5 + - - 0.03 0.97 - - - - lea rax, [rdx + 2*rdx] + - - 0.03 0.97 - - - - or rax, 3 + - - 0.01 0.99 - - - - add rax, 9 + - - - 1.00 - - - - cmp rax, rsi + - - - - - 1.00 - - jbe .LBB5_4 + - - - - - - - - xor eax, eax + - - 0.98 0.01 - 0.01 - - mov edx, 1 - - - - - 1.00 - - ret - - - 0.98 0.02 - - - - cmp r8, rsi - - - - - - 1.00 - - ja .LBB5_1 - - - - - - 0.50 0.50 movzx esi, word ptr [rdi] - - - 0.02 0.98 - - - - cmp si, -16192 - - - 0.98 0.02 - - - - mov edx, 2 - - - 0.50 1.00 - 0.50 - - cmove rdx, rcx + - - 0.97 0.03 - - - - cmp si, -16192 + - - 0.98 0.01 - 0.01 - - mov ecx, 2 + - - 1.00 0.03 - 0.97 - - cmove rcx, rdx - - - - - - - - xor eax, eax - - - 0.01 0.99 - - - - cmp esi, 49344 - - - 0.50 1.00 - 0.50 - - cmove rax, rdi + - - 0.03 0.97 - - - - cmp esi, 49344 + - - 1.00 1.00 - - - - cmove rax, rdi + - - - 0.03 - 0.97 - - mov rdx, rcx - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 index b659b67b58..c12e87c137 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 +++ b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 @@ -1,5 +1,5 @@ 
bench_try_ref_from_prefix_with_elems_dynamic_size: - movabs rax, 9223372036854775805 + movabs rax, 4611686018427387901 cmp rdx, rax ja .LBB5_1 mov rcx, rdx diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca index 7dc6caa16b..6c3f1a1ec9 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca @@ -18,7 +18,7 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 9223372036854775805 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdx, rax 1 1 1.00 ja .LBB5_1 1 1 0.33 mov rcx, rdx @@ -59,7 +59,7 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.66 0.33 - 0.01 - - movabs rax, 9223372036854775805 + - - 0.66 0.33 - 0.01 - - movabs rax, 4611686018427387901 - - 0.02 0.66 - 0.32 - - cmp rdx, rax - - - - - 1.00 - - ja .LBB5_1 - - 0.66 0.33 - 0.01 - - mov rcx, rdx diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 index a8ceabe11f..c7530d8b68 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 +++ b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 @@ -1,39 +1,32 @@ bench_try_ref_from_suffix_with_elems_dynamic_padding: - mov rcx, rdx - mov edx, 3 - mov rax, rcx - mul rdx - jo .LBB5_1 - cmp rax, -10 + movabs rax, 3074457345618258598 + cmp rdx, rax ja .LBB5_1 - lea rdx, [rax + 9] - not eax - and eax, 3 - add rax, rdx + lea r8d, [rsi + rdi] + xor ecx, ecx + mov eax, 0 + test r8b, 3 + je .LBB5_3 + mov rdx, rcx + ret +.LBB5_3: + lea rax, [rdx + 2*rdx] + or rax, 3 + add rax, 9 + sub rsi, rax jae .LBB5_4 .LBB5_1: - xor r8d, r8d + xor eax, eax mov edx, 1 - mov rax, r8 ret .LBB5_4: - lea r9d, [rsi + rdi] - xor edx, edx - mov r8d, 0 - test r9b, 3 - je 
.LBB5_5 - mov rax, r8 - ret -.LBB5_5: - sub rsi, rax - jb .LBB5_1 - lea rax, [rdi + rsi] + lea r8, [rdi + rsi] movzx esi, word ptr [rdi + rsi] cmp si, -16192 - mov edx, 2 - cmove rdx, rcx - xor r8d, r8d + mov ecx, 2 + cmove rcx, rdx + xor eax, eax cmp esi, 49344 - cmove r8, rax - mov rax, r8 + cmove rax, r8 + mov rdx, rcx ret diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca index 4937b556fe..be736c00c2 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 3500 -Total Cycles: 1144 -Total uOps: 3800 +Instructions: 2800 +Total Cycles: 878 +Total uOps: 3000 Dispatch Width: 4 -uOps Per Cycle: 3.32 -IPC: 3.06 -Block RThroughput: 9.5 +uOps Per Cycle: 3.42 +IPC: 3.19 +Block RThroughput: 7.5 Instruction Info: @@ -18,40 +18,33 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov edx, 3 - 1 1 0.33 mov rax, rcx - 2 4 1.00 mul rdx - 1 1 1.00 jo .LBB5_1 - 1 1 0.33 cmp rax, -10 + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rdx, rax 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea rdx, [rax + 9] - 1 1 0.33 not eax - 1 1 0.33 and eax, 3 - 1 1 0.33 add rax, rdx + 1 1 0.50 lea r8d, [rsi + rdi] + 1 0 0.25 xor ecx, ecx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test r8b, 3 + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov rdx, rcx + 1 1 1.00 U ret + 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 or rax, 3 + 1 1 0.33 add rax, 9 + 1 1 0.33 sub rsi, rax 1 1 1.00 jae .LBB5_4 - 1 0 0.25 xor r8d, r8d + 1 0 0.25 xor eax, eax 1 1 0.33 mov edx, 1 - 1 1 0.33 mov rax, r8 - 1 1 1.00 U ret - 1 1 0.50 lea r9d, [rsi + rdi] - 1 0 0.25 xor edx, edx - 1 1 0.33 mov r8d, 0 - 1 1 0.33 test r9b, 3 - 1 1 1.00 je .LBB5_5 - 1 1 0.33 mov rax, r8 1 1 1.00 U ret - 1 1 0.33 sub rsi, rax - 1 1 1.00 jb .LBB5_1 - 1 1 0.50 lea rax, [rdi + 
rsi] + 1 1 0.50 lea r8, [rdi + rsi] 1 5 0.50 * movzx esi, word ptr [rdi + rsi] 1 1 0.33 cmp si, -16192 - 1 1 0.33 mov edx, 2 - 2 2 0.67 cmove rdx, rcx - 1 0 0.25 xor r8d, r8d + 1 1 0.33 mov ecx, 2 + 2 2 0.67 cmove rcx, rdx + 1 0 0.25 xor eax, eax 1 1 0.33 cmp esi, 49344 - 2 2 0.67 cmove r8, rax - 1 1 0.33 mov rax, r8 + 2 2 0.67 cmove rax, r8 + 1 1 0.33 mov rdx, rcx 1 1 1.00 U ret @@ -68,42 +61,35 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 11.32 11.32 - 11.36 0.50 0.50 + - - 8.65 8.65 - 8.70 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.32 0.67 - 0.01 - - mov rcx, rdx - - - 0.66 0.18 - 0.16 - - mov edx, 3 - - - 1.00 - - - - - mov rax, rcx - - - 1.00 1.00 - - - - mul rdx - - - - - - 1.00 - - jo .LBB5_1 - - - 0.01 0.99 - - - - cmp rax, -10 + - - 0.67 0.30 - 0.03 - - movabs rax, 3074457345618258598 + - - 0.01 0.99 - - - - cmp rdx, rax - - - - - 1.00 - - ja .LBB5_1 - - - 0.99 0.01 - - - - lea rdx, [rax + 9] - - - 0.01 0.99 - - - - not eax - - - 0.02 0.98 - - - - and eax, 3 - - - 0.02 0.98 - - - - add rax, rdx - - - - - - 1.00 - - jae .LBB5_4 - - - - - - - - - xor r8d, r8d - - - 0.66 0.01 - 0.33 - - mov edx, 1 - - - 0.50 - - 0.50 - - mov rax, r8 + - - 0.99 0.01 - - - - lea r8d, [rsi + rdi] + - - - - - - - - xor ecx, ecx + - - 0.35 0.62 - 0.03 - - mov eax, 0 + - - 0.99 0.01 - - - - test r8b, 3 + - - - - - 1.00 - - je .LBB5_3 + - - 0.68 0.30 - 0.02 - - mov rdx, rcx - - - - - 1.00 - - ret - - - 0.99 0.01 - - - - lea r9d, [rsi + rdi] - - - - - - - - - xor edx, edx - - - 0.50 0.32 - 0.18 - - mov r8d, 0 - - - 0.16 0.17 - 0.67 - - test r9b, 3 - - - - - - 1.00 - - je .LBB5_5 - - - 0.33 0.33 - 0.34 - - mov rax, r8 + - - 0.07 0.93 - - - - lea rax, [rdx + 2*rdx] + - - 0.06 0.35 - 0.59 - - or rax, 3 + - - 0.02 0.07 - 0.91 - - add rax, 9 + - - 0.01 0.04 - 0.95 - - sub rsi, rax + - - - - - 1.00 - - jae .LBB5_4 + - - - - - - - - xor eax, eax + - - 0.92 0.01 - 0.07 - - mov edx, 1 - - - 
- - 1.00 - - ret - - - - 0.51 - 0.49 - - sub rsi, rax - - - - - - 1.00 - - jb .LBB5_1 - - - 0.16 0.84 - - - - lea rax, [rdi + rsi] + - - - 1.00 - - - - lea r8, [rdi + rsi] - - - - - - 0.50 0.50 movzx esi, word ptr [rdi + rsi] - - - 0.02 0.98 - - - - cmp si, -16192 - - - 1.00 - - - - - mov edx, 2 - - - 0.99 0.84 - 0.17 - - cmove rdx, rcx - - - - - - - - - xor r8d, r8d - - - 0.98 - - 0.02 - - cmp esi, 49344 - - - 0.99 0.52 - 0.49 - - cmove r8, rax - - - 0.01 0.99 - - - - mov rax, r8 + - - 0.01 0.99 - - - - cmp si, -16192 + - - 0.88 0.04 - 0.08 - - mov ecx, 2 + - - 1.00 0.99 - 0.01 - - cmove rcx, rdx + - - - - - - - - xor eax, eax + - - 0.99 0.01 - - - - cmp esi, 49344 + - - 1.00 1.00 - - - - cmove rax, r8 + - - - 0.99 - 0.01 - - mov rdx, rcx - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 index ff25a78945..952eb12de8 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 +++ b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 @@ -1,5 +1,5 @@ bench_try_ref_from_suffix_with_elems_dynamic_size: - movabs rax, 9223372036854775805 + movabs rax, 4611686018427387901 cmp rdx, rax ja .LBB5_1 lea r8d, [rsi + rdi] diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca index 8b6333bf34..d4f78f67a2 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca @@ -18,7 +18,7 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 9223372036854775805 + 1 1 0.33 movabs rax, 4611686018427387901 1 1 0.33 cmp rdx, rax 1 1 1.00 ja .LBB5_1 1 1 0.50 lea r8d, [rsi + rdi] @@ -61,7 +61,7 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.02 0.95 - 0.03 - - movabs rax, 
9223372036854775805 + - - 0.02 0.95 - 0.03 - - movabs rax, 4611686018427387901 - - 0.93 0.04 - 0.03 - - cmp rdx, rax - - - - - 1.00 - - ja .LBB5_1 - - 0.96 0.04 - - - - lea r8d, [rsi + rdi] diff --git a/src/byte_slice.rs b/src/byte_slice.rs index ace0b5dd6c..6f9ee9ac33 100644 --- a/src/byte_slice.rs +++ b/src/byte_slice.rs @@ -365,7 +365,7 @@ mod proofs { fn any_vec() -> Vec { let len = kani::any(); - kani::assume(len <= isize::MAX as usize); + kani::assume(len <= crate::DstLayout::MAX_SIZE); vec![0u8; len] } diff --git a/src/layout.rs b/src/layout.rs index 6c83676c80..19ad5ca85f 100644 --- a/src/layout.rs +++ b/src/layout.rs @@ -136,6 +136,18 @@ impl DstLayout { None => const_unreachable!(), }; + /// The maximum size of an allocation \[1\]. + /// + /// \[1\] Per : + /// + /// For any allocation with base `address`, `size`, and a set of `addresses`, + /// the following are guaranteed: [..] + /// + /// - `size <= isize::MAX` + /// + #[allow(clippy::as_conversions)] + pub(crate) const MAX_SIZE: usize = isize::MAX as usize; + /// Assumes that this layout lacks static shallow padding. /// /// # Panics @@ -1975,7 +1987,7 @@ mod proofs { true => { let size: usize = kani::any(); - kani::assume(size <= isize::MAX as _); + kani::assume(size <= DstLayout::MAX_SIZE); SizeInfo::Sized { size } } @@ -1989,8 +2001,8 @@ mod proofs { let elem_size: usize = kani::any(); let offset: usize = kani::any(); - kani::assume(elem_size < isize::MAX as _); - kani::assume(offset < isize::MAX as _); + kani::assume(elem_size < DstLayout::MAX_SIZE); + kani::assume(offset < DstLayout::MAX_SIZE); TrailingSliceLayout { elem_size, offset } } @@ -2019,7 +2031,7 @@ mod proofs { loop {} }; - if unpadded_size >= isize::MAX as usize { + if unpadded_size >= DstLayout::MAX_SIZE { // The `unpadded_size` exceeds `isize::MAX`; `meta` is invalid. 
kani::assume(false); loop {} diff --git a/src/lib.rs b/src/lib.rs index cf6fb4518c..0c66144932 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -863,6 +863,19 @@ pub unsafe trait KnownLayout { fn size_for_metadata(meta: Self::PointerMetadata) -> Option { meta.size_for_metadata(Self::LAYOUT) } + + /// Computes whether `meta` can describe a valid allocation of `Self`. + /// + /// # Safety + /// + /// `is_valid_metadata` promises to return `true` if and only if the size of + /// an allocation of `Self` with `meta` would not exceed + /// [`isize::MAX`]. + #[doc(hidden)] + #[inline(always)] + fn is_valid_metadata(meta: Self::PointerMetadata) -> bool { + meta.to_elem_count() <= maximum_trailing_slice_len::().to_elem_count() + } } /// Efficiently produces the [`TrailingSliceLayout`] of `T`. @@ -888,9 +901,39 @@ where T::SIZE_INFO } +/// Efficiently produces the maximum trailing slice length of `T`. +#[inline(always)] +pub(crate) fn maximum_trailing_slice_len() -> usize +where + T: ?Sized + KnownLayout, +{ + trait LayoutFacts { + const MAX_LEN: usize; + } + + impl LayoutFacts for T + where + T: KnownLayout, + { + const MAX_LEN: usize = match T::LAYOUT.size_info { + SizeInfo::SliceDst(TrailingSliceLayout { elem_size: 0, .. }) => usize::MAX, + _ => match T::LAYOUT.validate_cast_and_convert_metadata( + T::LAYOUT.align.get(), + DstLayout::MAX_SIZE, + CastType::Prefix, + ) { + Ok((elems, _)) => elems, + Err(_) => const_panic!("unreachable"), + }, + }; + } + + T::MAX_LEN +} + /// The metadata associated with a [`KnownLayout`] type. #[doc(hidden)] -pub trait PointerMetadata: Copy + Eq + Debug { +pub trait PointerMetadata: Copy + Eq + Debug + Ord { /// Constructs a `Self` from an element count. /// /// If `Self = ()`, this returns `()`. 
If `Self = usize`, this returns diff --git a/src/util/mod.rs b/src/util/mod.rs index ccc5166fdd..4016f8f048 100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -142,7 +142,7 @@ pub(crate) fn validate_aligned_to(t: T) -> Result<(), Alignment /// on the answer it gives if this is not the case. #[cfg_attr( kani, - kani::requires(len <= isize::MAX as usize), + kani::requires(len <= DstLayout::MAX_SIZE), kani::requires(align.is_power_of_two()), kani::ensures(|&p| (len + p) % align.get() == 0), // Ensures that we add the minimum required padding. @@ -382,29 +382,29 @@ pub(crate) unsafe fn new_box( where T: ?Sized + crate::KnownLayout, { + let align = T::LAYOUT.align.get(); + if !T::is_valid_metadata(meta) { + return Err(AllocError); + } let size = match T::size_for_metadata(meta) { Some(size) => size, + // Thanks to the `!T::is_valid_metadata(meta)` check + // above, this branch is unreachable. Fortunately, the + // optimizer recognizes this, so replacing this branch + // with `unreachable_unchecked` produces no codegen + // improvements. None => return Err(AllocError), }; - - let align = T::LAYOUT.align.get(); - // On stable Rust versions <= 1.64.0, `Layout::from_size_align` has a bug in - // which sufficiently-large allocations (those which, when rounded up to the - // alignment, overflow `isize`) are not rejected, which can cause undefined - // behavior. See #64 for details. - // - // FIXME(#67): Once our MSRV is > 1.64.0, remove this assertion. - #[allow(clippy::as_conversions)] - let max_alloc = (isize::MAX as usize).saturating_sub(align); - if size > max_alloc { - return Err(AllocError); - } - - // FIXME(https://github.com/rust-lang/rust/issues/55724): Use - // `Layout::repeat` once it's stabilized. - let layout = Layout::from_size_align(size, align).or(Err(AllocError))?; - - let ptr = if layout.size() != 0 { + let ptr = if size != 0 { + // SAFETY: + // - `align` is derived from a `NonZeroUsize` and is thus non-zero. 
+ // - `align` is a power of two because, by invariant on + // `KnownLayout::LAYOUT` `::LAYOUT` accurately + // reflects the layout of `T`. + // - `size`, by invariant on `size_for_metadata` is well-aligned for + // `align` and, by the check on `T::is_valid_metadata(meta)`, is no + // greater than `isize::MAX`. + let layout: Layout = unsafe { Layout::from_size_align_unchecked(size, align) }; // SAFETY: By contract on the caller, `allocate` is either // `alloc::alloc::alloc` or `alloc::alloc::alloc_zeroed`. The above // check ensures their shared safety precondition: that the supplied @@ -420,8 +420,6 @@ where None => return Err(AllocError), } } else { - let align = T::LAYOUT.align.get(); - // We use `transmute` instead of an `as` cast since Miri (with strict // provenance enabled) notices and complains that an `as` cast creates a // pointer with no provenance. Miri isn't smart enough to realize that @@ -435,8 +433,8 @@ where #[allow(unknown_lints)] #[allow(clippy::useless_transmute, integer_to_ptr_transmutes)] let dangling = unsafe { mem::transmute::(align) }; - // SAFETY: `dangling` is constructed from `T::LAYOUT.align`, which is a - // `NonZeroUsize`, which is guaranteed to be non-zero. + // SAFETY: `dangling` is constructed from `align`, which is derived from + // a `NonZeroUsize`, which is guaranteed to be non-zero. // // `Box<[T]>` does not allocate when `T` is zero-sized or when `len` is // zero, but it does require a non-null dangling pointer for its @@ -579,11 +577,19 @@ mod len_of { ) -> Result<(MetadataOf, MetadataOf<[u8]>), MetadataCastError> { let layout = match meta { None => T::LAYOUT, - // This can return `None` if the metadata describes an object - // which can't fit in an `isize`. + // This can return `Err(MetadataCastError::Size)` if the + // metadata describes an object which can't fit in an `isize`. 
Some(meta) => { + if !T::is_valid_metadata(meta) { + return Err(MetadataCastError::Size); + } let size = match T::size_for_metadata(meta) { Some(size) => size, + // Thanks to the `!T::is_valid_metadata(meta)` check + // above, this branch is unreachable. Fortunately, the + // optimizer recognizes this, so replacing this branch + // with `unreachable_unchecked` produces no codegen + // improvements. None => return Err(MetadataCastError::Size), }; DstLayout { From 16a435370d6d7e14a8791ac1bc05967cf6a6e716 Mon Sep 17 00:00:00 2001 From: Jack Wrenn Date: Tue, 17 Mar 2026 16:57:13 +0000 Subject: [PATCH 3/3] [wip] experiment with reversing alignment and size validation order gherrit-pr-id: G74bf0db1b7c30669171b5985ce24849a264746e7 --- benches/ref_from_bytes_dynamic_padding.x86-64 | 28 ++--- .../ref_from_bytes_dynamic_padding.x86-64.mca | 76 +++++++------ benches/ref_from_bytes_dynamic_size.x86-64 | 26 ++--- .../ref_from_bytes_dynamic_size.x86-64.mca | 60 +++++------ ...om_bytes_with_elems_dynamic_padding.x86-64 | 16 ++- ...ytes_with_elems_dynamic_padding.x86-64.mca | 52 +++++---- ..._from_bytes_with_elems_dynamic_size.x86-64 | 13 ++- ...m_bytes_with_elems_dynamic_size.x86-64.mca | 46 ++++---- .../ref_from_prefix_dynamic_padding.x86-64 | 28 ++--- ...ref_from_prefix_dynamic_padding.x86-64.mca | 70 ++++++------ benches/ref_from_prefix_dynamic_size.x86-64 | 12 +-- .../ref_from_prefix_dynamic_size.x86-64.mca | 36 +++---- benches/ref_from_prefix_static_size.x86-64 | 7 +- .../ref_from_prefix_static_size.x86-64.mca | 30 +++--- ...m_prefix_with_elems_dynamic_padding.x86-64 | 33 +++--- ...efix_with_elems_dynamic_padding.x86-64.mca | 78 ++++++-------- ...from_prefix_with_elems_dynamic_size.x86-64 | 27 +++-- ..._prefix_with_elems_dynamic_size.x86-64.mca | 66 ++++++------ .../ref_from_suffix_dynamic_padding.x86-64 | 8 +- ...ref_from_suffix_dynamic_padding.x86-64.mca | 44 ++++---- benches/ref_from_suffix_dynamic_size.x86-64 | 20 ++-- .../ref_from_suffix_dynamic_size.x86-64.mca | 56 
+++++----- benches/ref_from_suffix_static_size.x86-64 | 15 ++- .../ref_from_suffix_static_size.x86-64.mca | 44 ++++---- ...m_suffix_with_elems_dynamic_padding.x86-64 | 37 ++++--- ...ffix_with_elems_dynamic_padding.x86-64.mca | 82 +++++++------- ...from_suffix_with_elems_dynamic_size.x86-64 | 31 +++--- ..._suffix_with_elems_dynamic_size.x86-64.mca | 70 ++++++------ .../try_ref_from_bytes_dynamic_padding.x86-64 | 23 ++-- ..._ref_from_bytes_dynamic_padding.x86-64.mca | 76 +++++++------ .../try_ref_from_bytes_dynamic_size.x86-64 | 32 +++--- ...try_ref_from_bytes_dynamic_size.x86-64.mca | 76 ++++++------- ...om_bytes_with_elems_dynamic_padding.x86-64 | 18 ++-- ...ytes_with_elems_dynamic_padding.x86-64.mca | 52 +++++---- ..._from_bytes_with_elems_dynamic_size.x86-64 | 12 +-- ...m_bytes_with_elems_dynamic_size.x86-64.mca | 46 ++++---- ...try_ref_from_prefix_dynamic_padding.x86-64 | 36 ++++--- ...ref_from_prefix_dynamic_padding.x86-64.mca | 86 ++++++++------- .../try_ref_from_prefix_dynamic_size.x86-64 | 12 +-- ...ry_ref_from_prefix_dynamic_size.x86-64.mca | 32 +++--- .../try_ref_from_prefix_static_size.x86-64 | 5 +- ...try_ref_from_prefix_static_size.x86-64.mca | 30 +++--- ...m_prefix_with_elems_dynamic_padding.x86-64 | 32 +++--- ...efix_with_elems_dynamic_padding.x86-64.mca | 78 +++++++------- ...from_prefix_with_elems_dynamic_size.x86-64 | 20 ++-- ..._prefix_with_elems_dynamic_size.x86-64.mca | 48 ++++----- ...try_ref_from_suffix_dynamic_padding.x86-64 | 8 +- ...ref_from_suffix_dynamic_padding.x86-64.mca | 52 ++++----- .../try_ref_from_suffix_dynamic_size.x86-64 | 15 ++- ...ry_ref_from_suffix_dynamic_size.x86-64.mca | 50 +++++---- .../try_ref_from_suffix_static_size.x86-64 | 15 ++- ...try_ref_from_suffix_static_size.x86-64.mca | 44 ++++---- ...m_suffix_with_elems_dynamic_padding.x86-64 | 47 ++++---- ...ffix_with_elems_dynamic_padding.x86-64.mca | 102 +++++++++--------- ...from_suffix_with_elems_dynamic_size.x86-64 | 41 +++---- 
..._suffix_with_elems_dynamic_size.x86-64.mca | 90 ++++++++-------- src/layout.rs | 68 ++++++------ 57 files changed, 1170 insertions(+), 1187 deletions(-) diff --git a/benches/ref_from_bytes_dynamic_padding.x86-64 b/benches/ref_from_bytes_dynamic_padding.x86-64 index e844a4608f..5177a4ce95 100644 --- a/benches/ref_from_bytes_dynamic_padding.x86-64 +++ b/benches/ref_from_bytes_dynamic_padding.x86-64 @@ -1,22 +1,24 @@ bench_ref_from_bytes_dynamic_padding: - test dil, 3 - jne .LBB5_3 movabs rax, 9223372036854775804 and rax, rsi cmp rax, 9 - jb .LBB5_3 + setb cl + test dil, 3 + setne dl + or dl, cl + jne .LBB5_1 add rax, -9 movabs rcx, -6148914691236517205 mul rcx shr rdx - lea rax, [rdx + 2*rdx] - or rax, 3 - add rax, 9 - cmp rsi, rax - je .LBB5_4 -.LBB5_3: - xor edi, edi - mov rdx, rsi -.LBB5_4: - mov rax, rdi + lea rcx, [rdx + 2*rdx] + or rcx, 3 + add rcx, 9 + xor eax, eax + cmp rsi, rcx + cmovne rdx, rsi + cmove rax, rdi + ret +.LBB5_1: + xor eax, eax ret diff --git a/benches/ref_from_bytes_dynamic_padding.x86-64.mca b/benches/ref_from_bytes_dynamic_padding.x86-64.mca index 423ed38ba2..25a0d3e961 100644 --- a/benches/ref_from_bytes_dynamic_padding.x86-64.mca +++ b/benches/ref_from_bytes_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1900 -Total Cycles: 645 -Total uOps: 2000 +Instructions: 2200 +Total Cycles: 783 +Total uOps: 2500 Dispatch Width: 4 -uOps Per Cycle: 3.10 -IPC: 2.95 -Block RThroughput: 5.0 +uOps Per Cycle: 3.19 +IPC: 2.81 +Block RThroughput: 6.3 Instruction Info: @@ -18,24 +18,27 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 test dil, 3 - 1 1 1.00 jne .LBB5_3 1 1 0.33 movabs rax, 9223372036854775804 1 1 0.33 and rax, rsi 1 1 0.33 cmp rax, 9 - 1 1 1.00 jb .LBB5_3 + 1 1 0.50 setb cl + 1 1 0.33 test dil, 3 + 1 1 0.50 setne dl + 1 1 0.33 or dl, cl + 1 1 1.00 jne .LBB5_1 1 1 0.33 add rax, -9 1 1 0.33 movabs rcx, -6148914691236517205 2 4 1.00 mul rcx 1 1 0.50 shr rdx - 1 1 0.50 
lea rax, [rdx + 2*rdx] - 1 1 0.33 or rax, 3 - 1 1 0.33 add rax, 9 - 1 1 0.33 cmp rsi, rax - 1 1 1.00 je .LBB5_4 - 1 0 0.25 xor edi, edi - 1 1 0.33 mov rdx, rsi - 1 1 0.33 mov rax, rdi + 1 1 0.50 lea rcx, [rdx + 2*rdx] + 1 1 0.33 or rcx, 3 + 1 1 0.33 add rcx, 9 + 1 0 0.25 xor eax, eax + 1 1 0.33 cmp rsi, rcx + 2 2 0.67 cmovne rdx, rsi + 2 2 0.67 cmove rax, rdi + 1 1 1.00 U ret + 1 0 0.25 xor eax, eax 1 1 1.00 U ret @@ -52,26 +55,29 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.32 6.33 - 6.35 - - + - - 7.65 7.67 - 7.68 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.64 0.35 - 0.01 - - test dil, 3 - - - - - - 1.00 - - jne .LBB5_3 - - - 0.34 0.65 - 0.01 - - movabs rax, 9223372036854775804 - - - 0.35 0.65 - - - - and rax, rsi - - - 0.33 0.34 - 0.33 - - cmp rax, 9 - - - - - - 1.00 - - jb .LBB5_3 - - - 0.35 - - 0.65 - - add rax, -9 - - - 0.97 0.01 - 0.02 - - movabs rcx, -6148914691236517205 + - - - 0.99 - 0.01 - - movabs rax, 9223372036854775804 + - - 0.04 0.95 - 0.01 - - and rax, rsi + - - 0.09 0.85 - 0.06 - - cmp rax, 9 + - - 0.50 - - 0.50 - - setb cl + - - 0.01 0.95 - 0.04 - - test dil, 3 + - - 0.36 - - 0.64 - - setne dl + - - 0.47 0.12 - 0.41 - - or dl, cl + - - - - - 1.00 - - jne .LBB5_1 + - - - 0.95 - 0.05 - - add rax, -9 + - - - 0.81 - 0.19 - - movabs rcx, -6148914691236517205 - - 1.00 1.00 - - - - mul rcx - - - 0.99 - - 0.01 - - shr rdx - - - 0.33 0.67 - - - - lea rax, [rdx + 2*rdx] - - - 0.34 0.66 - - - - or rax, 3 - - - 0.33 0.66 - 0.01 - - add rax, 9 - - - 0.01 0.99 - - - - cmp rsi, rax - - - - - - 1.00 - - je .LBB5_4 - - - - - - - - - xor edi, edi - - - 0.32 0.01 - 0.67 - - mov rdx, rsi - - - 0.02 0.34 - 0.64 - - mov rax, rdi + - - 0.62 - - 0.38 - - shr rdx + - - 0.62 0.38 - - - - lea rcx, [rdx + 2*rdx] + - - 0.59 0.17 - 0.24 - - or rcx, 3 + - - 0.61 0.19 - 0.20 - - add rcx, 9 + - - - - - - - - xor eax, eax + - - 0.75 0.24 - 0.01 - - cmp rsi, rcx + - - 1.00 0.03 - 0.97 
- - cmovne rdx, rsi + - - 0.99 0.04 - 0.97 - - cmove rax, rdi + - - - - - 1.00 - - ret + - - - - - - - - xor eax, eax - - - - - 1.00 - - ret diff --git a/benches/ref_from_bytes_dynamic_size.x86-64 b/benches/ref_from_bytes_dynamic_size.x86-64 index cc905b76c0..2ed6e32b05 100644 --- a/benches/ref_from_bytes_dynamic_size.x86-64 +++ b/benches/ref_from_bytes_dynamic_size.x86-64 @@ -1,20 +1,20 @@ bench_ref_from_bytes_dynamic_size: - mov rdx, rsi cmp rsi, 4 setb al - or al, dil - test al, 1 - je .LBB5_2 + mov ecx, edi + or cl, al + test cl, 1 + jne .LBB5_1 + lea rcx, [rsi - 4] + mov rdx, rcx + shr rdx + and rcx, -2 + add rcx, 4 xor eax, eax + cmp rsi, rcx + cmovne rdx, rsi + cmove rax, rdi ret -.LBB5_2: - lea rcx, [rdx - 4] - mov rsi, rcx - and rsi, -2 - add rsi, 4 - shr rcx +.LBB5_1: xor eax, eax - cmp rdx, rsi - cmove rdx, rcx - cmove rax, rdi ret diff --git a/benches/ref_from_bytes_dynamic_size.x86-64.mca b/benches/ref_from_bytes_dynamic_size.x86-64.mca index 68aea583e4..7c90f65142 100644 --- a/benches/ref_from_bytes_dynamic_size.x86-64.mca +++ b/benches/ref_from_bytes_dynamic_size.x86-64.mca @@ -1,11 +1,11 @@ Iterations: 100 Instructions: 1800 -Total Cycles: 704 +Total Cycles: 606 Total uOps: 2000 Dispatch Width: 4 -uOps Per Cycle: 2.84 -IPC: 2.56 +uOps Per Cycle: 3.30 +IPC: 2.97 Block RThroughput: 5.0 @@ -18,23 +18,23 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rdx, rsi 1 1 0.33 cmp rsi, 4 1 1 0.50 setb al - 1 1 0.33 or al, dil - 1 1 0.33 test al, 1 - 1 1 1.00 je .LBB5_2 + 1 1 0.33 mov ecx, edi + 1 1 0.33 or cl, al + 1 1 0.33 test cl, 1 + 1 1 1.00 jne .LBB5_1 + 1 1 0.50 lea rcx, [rsi - 4] + 1 1 0.33 mov rdx, rcx + 1 1 0.50 shr rdx + 1 1 0.33 and rcx, -2 + 1 1 0.33 add rcx, 4 1 0 0.25 xor eax, eax + 1 1 0.33 cmp rsi, rcx + 2 2 0.67 cmovne rdx, rsi + 2 2 0.67 cmove rax, rdi 1 1 1.00 U ret - 1 1 0.50 lea rcx, [rdx - 4] - 1 1 0.33 mov rsi, rcx - 1 1 0.33 and rsi, -2 - 1 1 0.33 add rsi, 4 - 1 1 0.50 shr rcx 1 0 0.25 
xor eax, eax - 1 1 0.33 cmp rdx, rsi - 2 2 0.67 cmove rdx, rcx - 2 2 0.67 cmove rax, rdi 1 1 1.00 U ret @@ -51,25 +51,25 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 5.97 5.98 - 6.05 - - + - - 6.00 6.00 - 6.00 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.97 0.01 - 0.02 - - mov rdx, rsi - - - 0.01 0.02 - 0.97 - - cmp rsi, 4 - - - 0.03 - - 0.97 - - setb al - - - 0.01 0.02 - 0.97 - - or al, dil - - - - 0.98 - 0.02 - - test al, 1 - - - - - - 1.00 - - je .LBB5_2 + - - 0.99 - - 0.01 - - cmp rsi, 4 + - - 1.00 - - - - - setb al + - - 0.98 0.02 - - - - mov ecx, edi + - - 0.98 0.01 - 0.01 - - or cl, al + - - 0.01 0.99 - - - - test cl, 1 + - - - - - 1.00 - - jne .LBB5_1 + - - 0.99 0.01 - - - - lea rcx, [rsi - 4] + - - 0.02 0.98 - - - - mov rdx, rcx + - - - - - 1.00 - - shr rdx + - - 0.99 0.01 - - - - and rcx, -2 + - - - 1.00 - - - - add rcx, 4 - - - - - - - - xor eax, eax + - - 0.02 0.98 - - - - cmp rsi, rcx + - - 0.01 1.00 - 0.99 - - cmovne rdx, rsi + - - 0.01 1.00 - 0.99 - - cmove rax, rdi - - - - - 1.00 - - ret - - - 0.98 0.02 - - - - lea rcx, [rdx - 4] - - - 0.01 0.99 - - - - mov rsi, rcx - - - - 0.98 - 0.02 - - and rsi, -2 - - - 0.98 0.01 - 0.01 - - add rsi, 4 - - - 0.99 - - 0.01 - - shr rcx - - - - - - - - xor eax, eax - - - 0.02 0.97 - 0.01 - - cmp rdx, rsi - - - 0.99 0.99 - 0.02 - - cmove rdx, rcx - - - 0.98 0.99 - 0.03 - - cmove rax, rdi - - - - - 1.00 - - ret diff --git a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 index d579b3faef..1ab816b4cc 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 +++ b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64 @@ -1,19 +1,17 @@ bench_ref_from_bytes_with_elems_dynamic_padding: - movabs rax, 3074457345618258598 - cmp rdx, rax - seta cl + movabs rcx, 3074457345618258598 + cmp rdx, rcx + ja .LBB5_3 mov rax, rdi test al, 3 - setne dil - or 
dil, cl - jne .LBB5_2 + jne .LBB5_3 lea rcx, [rdx + 2*rdx] or rcx, 3 add rcx, 9 cmp rsi, rcx - je .LBB5_3 -.LBB5_2: + jne .LBB5_3 + ret +.LBB5_3: xor eax, eax mov rdx, rsi -.LBB5_3: ret diff --git a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca index ea2d83dbd1..afb0b4c0b1 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca +++ b/benches/ref_from_bytes_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1600 -Total Cycles: 539 -Total uOps: 1700 +Instructions: 1500 +Total Cycles: 505 +Total uOps: 1500 Dispatch Width: 4 -uOps Per Cycle: 3.15 +uOps Per Cycle: 2.97 IPC: 2.97 -Block RThroughput: 4.3 +Block RThroughput: 5.0 Instruction Info: @@ -18,19 +18,18 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 3074457345618258598 - 1 1 0.33 cmp rdx, rax - 2 2 1.00 seta cl + 1 1 0.33 movabs rcx, 3074457345618258598 + 1 1 0.33 cmp rdx, rcx + 1 1 1.00 ja .LBB5_3 1 1 0.33 mov rax, rdi 1 1 0.33 test al, 3 - 1 1 0.50 setne dil - 1 1 0.33 or dil, cl - 1 1 1.00 jne .LBB5_2 + 1 1 1.00 jne .LBB5_3 1 1 0.50 lea rcx, [rdx + 2*rdx] 1 1 0.33 or rcx, 3 1 1 0.33 add rcx, 9 1 1 0.33 cmp rsi, rcx - 1 1 1.00 je .LBB5_3 + 1 1 1.00 jne .LBB5_3 + 1 1 1.00 U ret 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rsi 1 1 1.00 U ret @@ -49,23 +48,22 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 5.33 5.32 - 5.35 - - + - - 4.49 4.49 - 5.02 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.01 0.98 - 0.01 - - movabs rax, 3074457345618258598 - - - - 1.00 - - - - cmp rdx, rax - - - 1.98 - - 0.02 - - seta cl - - - 0.02 0.98 - - - - mov rax, rdi - - - - 0.67 - 0.33 - - test al, 3 - - - 0.67 - - 0.33 - - setne dil - - - 0.99 - - 0.01 - - or dil, cl - - - - - - 1.00 - - jne .LBB5_2 - - - 0.01 0.99 - - - - lea rcx, [rdx + 2*rdx] - - - - 0.01 - 
0.99 - - or rcx, 3 - - - 0.65 0.02 - 0.33 - - add rcx, 9 - - - 0.99 0.01 - - - - cmp rsi, rcx - - - - - - 1.00 - - je .LBB5_3 + - - 0.97 0.02 - 0.01 - - movabs rcx, 3074457345618258598 + - - 0.50 0.50 - - - - cmp rdx, rcx + - - - - - 1.00 - - ja .LBB5_3 + - - 0.50 0.50 - - - - mov rax, rdi + - - 0.02 0.97 - 0.01 - - test al, 3 + - - - - - 1.00 - - jne .LBB5_3 + - - 0.97 0.03 - - - - lea rcx, [rdx + 2*rdx] + - - 0.50 0.50 - - - - or rcx, 3 + - - 0.03 0.97 - - - - add rcx, 9 + - - 0.03 0.97 - - - - cmp rsi, rcx + - - - - - 1.00 - - jne .LBB5_3 + - - - - - 1.00 - - ret - - - - - - - - xor eax, eax - - - 0.01 0.66 - 0.33 - - mov rdx, rsi + - - 0.97 0.03 - - - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 index 3d8d15b7f6..efee25e23f 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 +++ b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64 @@ -1,13 +1,12 @@ bench_ref_from_bytes_with_elems_dynamic_size: - movabs rax, 4611686018427387901 - cmp rdx, rax - seta cl + movabs rcx, 4611686018427387901 + cmp rdx, rcx + ja .LBB5_2 mov rax, rdi - or dil, cl - test dil, 1 - jne .LBB5_2 lea rcx, [2*rdx + 4] - cmp rsi, rcx + and edi, 1 + xor rcx, rsi + or rcx, rdi je .LBB5_3 .LBB5_2: xor eax, eax diff --git a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca index 602179f3c9..3235e68f2b 100644 --- a/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca +++ b/benches/ref_from_bytes_with_elems_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1300 -Total Cycles: 439 -Total uOps: 1400 +Instructions: 1200 +Total Cycles: 371 +Total uOps: 1200 Dispatch Width: 4 -uOps Per Cycle: 3.19 -IPC: 2.96 -Block RThroughput: 3.5 +uOps Per Cycle: 3.23 +IPC: 3.23 +Block RThroughput: 3.0 Instruction Info: @@ -18,15 +18,14 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] 
[4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 4611686018427387901 - 1 1 0.33 cmp rdx, rax - 2 2 1.00 seta cl + 1 1 0.33 movabs rcx, 4611686018427387901 + 1 1 0.33 cmp rdx, rcx + 1 1 1.00 ja .LBB5_2 1 1 0.33 mov rax, rdi - 1 1 0.33 or dil, cl - 1 1 0.33 test dil, 1 - 1 1 1.00 jne .LBB5_2 1 1 0.50 lea rcx, [2*rdx + 4] - 1 1 0.33 cmp rsi, rcx + 1 1 0.33 and edi, 1 + 1 1 0.33 xor rcx, rsi + 1 1 0.33 or rcx, rdi 1 1 1.00 je .LBB5_3 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rsi @@ -46,20 +45,19 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 4.32 4.33 - 4.35 - - + - - 3.66 3.66 - 3.68 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 4611686018427387901 - - - 0.33 0.67 - - - - cmp rdx, rax - - - 1.98 - - 0.02 - - seta cl - - - 0.01 0.99 - - - - mov rax, rdi - - - 1.00 - - - - - or dil, cl - - - 0.99 0.01 - - - - test dil, 1 - - - - - - 1.00 - - jne .LBB5_2 - - - - 1.00 - - - - lea rcx, [2*rdx + 4] - - - 0.01 - - 0.99 - - cmp rsi, rcx + - - - 0.99 - 0.01 - - movabs rcx, 4611686018427387901 + - - 0.35 0.33 - 0.32 - - cmp rdx, rcx + - - - - - 1.00 - - ja .LBB5_2 + - - 0.63 0.37 - - - - mov rax, rdi + - - 0.35 0.65 - - - - lea rcx, [2*rdx + 4] + - - 0.34 0.65 - 0.01 - - and edi, 1 + - - 0.99 0.01 - - - - xor rcx, rsi + - - 1.00 - - - - - or rcx, rdi - - - - - 1.00 - - je .LBB5_3 - - - - - - - - xor eax, eax - - - - 0.67 - 0.33 - - mov rdx, rsi + - - - 0.66 - 0.34 - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/ref_from_prefix_dynamic_padding.x86-64 b/benches/ref_from_prefix_dynamic_padding.x86-64 index a58592a245..01228fbcbc 100644 --- a/benches/ref_from_prefix_dynamic_padding.x86-64 +++ b/benches/ref_from_prefix_dynamic_padding.x86-64 @@ -1,22 +1,24 @@ bench_ref_from_prefix_dynamic_padding: - xor edx, edx - mov eax, 0 - test dil, 3 - je .LBB5_1 - ret -.LBB5_1: movabs rax, 9223372036854775804 - and rsi, rax - cmp rsi, 9 - jae .LBB5_3 + and rax, rsi 
+ cmp rax, 9 + jae .LBB5_2 mov edx, 1 - xor eax, eax + xor ecx, ecx + mov rax, rcx + ret +.LBB5_2: + xor edx, edx + mov ecx, 0 + test dil, 3 + je .LBB5_3 + mov rax, rcx ret .LBB5_3: - add rsi, -9 + add rax, -9 movabs rcx, -6148914691236517205 - mov rax, rsi mul rcx shr rdx - mov rax, rdi + mov rcx, rdi + mov rax, rcx ret diff --git a/benches/ref_from_prefix_dynamic_padding.x86-64.mca b/benches/ref_from_prefix_dynamic_padding.x86-64.mca index 62ea4babaf..6e50e96210 100644 --- a/benches/ref_from_prefix_dynamic_padding.x86-64.mca +++ b/benches/ref_from_prefix_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1900 -Total Cycles: 608 -Total uOps: 2000 +Instructions: 2100 +Total Cycles: 673 +Total uOps: 2200 Dispatch Width: 4 -uOps Per Cycle: 3.29 -IPC: 3.13 -Block RThroughput: 5.0 +uOps Per Cycle: 3.27 +IPC: 3.12 +Block RThroughput: 5.5 Instruction Info: @@ -18,24 +18,26 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 0 0.25 xor edx, edx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 3 - 1 1 1.00 je .LBB5_1 - 1 1 1.00 U ret 1 1 0.33 movabs rax, 9223372036854775804 - 1 1 0.33 and rsi, rax - 1 1 0.33 cmp rsi, 9 - 1 1 1.00 jae .LBB5_3 + 1 1 0.33 and rax, rsi + 1 1 0.33 cmp rax, 9 + 1 1 1.00 jae .LBB5_2 1 1 0.33 mov edx, 1 - 1 0 0.25 xor eax, eax + 1 0 0.25 xor ecx, ecx + 1 1 0.33 mov rax, rcx + 1 1 1.00 U ret + 1 0 0.25 xor edx, edx + 1 1 0.33 mov ecx, 0 + 1 1 0.33 test dil, 3 + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov rax, rcx 1 1 1.00 U ret - 1 1 0.33 add rsi, -9 + 1 1 0.33 add rax, -9 1 1 0.33 movabs rcx, -6148914691236517205 - 1 1 0.33 mov rax, rsi 2 4 1.00 mul rcx 1 1 0.50 shr rdx - 1 1 0.33 mov rax, rdi + 1 1 0.33 mov rcx, rdi + 1 1 0.33 mov rax, rcx 1 1 1.00 U ret @@ -52,26 +54,28 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.00 6.00 - 6.00 - - + - - 6.67 6.66 - 6.67 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - - - 
- - xor edx, edx - - - 0.01 0.98 - 0.01 - - mov eax, 0 - - - 0.98 0.01 - 0.01 - - test dil, 3 - - - - - - 1.00 - - je .LBB5_1 + - - 0.66 0.33 - 0.01 - - movabs rax, 9223372036854775804 + - - 0.33 0.67 - - - - and rax, rsi + - - - 1.00 - - - - cmp rax, 9 + - - - - - 1.00 - - jae .LBB5_2 + - - 0.67 - - 0.33 - - mov edx, 1 + - - - - - - - - xor ecx, ecx + - - 0.66 - - 0.34 - - mov rax, rcx - - - - - 1.00 - - ret - - - 0.01 0.99 - - - - movabs rax, 9223372036854775804 - - - - 1.00 - - - - and rsi, rax - - - - 1.00 - - - - cmp rsi, 9 - - - - - - 1.00 - - jae .LBB5_3 - - - 1.00 - - - - - mov edx, 1 - - - - - - - - - xor eax, eax + - - - - - - - - xor edx, edx + - - 0.67 0.33 - - - - mov ecx, 0 + - - - 0.67 - 0.33 - - test dil, 3 + - - - - - 1.00 - - je .LBB5_3 + - - 0.34 0.66 - - - - mov rax, rcx - - - - - 1.00 - - ret - - - 0.02 0.02 - 0.96 - - add rsi, -9 - - - 0.99 0.01 - - - - movabs rcx, -6148914691236517205 - - - 0.01 0.99 - - - - mov rax, rsi + - - - 1.00 - - - - add rax, -9 + - - 1.00 - - - - - movabs rcx, -6148914691236517205 - - 1.00 1.00 - - - - mul rcx - - 1.00 - - - - - shr rdx - - - 0.98 - - 0.02 - - mov rax, rdi + - - 0.33 0.34 - 0.33 - - mov rcx, rdi + - - 0.01 0.66 - 0.33 - - mov rax, rcx - - - - - 1.00 - - ret diff --git a/benches/ref_from_prefix_dynamic_size.x86-64 b/benches/ref_from_prefix_dynamic_size.x86-64 index fe6332c910..e402765c33 100644 --- a/benches/ref_from_prefix_dynamic_size.x86-64 +++ b/benches/ref_from_prefix_dynamic_size.x86-64 @@ -1,14 +1,14 @@ bench_ref_from_prefix_dynamic_size: - xor edx, edx - mov eax, 0 - test dil, 1 - jne .LBB5_4 cmp rsi, 4 - jae .LBB5_3 + jae .LBB5_2 mov edx, 1 xor eax, eax ret -.LBB5_3: +.LBB5_2: + xor edx, edx + mov eax, 0 + test dil, 1 + jne .LBB5_4 add rsi, -4 shr rsi mov rdx, rsi diff --git a/benches/ref_from_prefix_dynamic_size.x86-64.mca b/benches/ref_from_prefix_dynamic_size.x86-64.mca index 3900a59461..ce71749bc4 100644 --- a/benches/ref_from_prefix_dynamic_size.x86-64.mca +++ 
b/benches/ref_from_prefix_dynamic_size.x86-64.mca @@ -1,11 +1,11 @@ Iterations: 100 Instructions: 1400 -Total Cycles: 405 +Total Cycles: 404 Total uOps: 1400 Dispatch Width: 4 -uOps Per Cycle: 3.46 -IPC: 3.46 +uOps Per Cycle: 3.47 +IPC: 3.47 Block RThroughput: 4.0 @@ -18,15 +18,15 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 0 0.25 xor edx, edx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 1 - 1 1 1.00 jne .LBB5_4 1 1 0.33 cmp rsi, 4 - 1 1 1.00 jae .LBB5_3 + 1 1 1.00 jae .LBB5_2 1 1 0.33 mov edx, 1 1 0 0.25 xor eax, eax 1 1 1.00 U ret + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test dil, 1 + 1 1 1.00 jne .LBB5_4 1 1 0.33 add rsi, -4 1 1 0.50 shr rsi 1 1 0.33 mov rdx, rsi @@ -47,21 +47,21 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 3.99 3.99 - 4.02 - - + - - 3.99 4.00 - 4.01 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - - - - - xor edx, edx - - - 0.01 0.98 - 0.01 - - mov eax, 0 - - - 0.98 0.02 - - - - test dil, 1 - - - - - - 1.00 - - jne .LBB5_4 - - - 0.02 0.98 - - - - cmp rsi, 4 - - - - - - 1.00 - - jae .LBB5_3 - - - 0.98 0.01 - 0.01 - - mov edx, 1 + - - 0.99 - - 0.01 - - cmp rsi, 4 + - - - - - 1.00 - - jae .LBB5_2 + - - - 1.00 - - - - mov edx, 1 - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - 0.01 0.99 - - - - add rsi, -4 + - - - - - - - - xor edx, edx + - - 1.00 - - - - - mov eax, 0 + - - - 1.00 - - - - test dil, 1 + - - - - - 1.00 - - jne .LBB5_4 + - - 1.00 - - - - - add rsi, -4 - - 1.00 - - - - - shr rsi - - - 1.00 - - - - mov rdx, rsi - - - 0.99 0.01 - - - - mov rax, rdi + - - - 1.00 - - - - mov rax, rdi - - - - - 1.00 - - ret diff --git a/benches/ref_from_prefix_static_size.x86-64 b/benches/ref_from_prefix_static_size.x86-64 index 7c1bf45bb6..0328ae9719 100644 --- a/benches/ref_from_prefix_static_size.x86-64 +++ b/benches/ref_from_prefix_static_size.x86-64 @@ -1,8 +1,7 @@ 
bench_ref_from_prefix_static_size: xor eax, eax - cmp rsi, 6 - mov rcx, rdi - cmovb rcx, rax test dil, 1 - cmove rax, rcx + cmovne rdi, rax + cmp rsi, 6 + cmovae rax, rdi ret diff --git a/benches/ref_from_prefix_static_size.x86-64.mca b/benches/ref_from_prefix_static_size.x86-64.mca index 9691b88fe0..d4355bc6e8 100644 --- a/benches/ref_from_prefix_static_size.x86-64.mca +++ b/benches/ref_from_prefix_static_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 700 -Total Cycles: 274 -Total uOps: 900 +Instructions: 600 +Total Cycles: 305 +Total uOps: 800 Dispatch Width: 4 -uOps Per Cycle: 3.28 -IPC: 2.55 -Block RThroughput: 2.3 +uOps Per Cycle: 2.62 +IPC: 1.97 +Block RThroughput: 2.0 Instruction Info: @@ -19,11 +19,10 @@ Instruction Info: [1] [2] [3] [4] [5] [6] Instructions: 1 0 0.25 xor eax, eax - 1 1 0.33 cmp rsi, 6 - 1 1 0.33 mov rcx, rdi - 2 2 0.67 cmovb rcx, rax 1 1 0.33 test dil, 1 - 2 2 0.67 cmove rax, rcx + 2 2 0.67 cmovne rdi, rax + 1 1 0.33 cmp rsi, 6 + 2 2 0.67 cmovae rax, rdi 1 1 1.00 U ret @@ -40,14 +39,13 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 2.66 2.67 - 2.67 - - + - - 1.95 2.28 - 2.77 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - - - - xor eax, eax - - - - 0.01 - 0.99 - - cmp rsi, 6 - - - 0.01 0.67 - 0.32 - - mov rcx, rdi - - - 1.00 0.99 - 0.01 - - cmovb rcx, rax - - - 0.66 0.01 - 0.33 - - test dil, 1 - - - 0.99 0.99 - 0.02 - - cmove rax, rcx + - - 0.05 0.06 - 0.89 - - test dil, 1 + - - 0.95 0.94 - 0.11 - - cmovne rdi, rax + - - - 0.34 - 0.66 - - cmp rsi, 6 + - - 0.95 0.94 - 0.11 - - cmovae rax, rdi - - - - - 1.00 - - ret diff --git a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 index 5b31277bde..2552d72393 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 +++ b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64 @@ -1,26 +1,21 @@ 
bench_ref_from_prefix_with_elems_dynamic_padding: - movabs rax, 3074457345618258598 - cmp rdx, rax - ja .LBB5_1 - xor ecx, ecx - mov eax, 0 - test dil, 3 - je .LBB5_3 - mov rdx, rcx - ret -.LBB5_1: + mov rcx, rdx mov edx, 1 - xor eax, eax - ret -.LBB5_3: - lea rax, [rdx + 2*rdx] + movabs rax, 3074457345618258598 + cmp rcx, rax + ja .LBB5_3 + lea rax, [rcx + 2*rcx] or rax, 3 add rax, 9 - xor r8d, r8d cmp rax, rsi - mov ecx, 1 - cmovbe rcx, rdx - cmova rdi, r8 - mov rax, rdi + jbe .LBB5_4 +.LBB5_3: + xor eax, eax + ret +.LBB5_4: + xor eax, eax + test dil, 3 + cmovne rcx, rax + cmove rax, rdi mov rdx, rcx ret diff --git a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca index 2f212ec6d0..d69beeedc4 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/ref_from_prefix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2300 -Total Cycles: 807 -Total uOps: 2700 +Instructions: 1800 +Total Cycles: 605 +Total uOps: 2000 Dispatch Width: 4 -uOps Per Cycle: 3.35 -IPC: 2.85 -Block RThroughput: 6.8 +uOps Per Cycle: 3.31 +IPC: 2.98 +Block RThroughput: 5.0 Instruction Info: @@ -18,27 +18,22 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 3074457345618258598 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 - 1 0 0.25 xor ecx, ecx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 3 - 1 1 1.00 je .LBB5_3 - 1 1 0.33 mov rdx, rcx - 1 1 1.00 U ret + 1 1 0.33 mov rcx, rdx 1 1 0.33 mov edx, 1 - 1 0 0.25 xor eax, eax - 1 1 1.00 U ret - 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 movabs rax, 3074457345618258598 + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_3 + 1 1 0.50 lea rax, [rcx + 2*rcx] 1 1 0.33 or rax, 3 1 1 0.33 add rax, 9 - 1 0 0.25 xor r8d, r8d 1 1 0.33 cmp rax, rsi - 1 1 0.33 mov ecx, 1 - 3 3 1.00 cmovbe rcx, rdx - 3 3 1.00 cmova rdi, r8 - 1 1 0.33 mov rax, rdi + 1 1 1.00 jbe .LBB5_4 + 1 0 
0.25 xor eax, eax + 1 1 1.00 U ret + 1 0 0.25 xor eax, eax + 1 1 0.33 test dil, 3 + 2 2 0.67 cmovne rcx, rax + 2 2 0.67 cmove rax, rdi 1 1 0.33 mov rdx, rcx 1 1 1.00 U ret @@ -56,30 +51,25 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 7.99 7.99 - 8.02 - - + - - 5.99 5.99 - 6.02 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.47 0.52 - 0.01 - - movabs rax, 3074457345618258598 - - - 0.94 0.01 - 0.05 - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - - - - - - - xor ecx, ecx - - - 0.03 0.97 - - - - mov eax, 0 - - - 0.01 0.52 - 0.47 - - test dil, 3 - - - - - - 1.00 - - je .LBB5_3 - - - 0.03 0.51 - 0.46 - - mov rdx, rcx - - - - - - 1.00 - - ret - - - 0.04 0.96 - - - - mov edx, 1 + - - 0.98 0.01 - 0.01 - - mov rcx, rdx + - - 0.01 0.99 - - - - mov edx, 1 + - - 0.02 0.98 - - - - movabs rax, 3074457345618258598 + - - 0.98 0.01 - 0.01 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_3 + - - 0.01 0.99 - - - - lea rax, [rcx + 2*rcx] + - - 0.99 0.01 - - - - or rax, 3 + - - 0.99 0.01 - - - - add rax, 9 + - - 0.99 0.01 - - - - cmp rax, rsi + - - - - - 1.00 - - jbe .LBB5_4 - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - 0.01 0.99 - - - - lea rax, [rdx + 2*rdx] - - - 0.52 0.48 - - - - or rax, 3 - - - 0.51 0.49 - - - - add rax, 9 - - - - - - - - - xor r8d, r8d - - - 0.97 0.03 - - - - cmp rax, rsi - - - 0.01 0.99 - - - - mov ecx, 1 - - - 1.04 0.97 - 0.99 - - cmovbe rcx, rdx - - - 1.44 0.54 - 1.02 - - cmova rdi, r8 - - - 0.97 0.01 - 0.02 - - mov rax, rdi - - - 1.00 - - - - - mov rdx, rcx + - - - - - - - - xor eax, eax + - - 0.01 0.98 - 0.01 - - test dil, 3 + - - 0.01 0.99 - 1.00 - - cmovne rcx, rax + - - 0.01 1.00 - 0.99 - - cmove rax, rdi + - - 0.99 0.01 - - - - mov rdx, rcx - - - - - 1.00 - - ret diff --git a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 index 069fd4859c..1d6a8e334b 100644 --- 
a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 +++ b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64 @@ -1,22 +1,19 @@ bench_ref_from_prefix_with_elems_dynamic_size: - movabs rax, 4611686018427387901 - cmp rdx, rax - ja .LBB5_1 mov rcx, rdx - xor edx, edx - mov eax, 0 - test dil, 1 - jne .LBB5_4 + mov edx, 1 + movabs rax, 4611686018427387901 + cmp rcx, rax + ja .LBB5_3 lea rax, [2*rcx + 4] - xor r8d, r8d cmp rax, rsi - mov edx, 1 - cmovbe rdx, rcx - cmova rdi, r8 - mov rax, rdi -.LBB5_4: + jbe .LBB5_4 +.LBB5_3: + xor eax, eax ret -.LBB5_1: - mov edx, 1 +.LBB5_4: xor eax, eax + test dil, 1 + cmovne rcx, rax + cmove rax, rdi + mov rdx, rcx ret diff --git a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca index 6f22726406..da9883ddde 100644 --- a/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca +++ b/benches/ref_from_prefix_with_elems_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1900 -Total Cycles: 672 -Total uOps: 2300 +Instructions: 1600 +Total Cycles: 603 +Total uOps: 1800 Dispatch Width: 4 -uOps Per Cycle: 3.42 -IPC: 2.83 -Block RThroughput: 5.8 +uOps Per Cycle: 2.99 +IPC: 2.65 +Block RThroughput: 4.5 Instruction Info: @@ -18,24 +18,21 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 4611686018427387901 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 1 1 0.33 mov rcx, rdx - 1 0 0.25 xor edx, edx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 1 - 1 1 1.00 jne .LBB5_4 + 1 1 0.33 mov edx, 1 + 1 1 0.33 movabs rax, 4611686018427387901 + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_3 1 1 0.50 lea rax, [2*rcx + 4] - 1 0 0.25 xor r8d, r8d 1 1 0.33 cmp rax, rsi - 1 1 0.33 mov edx, 1 - 3 3 1.00 cmovbe rdx, rcx - 3 3 1.00 cmova rdi, r8 - 1 1 0.33 mov rax, rdi + 1 1 1.00 jbe .LBB5_4 + 1 0 0.25 xor eax, eax 1 1 1.00 U ret - 1 1 0.33 mov edx, 1 1 0 0.25 xor eax, eax + 1 1 0.33 test dil, 1 + 2 2 0.67 cmovne 
rcx, rax + 2 2 0.67 cmove rax, rdi + 1 1 0.33 mov rdx, rcx 1 1 1.00 U ret @@ -52,26 +49,23 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.66 6.66 - 6.68 - - + - - 5.33 5.33 - 5.34 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 4611686018427387901 - - - 0.37 0.63 - - - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - 0.63 0.37 - - - - mov rcx, rdx - - - - - - - - - xor edx, edx - - - 0.01 0.98 - 0.01 - - mov eax, 0 - - - 0.98 0.02 - - - - test dil, 1 - - - - - - 1.00 - - jne .LBB5_4 - - - 0.01 0.99 - - - - lea rax, [2*rcx + 4] - - - - - - - - - xor r8d, r8d - - - 1.00 - - - - - cmp rax, rsi - - - - 0.67 - 0.33 - - mov edx, 1 - - - 0.73 0.98 - 1.29 - - cmovbe rdx, rcx - - - 1.60 0.36 - 1.04 - - cmova rdi, r8 - - - 0.99 0.01 - - - - mov rax, rdi + - - 0.48 0.45 - 0.07 - - mov rcx, rdx + - - 0.45 0.49 - 0.06 - - mov edx, 1 + - - 0.18 0.25 - 0.57 - - movabs rax, 4611686018427387901 + - - 0.24 0.51 - 0.25 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_3 + - - 0.52 0.48 - - - - lea rax, [2*rcx + 4] + - - 0.47 0.53 - - - - cmp rax, rsi + - - - - - 1.00 - - jbe .LBB5_4 + - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - 0.34 0.66 - - - - mov edx, 1 - - - - - - - - xor eax, eax + - - 0.47 0.50 - 0.03 - - test dil, 1 + - - 1.00 1.00 - - - - cmovne rcx, rax + - - 0.99 0.66 - 0.35 - - cmove rax, rdi + - - 0.53 0.46 - 0.01 - - mov rdx, rcx - - - - - 1.00 - - ret diff --git a/benches/ref_from_suffix_dynamic_padding.x86-64 b/benches/ref_from_suffix_dynamic_padding.x86-64 index 3e05f6023f..9da52dcae0 100644 --- a/benches/ref_from_suffix_dynamic_padding.x86-64 +++ b/benches/ref_from_suffix_dynamic_padding.x86-64 @@ -1,11 +1,11 @@ bench_ref_from_suffix_dynamic_padding: - lea eax, [rsi + rdi] - test al, 3 - jne .LBB5_1 movabs rax, 9223372036854775804 and rax, rsi cmp rax, 9 - jae .LBB5_3 + jb .LBB5_1 + lea ecx, [rsi + rdi] + test cl, 3 + je .LBB5_3 
.LBB5_1: xor eax, eax ret diff --git a/benches/ref_from_suffix_dynamic_padding.x86-64.mca b/benches/ref_from_suffix_dynamic_padding.x86-64.mca index 73599d5b6a..929873f5e7 100644 --- a/benches/ref_from_suffix_dynamic_padding.x86-64.mca +++ b/benches/ref_from_suffix_dynamic_padding.x86-64.mca @@ -1,10 +1,10 @@ Iterations: 100 Instructions: 2000 -Total Cycles: 682 +Total Cycles: 683 Total uOps: 2100 Dispatch Width: 4 -uOps Per Cycle: 3.08 +uOps Per Cycle: 3.07 IPC: 2.93 Block RThroughput: 5.3 @@ -18,13 +18,13 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.50 lea eax, [rsi + rdi] - 1 1 0.33 test al, 3 - 1 1 1.00 jne .LBB5_1 1 1 0.33 movabs rax, 9223372036854775804 1 1 0.33 and rax, rsi 1 1 0.33 cmp rax, 9 - 1 1 1.00 jae .LBB5_3 + 1 1 1.00 jb .LBB5_1 + 1 1 0.50 lea ecx, [rsi + rdi] + 1 1 0.33 test cl, 3 + 1 1 1.00 je .LBB5_3 1 0 0.25 xor eax, eax 1 1 1.00 U ret 1 1 0.33 add rax, -9 @@ -53,27 +53,27 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.65 6.67 - 6.68 - - + - - 6.67 6.65 - 6.68 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.90 0.10 - - - - lea eax, [rsi + rdi] - - - 0.93 - - 0.07 - - test al, 3 - - - - - - 1.00 - - jne .LBB5_1 - - - 0.51 0.47 - 0.02 - - movabs rax, 9223372036854775804 - - - - - - 1.00 - - and rax, rsi - - - - 0.09 - 0.91 - - cmp rax, 9 - - - - - - 1.00 - - jae .LBB5_3 + - - 0.05 0.32 - 0.63 - - movabs rax, 9223372036854775804 + - - 0.63 0.03 - 0.34 - - and rax, rsi + - - 0.94 0.03 - 0.03 - - cmp rax, 9 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.05 0.95 - - - - lea ecx, [rsi + rdi] + - - 0.03 0.97 - - - - test cl, 3 + - - - - - 1.00 - - je .LBB5_3 - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - 0.43 0.47 - 0.10 - - add rax, -9 - - - 0.42 0.39 - 0.19 - - movabs rcx, -6148914691236517205 + - - 0.35 0.35 - 0.30 - - add rax, -9 + - - 0.95 0.04 - 0.01 - - movabs rcx, -6148914691236517205 - - 1.00 1.00 - - - - 
mul rcx - - 0.69 - - 0.31 - - shr rdx - - - 0.54 0.46 - - - - lea rax, [rdx + 2*rdx] - - - 0.07 0.91 - 0.02 - - sub rsi, rax - - - 0.91 0.05 - 0.04 - - or rax, -4 - - - 0.08 0.90 - 0.02 - - add rsi, rdi - - - 0.09 0.91 - - - - add rax, rsi - - - 0.08 0.92 - - - - add rax, -8 + - - 0.65 0.35 - - - - lea rax, [rdx + 2*rdx] + - - 0.30 0.35 - 0.35 - - sub rsi, rax + - - 0.66 0.02 - 0.32 - - or rax, -4 + - - 0.02 0.64 - 0.34 - - add rsi, rdi + - - 0.33 0.65 - 0.02 - - add rax, rsi + - - 0.02 0.95 - 0.03 - - add rax, -8 - - - - - 1.00 - - ret diff --git a/benches/ref_from_suffix_dynamic_size.x86-64 b/benches/ref_from_suffix_dynamic_size.x86-64 index bd4ace8983..13fdcf8624 100644 --- a/benches/ref_from_suffix_dynamic_size.x86-64 +++ b/benches/ref_from_suffix_dynamic_size.x86-64 @@ -1,13 +1,15 @@ bench_ref_from_suffix_dynamic_size: - mov rdx, rsi + cmp rsi, 4 + jb .LBB5_1 + mov rax, rdi lea ecx, [rsi + rdi] - mov eax, edx - and eax, 1 - add rax, rdi - xor esi, esi - sub rdx, 4 - cmovb rax, rsi - shr rdx test cl, 1 - cmovne rax, rsi + jne .LBB5_1 + lea rdx, [rsi - 4] + shr rdx + and esi, 1 + add rax, rsi + ret +.LBB5_1: + xor eax, eax ret diff --git a/benches/ref_from_suffix_dynamic_size.x86-64.mca b/benches/ref_from_suffix_dynamic_size.x86-64.mca index 1398bcfe27..949b83310c 100644 --- a/benches/ref_from_suffix_dynamic_size.x86-64.mca +++ b/benches/ref_from_suffix_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1200 -Total Cycles: 439 -Total uOps: 1400 +Instructions: 1300 +Total Cycles: 405 +Total uOps: 1300 Dispatch Width: 4 -uOps Per Cycle: 3.19 -IPC: 2.73 -Block RThroughput: 3.5 +uOps Per Cycle: 3.21 +IPC: 3.21 +Block RThroughput: 4.0 Instruction Info: @@ -18,17 +18,18 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rdx, rsi + 1 1 0.33 cmp rsi, 4 + 1 1 1.00 jb .LBB5_1 + 1 1 0.33 mov rax, rdi 1 1 0.50 lea ecx, [rsi + rdi] - 1 1 0.33 mov eax, edx - 1 1 0.33 and eax, 1 - 1 1 0.33 add rax, rdi - 1 0 
0.25 xor esi, esi - 1 1 0.33 sub rdx, 4 - 2 2 0.67 cmovb rax, rsi - 1 1 0.50 shr rdx 1 1 0.33 test cl, 1 - 2 2 0.67 cmovne rax, rsi + 1 1 1.00 jne .LBB5_1 + 1 1 0.50 lea rdx, [rsi - 4] + 1 1 0.50 shr rdx + 1 1 0.33 and esi, 1 + 1 1 0.33 add rax, rsi + 1 1 1.00 U ret + 1 0 0.25 xor eax, eax 1 1 1.00 U ret @@ -45,19 +46,20 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 4.33 4.33 - 4.34 - - + - - 3.99 3.99 - 4.02 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.02 0.32 - 0.66 - - mov rdx, rsi - - - 0.32 0.68 - - - - lea ecx, [rsi + rdi] - - - 0.66 - - 0.34 - - mov eax, edx - - - 0.02 0.33 - 0.65 - - and eax, 1 - - - - 0.99 - 0.01 - - add rax, rdi - - - - - - - - - xor esi, esi - - - 0.65 - - 0.35 - - sub rdx, 4 - - - 1.00 1.00 - - - - cmovb rax, rsi - - - 0.66 - - 0.34 - - shr rdx - - - - 0.01 - 0.99 - - test cl, 1 - - - 1.00 1.00 - - - - cmovne rax, rsi + - - 0.02 0.97 - 0.01 - - cmp rsi, 4 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.97 0.03 - - - - mov rax, rdi + - - 0.01 0.99 - - - - lea ecx, [rsi + rdi] + - - 0.98 0.02 - - - - test cl, 1 + - - - - - 1.00 - - jne .LBB5_1 + - - 0.97 0.03 - - - - lea rdx, [rsi - 4] + - - 1.00 - - - - - shr rdx + - - 0.02 0.98 - - - - and esi, 1 + - - 0.02 0.97 - 0.01 - - add rax, rsi + - - - - - 1.00 - - ret + - - - - - - - - xor eax, eax - - - - - 1.00 - - ret diff --git a/benches/ref_from_suffix_static_size.x86-64 b/benches/ref_from_suffix_static_size.x86-64 index 9e90b9e254..4f003e061d 100644 --- a/benches/ref_from_suffix_static_size.x86-64 +++ b/benches/ref_from_suffix_static_size.x86-64 @@ -1,13 +1,12 @@ bench_ref_from_suffix_static_size: - lea eax, [rsi + rdi] cmp rsi, 6 - setb cl - or cl, al - test cl, 1 - je .LBB5_2 - xor eax, eax - ret -.LBB5_2: + jb .LBB5_1 + lea eax, [rsi + rdi] + test al, 1 + jne .LBB5_1 lea rax, [rdi + rsi] add rax, -6 ret +.LBB5_1: + xor eax, eax + ret diff --git a/benches/ref_from_suffix_static_size.x86-64.mca 
b/benches/ref_from_suffix_static_size.x86-64.mca index ef5892647b..70da98d6db 100644 --- a/benches/ref_from_suffix_static_size.x86-64.mca +++ b/benches/ref_from_suffix_static_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1100 -Total Cycles: 338 -Total uOps: 1100 +Instructions: 1000 +Total Cycles: 404 +Total uOps: 1000 Dispatch Width: 4 -uOps Per Cycle: 3.25 -IPC: 3.25 -Block RThroughput: 3.0 +uOps Per Cycle: 2.48 +IPC: 2.48 +Block RThroughput: 4.0 Instruction Info: @@ -18,17 +18,16 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.50 lea eax, [rsi + rdi] 1 1 0.33 cmp rsi, 6 - 1 1 0.50 setb cl - 1 1 0.33 or cl, al - 1 1 0.33 test cl, 1 - 1 1 1.00 je .LBB5_2 - 1 0 0.25 xor eax, eax - 1 1 1.00 U ret + 1 1 1.00 jb .LBB5_1 + 1 1 0.50 lea eax, [rsi + rdi] + 1 1 0.33 test al, 1 + 1 1 1.00 jne .LBB5_1 1 1 0.50 lea rax, [rdi + rsi] 1 1 0.33 add rax, -6 1 1 1.00 U ret + 1 0 0.25 xor eax, eax + 1 1 1.00 U ret Resources: @@ -44,18 +43,17 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 3.32 3.33 - 3.35 - - + - - 2.49 2.50 - 4.01 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.97 0.03 - - - - lea eax, [rsi + rdi] - - - 0.33 0.32 - 0.35 - - cmp rsi, 6 - - - 1.00 - - - - - setb cl - - - - 1.00 - - - - or cl, al - - - - 1.00 - - - - test cl, 1 - - - - - - 1.00 - - je .LBB5_2 - - - - - - - - - xor eax, eax - - - - - - 1.00 - - ret + - - 0.49 0.50 - 0.01 - - cmp rsi, 6 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.50 0.50 - - - - lea eax, [rsi + rdi] + - - 0.66 0.34 - - - - test al, 1 + - - - - - 1.00 - - jne .LBB5_1 - - 0.34 0.66 - - - - lea rax, [rdi + rsi] - - - 0.68 0.32 - - - - add rax, -6 + - - 0.50 0.50 - - - - add rax, -6 + - - - - - 1.00 - - ret + - - - - - - - - xor eax, eax - - - - - 1.00 - - ret diff --git a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 index 
c3d10b5fc6..e1844f6b1e 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 +++ b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64 @@ -1,27 +1,26 @@ bench_ref_from_suffix_with_elems_dynamic_padding: + mov rcx, rdx + mov edx, 1 movabs rax, 3074457345618258598 - cmp rdx, rax - ja .LBB5_1 - lea r8d, [rsi + rdi] - xor ecx, ecx - mov eax, 0 - test r8b, 3 - je .LBB5_3 - mov rdx, rcx - ret -.LBB5_3: - lea rax, [rdx + 2*rdx] + cmp rcx, rax + ja .LBB5_4 + lea rax, [rcx + 2*rcx] or rax, 3 add rax, 9 - sub rsi, rax - jae .LBB5_4 -.LBB5_1: + mov r8, rsi + sub r8, rax + jae .LBB5_2 +.LBB5_4: xor eax, eax - mov edx, 1 +.LBB5_5: ret -.LBB5_4: - add rdi, rsi - mov rcx, rdx - mov rax, rdi +.LBB5_2: + add esi, edi + xor edx, edx + mov eax, 0 + test sil, 3 + jne .LBB5_5 + add rdi, r8 mov rdx, rcx + mov rax, rdi ret diff --git a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca index 92e6280bb4..6cde05d596 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/ref_from_suffix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2300 -Total Cycles: 706 -Total uOps: 2300 +Instructions: 2200 +Total Cycles: 671 +Total uOps: 2200 Dispatch Width: 4 -uOps Per Cycle: 3.26 -IPC: 3.26 -Block RThroughput: 6.0 +uOps Per Cycle: 3.28 +IPC: 3.28 +Block RThroughput: 5.5 Instruction Info: @@ -18,28 +18,27 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.33 mov rcx, rdx + 1 1 0.33 mov edx, 1 1 1 0.33 movabs rax, 3074457345618258598 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea r8d, [rsi + rdi] - 1 0 0.25 xor ecx, ecx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test r8b, 3 - 1 1 1.00 je .LBB5_3 - 1 1 0.33 mov rdx, rcx - 1 1 1.00 U ret - 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_4 + 1 1 0.50 lea rax, [rcx + 2*rcx] 1 1 0.33 or rax, 3 1 1 0.33 add rax, 9 - 1 1 
0.33 sub rsi, rax - 1 1 1.00 jae .LBB5_4 + 1 1 0.33 mov r8, rsi + 1 1 0.33 sub r8, rax + 1 1 1.00 jae .LBB5_2 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 1 1 1.00 U ret - 1 1 0.33 add rdi, rsi - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov rax, rdi + 1 1 0.33 add esi, edi + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test sil, 3 + 1 1 1.00 jne .LBB5_5 + 1 1 0.33 add rdi, r8 1 1 0.33 mov rdx, rcx + 1 1 0.33 mov rax, rdi 1 1 1.00 U ret @@ -56,30 +55,29 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.99 7.00 - 7.01 - - + - - 6.66 6.66 - 6.68 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 3074457345618258598 - - - 0.01 0.50 - 0.49 - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - - 1.00 - - - - lea r8d, [rsi + rdi] - - - - - - - - - xor ecx, ecx - - - 0.50 0.49 - 0.01 - - mov eax, 0 - - - 0.49 0.51 - - - - test r8b, 3 - - - - - - 1.00 - - je .LBB5_3 - - - 0.51 0.49 - - - - mov rdx, rcx - - - - - - 1.00 - - ret - - - 0.50 0.50 - - - - lea rax, [rdx + 2*rdx] - - - 1.00 - - - - - or rax, 3 + - - 0.66 0.33 - 0.01 - - mov rcx, rdx + - - 0.33 0.67 - - - - mov edx, 1 + - - 0.67 0.33 - - - - movabs rax, 3074457345618258598 + - - 0.99 - - 0.01 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_4 + - - - 1.00 - - - - lea rax, [rcx + 2*rcx] + - - 0.34 - - 0.66 - - or rax, 3 - - 1.00 - - - - - add rax, 9 - - - 0.99 0.01 - - - - sub rsi, rax - - - - - - 1.00 - - jae .LBB5_4 + - - - 0.34 - 0.66 - - mov r8, rsi + - - 1.00 - - - - - sub r8, rax + - - - - - 1.00 - - jae .LBB5_2 - - - - - - - - xor eax, eax - - - - 1.00 - - - - mov edx, 1 - - - - - 1.00 - - ret - - - 1.00 - - - - - add rdi, rsi - - - - 1.00 - - - - mov rcx, rdx - - - 0.99 0.01 - - - - mov rax, rdi - - - - 0.50 - 0.50 - - mov rdx, rcx + - - - 1.00 - - - - add esi, edi + - - - - - - - - xor edx, edx + - - - 0.99 - 0.01 - - mov eax, 0 + - - 0.33 0.34 - 0.33 - - test sil, 3 + - - - - - 1.00 - - jne 
.LBB5_5 + - - 0.67 0.33 - - - - add rdi, r8 + - - 0.33 0.67 - - - - mov rdx, rcx + - - 0.34 0.66 - - - - mov rax, rdi - - - - - 1.00 - - ret diff --git a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 index bdca571924..b9414b2d4c 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 +++ b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64 @@ -1,23 +1,24 @@ bench_ref_from_suffix_with_elems_dynamic_size: + mov rcx, rdx + mov edx, 1 movabs rax, 4611686018427387901 - cmp rdx, rax - ja .LBB5_1 - lea r8d, [rsi + rdi] - xor ecx, ecx - mov eax, 0 - test r8b, 1 - jne .LBB5_5 - lea rax, [2*rdx + 4] - sub rsi, rax + cmp rcx, rax + ja .LBB5_3 + lea rax, [2*rcx + 4] + mov r8, rsi + sub r8, rax jae .LBB5_4 -.LBB5_1: +.LBB5_3: xor eax, eax - mov edx, 1 ret .LBB5_4: - add rdi, rsi - mov rcx, rdx - mov rax, rdi -.LBB5_5: + add esi, edi + xor edx, edx + mov eax, 0 + test sil, 1 + jne .LBB5_6 + add rdi, r8 mov rdx, rcx + mov rax, rdi +.LBB5_6: ret diff --git a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca index 6d9de0b3eb..46ce6b7d5e 100644 --- a/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca +++ b/benches/ref_from_suffix_with_elems_dynamic_size.x86-64.mca @@ -1,11 +1,11 @@ Iterations: 100 -Instructions: 1900 -Total Cycles: 571 -Total uOps: 1900 +Instructions: 2000 +Total Cycles: 604 +Total uOps: 2000 Dispatch Width: 4 -uOps Per Cycle: 3.33 -IPC: 3.33 +uOps Per Cycle: 3.31 +IPC: 3.31 Block RThroughput: 5.0 @@ -18,24 +18,25 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.33 mov rcx, rdx + 1 1 0.33 mov edx, 1 1 1 0.33 movabs rax, 4611686018427387901 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea r8d, [rsi + rdi] - 1 0 0.25 xor ecx, ecx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test r8b, 1 - 1 1 1.00 jne .LBB5_5 - 1 1 0.50 lea rax, [2*rdx + 4] - 1 1 0.33 sub rsi, rax + 1 1 
0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_3 + 1 1 0.50 lea rax, [2*rcx + 4] + 1 1 0.33 mov r8, rsi + 1 1 0.33 sub r8, rax 1 1 1.00 jae .LBB5_4 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 1 1 1.00 U ret - 1 1 0.33 add rdi, rsi - 1 1 0.33 mov rcx, rdx - 1 1 0.33 mov rax, rdi + 1 1 0.33 add esi, edi + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test sil, 1 + 1 1 1.00 jne .LBB5_6 + 1 1 0.33 add rdi, r8 1 1 0.33 mov rdx, rcx + 1 1 0.33 mov rax, rdi 1 1 1.00 U ret @@ -52,26 +53,27 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 5.66 5.66 - 5.68 - - + - - 5.99 6.00 - 6.01 - - Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.66 0.33 - 0.01 - - movabs rax, 4611686018427387901 - - - 0.01 0.99 - - - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - 0.99 0.01 - - - - lea r8d, [rsi + rdi] - - - - - - - - - xor ecx, ecx - - - 0.33 0.33 - 0.34 - - mov eax, 0 - - - 0.33 0.34 - 0.33 - - test r8b, 1 - - - - - - 1.00 - - jne .LBB5_5 - - - 0.34 0.66 - - - - lea rax, [2*rdx + 4] - - - - 1.00 - - - - sub rsi, rax + - - 0.04 0.95 - 0.01 - - mov rcx, rdx + - - - 1.00 - - - - mov edx, 1 + - - 1.00 - - - - - movabs rax, 4611686018427387901 + - - - - - 1.00 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_3 + - - - 1.00 - - - - lea rax, [2*rcx + 4] + - - 1.00 - - - - - mov r8, rsi + - - - 1.00 - - - - sub r8, rax - - - - - 1.00 - - jae .LBB5_4 - - - - - - - - xor eax, eax - - - 1.00 - - - - - mov edx, 1 - - - - - 1.00 - - ret - - - - 1.00 - - - - add rdi, rsi - - - 1.00 - - - - - mov rcx, rdx - - - 0.32 0.68 - - - - mov rax, rdi - - - 0.68 0.32 - - - - mov rdx, rcx + - - 1.00 - - - - - add esi, edi + - - - - - - - - xor edx, edx + - - - 1.00 - - - - mov eax, 0 + - - 1.00 - - - - - test sil, 1 + - - - - - 1.00 - - jne .LBB5_6 + - - - 1.00 - - - - add rdi, r8 + - - 1.00 - - - - - mov rdx, rcx + - - 0.95 0.05 - - - - mov rax, rdi - - - - - 1.00 - - ret diff --git 
a/benches/try_ref_from_bytes_dynamic_padding.x86-64 b/benches/try_ref_from_bytes_dynamic_padding.x86-64 index 217c5fc617..5b6bc5b189 100644 --- a/benches/try_ref_from_bytes_dynamic_padding.x86-64 +++ b/benches/try_ref_from_bytes_dynamic_padding.x86-64 @@ -1,24 +1,27 @@ bench_try_ref_from_bytes_dynamic_padding: - test dil, 3 - jne .LBB5_4 movabs rax, 9223372036854775804 and rax, rsi cmp rax, 9 - jb .LBB5_4 + jb .LBB5_1 add rax, -9 movabs rcx, -6148914691236517205 mul rcx + test dil, 3 + jne .LBB5_1 shr rdx lea rax, [rdx + 2*rdx] or rax, 3 add rax, 9 cmp rsi, rax - jne .LBB5_4 - cmp word ptr [rdi], -16192 - je .LBB5_5 -.LBB5_4: - xor edi, edi + jne .LBB5_1 + movzx ecx, word ptr [rdi] + xor eax, eax + cmp ecx, 49344 + cmove rsi, rdx + cmove rax, rdi + mov rdx, rsi + ret +.LBB5_1: + xor eax, eax mov rdx, rsi -.LBB5_5: - mov rax, rdi ret diff --git a/benches/try_ref_from_bytes_dynamic_padding.x86-64.mca b/benches/try_ref_from_bytes_dynamic_padding.x86-64.mca index 95b993c7e0..ccc679bdcf 100644 --- a/benches/try_ref_from_bytes_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_bytes_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2100 -Total Cycles: 709 -Total uOps: 2300 +Instructions: 2500 +Total Cycles: 1008 +Total uOps: 2800 Dispatch Width: 4 -uOps Per Cycle: 3.24 -IPC: 2.96 -Block RThroughput: 5.8 +uOps Per Cycle: 2.78 +IPC: 2.48 +Block RThroughput: 7.0 Instruction Info: @@ -18,26 +18,30 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 test dil, 3 - 1 1 1.00 jne .LBB5_4 1 1 0.33 movabs rax, 9223372036854775804 1 1 0.33 and rax, rsi 1 1 0.33 cmp rax, 9 - 1 1 1.00 jb .LBB5_4 + 1 1 1.00 jb .LBB5_1 1 1 0.33 add rax, -9 1 1 0.33 movabs rcx, -6148914691236517205 2 4 1.00 mul rcx + 1 1 0.33 test dil, 3 + 1 1 1.00 jne .LBB5_1 1 1 0.50 shr rdx 1 1 0.50 lea rax, [rdx + 2*rdx] 1 1 0.33 or rax, 3 1 1 0.33 add rax, 9 1 1 0.33 cmp rsi, rax - 1 1 1.00 jne .LBB5_4 - 2 6 0.50 * cmp word ptr [rdi], -16192 - 1 
1 1.00 je .LBB5_5 - 1 0 0.25 xor edi, edi + 1 1 1.00 jne .LBB5_1 + 1 5 0.50 * movzx ecx, word ptr [rdi] + 1 0 0.25 xor eax, eax + 1 1 0.33 cmp ecx, 49344 + 2 2 0.67 cmove rsi, rdx + 2 2 0.67 cmove rax, rdi + 1 1 0.33 mov rdx, rsi + 1 1 1.00 U ret + 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rsi - 1 1 0.33 mov rax, rdi 1 1 1.00 U ret @@ -54,28 +58,32 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.98 6.99 - 7.03 0.50 0.50 + - - 8.02 8.01 - 8.97 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.48 0.51 - 0.01 - - test dil, 3 - - - - - - 1.00 - - jne .LBB5_4 - - - 0.51 0.49 - - - - movabs rax, 9223372036854775804 - - - 0.01 0.99 - - - - and rax, rsi - - - 0.51 0.49 - - - - cmp rax, 9 - - - - - - 1.00 - - jb .LBB5_4 - - - 0.98 - - 0.02 - - add rax, -9 - - - 0.98 0.02 - - - - movabs rcx, -6148914691236517205 + - - 0.07 0.04 - 0.89 - - movabs rax, 9223372036854775804 + - - 0.97 0.01 - 0.02 - - and rax, rsi + - - - 0.99 - 0.01 - - cmp rax, 9 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.99 0.01 - - - - add rax, -9 + - - 0.02 0.06 - 0.92 - - movabs rcx, -6148914691236517205 - - 1.00 1.00 - - - - mul rcx - - - 0.99 - - 0.01 - - shr rdx - - - - 1.00 - - - - lea rax, [rdx + 2*rdx] - - - - 0.51 - 0.49 - - or rax, 3 - - - 0.01 0.49 - 0.50 - - add rax, 9 - - - - 0.02 - 0.98 - - cmp rsi, rax - - - - - - 1.00 - - jne .LBB5_4 - - - 0.51 0.49 - - 0.50 0.50 cmp word ptr [rdi], -16192 - - - - - - 1.00 - - je .LBB5_5 - - - - - - - - - xor edi, edi - - - 0.50 0.50 - - - - mov rdx, rsi - - - 0.50 0.48 - 0.02 - - mov rax, rdi + - - 0.07 0.91 - 0.02 - - test dil, 3 + - - - - - 1.00 - - jne .LBB5_1 + - - 0.97 - - 0.03 - - shr rdx + - - 0.03 0.97 - - - - lea rax, [rdx + 2*rdx] + - - 0.02 0.95 - 0.03 - - or rax, 3 + - - 0.03 0.96 - 0.01 - - add rax, 9 + - - 0.01 0.98 - 0.01 - - cmp rsi, rax + - - - - - 1.00 - - jne .LBB5_1 + - - - - - - 0.50 0.50 movzx ecx, word ptr [rdi] + - - - - - - - - xor eax, eax + - - 
0.91 0.06 - 0.03 - - cmp ecx, 49344 + - - 0.97 0.04 - 0.99 - - cmove rsi, rdx + - - 0.99 0.99 - 0.02 - - cmove rax, rdi + - - 0.96 0.03 - 0.01 - - mov rdx, rsi + - - - - - 1.00 - - ret + - - - - - - - - xor eax, eax + - - 0.01 0.01 - 0.98 - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_bytes_dynamic_size.x86-64 b/benches/try_ref_from_bytes_dynamic_size.x86-64 index cf67afd31c..15d08c143c 100644 --- a/benches/try_ref_from_bytes_dynamic_size.x86-64 +++ b/benches/try_ref_from_bytes_dynamic_size.x86-64 @@ -1,22 +1,22 @@ bench_try_ref_from_bytes_dynamic_size: - mov rdx, rsi - mov rax, rdi cmp rsi, 4 - setb cl - or cl, al - test cl, 1 - jne .LBB5_4 - lea rcx, [rdx - 4] - mov rsi, rcx - and rsi, -2 - add rsi, 4 - cmp rdx, rsi - jne .LBB5_4 - cmp word ptr [rax], -16192 - jne .LBB5_4 + jb .LBB5_1 + test dil, 1 + jne .LBB5_1 + mov rdx, rsi + lea rcx, [rsi - 4] + mov rax, rcx + and rax, -2 + add rax, 4 + cmp rsi, rax + jne .LBB5_1 shr rcx - mov rdx, rcx + movzx esi, word ptr [rdi] + xor eax, eax + cmp esi, 49344 + cmove rdx, rcx + cmove rax, rdi ret -.LBB5_4: +.LBB5_1: xor eax, eax ret diff --git a/benches/try_ref_from_bytes_dynamic_size.x86-64.mca b/benches/try_ref_from_bytes_dynamic_size.x86-64.mca index ecd7a18f6d..99a6a1d9f3 100644 --- a/benches/try_ref_from_bytes_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_bytes_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 Instructions: 2000 -Total Cycles: 639 -Total uOps: 2100 +Total Cycles: 641 +Total uOps: 2200 Dispatch Width: 4 -uOps Per Cycle: 3.29 -IPC: 3.13 -Block RThroughput: 5.3 +uOps Per Cycle: 3.43 +IPC: 3.12 +Block RThroughput: 5.5 Instruction Info: @@ -18,23 +18,23 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 mov rdx, rsi - 1 1 0.33 mov rax, rdi 1 1 0.33 cmp rsi, 4 - 1 1 0.50 setb cl - 1 1 0.33 or cl, al - 1 1 0.33 test cl, 1 - 1 1 1.00 jne .LBB5_4 - 1 1 0.50 lea rcx, [rdx - 4] - 1 1 0.33 mov rsi, rcx - 1 1 0.33 and rsi, -2 - 1 1 
0.33 add rsi, 4 - 1 1 0.33 cmp rdx, rsi - 1 1 1.00 jne .LBB5_4 - 2 6 0.50 * cmp word ptr [rax], -16192 - 1 1 1.00 jne .LBB5_4 + 1 1 1.00 jb .LBB5_1 + 1 1 0.33 test dil, 1 + 1 1 1.00 jne .LBB5_1 + 1 1 0.33 mov rdx, rsi + 1 1 0.50 lea rcx, [rsi - 4] + 1 1 0.33 mov rax, rcx + 1 1 0.33 and rax, -2 + 1 1 0.33 add rax, 4 + 1 1 0.33 cmp rsi, rax + 1 1 1.00 jne .LBB5_1 1 1 0.50 shr rcx - 1 1 0.33 mov rdx, rcx + 1 5 0.50 * movzx esi, word ptr [rdi] + 1 0 0.25 xor eax, eax + 1 1 0.33 cmp esi, 49344 + 2 2 0.67 cmove rdx, rcx + 2 2 0.67 cmove rax, rdi 1 1 1.00 U ret 1 0 0.25 xor eax, eax 1 1 1.00 U ret @@ -53,27 +53,27 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.32 6.32 - 6.36 0.50 0.50 + - - 6.31 6.32 - 6.37 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.33 0.66 - 0.01 - - mov rdx, rsi - - - 0.66 0.34 - - - - mov rax, rdi - - - 0.34 0.66 - - - - cmp rsi, 4 - - - 0.99 - - 0.01 - - setb cl - - - 0.01 0.99 - - - - or cl, al - - - - 1.00 - - - - test cl, 1 - - - - - - 1.00 - - jne .LBB5_4 - - - 0.66 0.34 - - - - lea rcx, [rdx - 4] - - - 0.33 0.66 - 0.01 - - mov rsi, rcx - - - 1.00 - - - - - and rsi, -2 - - - 0.66 0.34 - - - - add rsi, 4 - - - - 1.00 - - - - cmp rdx, rsi - - - - - - 1.00 - - jne .LBB5_4 - - - - - - 1.00 0.50 0.50 cmp word ptr [rax], -16192 - - - - - - 1.00 - - jne .LBB5_4 - - - 0.67 - - 0.33 - - shr rcx - - - 0.67 0.33 - - - - mov rdx, rcx + - - 0.95 0.03 - 0.02 - - cmp rsi, 4 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.37 0.63 - - - - test dil, 1 + - - - - - 1.00 - - jne .LBB5_1 + - - 0.66 0.33 - 0.01 - - mov rdx, rsi + - - 0.33 0.67 - - - - lea rcx, [rsi - 4] + - - 0.01 0.99 - - - - mov rax, rcx + - - - 1.00 - - - - and rax, -2 + - - - 0.99 - 0.01 - - add rax, 4 + - - 0.01 0.99 - - - - cmp rsi, rax + - - - - - 1.00 - - jne .LBB5_1 + - - 1.00 - - - - - shr rcx + - - - - - - 0.50 0.50 movzx esi, word ptr [rdi] + - - - - - - - - xor eax, eax + - - 0.99 0.01 - - - - cmp 
esi, 49344 + - - 0.99 0.02 - 0.99 - - cmove rdx, rcx + - - 1.00 0.66 - 0.34 - - cmove rax, rdi - - - - - 1.00 - - ret - - - - - - - - xor eax, eax - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 index 3ef8d1448a..2ea5118fa3 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 +++ b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64 @@ -1,21 +1,19 @@ bench_try_ref_from_bytes_with_elems_dynamic_padding: - movabs rax, 3074457345618258598 - cmp rdx, rax - seta cl + movabs rcx, 3074457345618258598 + cmp rdx, rcx + ja .LBB5_4 mov rax, rdi test al, 3 - setne dil - or dil, cl - jne .LBB5_3 + jne .LBB5_4 lea rcx, [rdx + 2*rdx] or rcx, 3 add rcx, 9 cmp rsi, rcx - jne .LBB5_3 + jne .LBB5_4 cmp word ptr [rax], -16192 - je .LBB5_4 -.LBB5_3: + je .LBB5_5 +.LBB5_4: xor eax, eax mov rdx, rsi -.LBB5_4: +.LBB5_5: ret diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca index 8131f3bd54..c5d4a2b0d5 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_bytes_with_elems_dynamic_padding.x86-64.mca @@ -1,11 +1,11 @@ Iterations: 100 -Instructions: 1800 -Total Cycles: 607 -Total uOps: 2000 +Instructions: 1600 +Total Cycles: 507 +Total uOps: 1700 Dispatch Width: 4 -uOps Per Cycle: 3.29 -IPC: 2.97 +uOps Per Cycle: 3.35 +IPC: 3.16 Block RThroughput: 5.0 @@ -18,21 +18,19 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 3074457345618258598 - 1 1 0.33 cmp rdx, rax - 2 2 1.00 seta cl + 1 1 0.33 movabs rcx, 3074457345618258598 + 1 1 0.33 cmp rdx, rcx + 1 1 1.00 ja .LBB5_4 1 1 0.33 mov rax, rdi 1 1 0.33 test al, 3 - 1 1 0.50 setne dil - 1 1 0.33 or dil, cl - 1 1 1.00 jne .LBB5_3 + 1 1 1.00 jne .LBB5_4 1 1 0.50 lea rcx, [rdx + 2*rdx] 1 1 0.33 or rcx, 3 1 1 0.33 
add rcx, 9 1 1 0.33 cmp rsi, rcx - 1 1 1.00 jne .LBB5_3 + 1 1 1.00 jne .LBB5_4 2 6 0.50 * cmp word ptr [rax], -16192 - 1 1 1.00 je .LBB5_4 + 1 1 1.00 je .LBB5_5 1 0 0.25 xor eax, eax 1 1 0.33 mov rdx, rsi 1 1 1.00 U ret @@ -51,25 +49,23 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 5.99 5.99 - 6.02 0.50 0.50 + - - 4.98 4.99 - 5.03 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 3074457345618258598 - - - - 1.00 - - - - cmp rdx, rax - - - - - - 2.00 - - seta cl - - - 1.00 - - - - - mov rax, rdi - - - 0.99 0.01 - - - - test al, 3 - - - 1.00 - - - - - setne dil - - - - 0.99 - 0.01 - - or dil, cl - - - - - - 1.00 - - jne .LBB5_3 - - - 0.01 0.99 - - - - lea rcx, [rdx + 2*rdx] - - - - 1.00 - - - - or rcx, 3 - - - 0.99 0.01 - - - - add rcx, 9 - - - - 1.00 - - - - cmp rsi, rcx - - - - - - 1.00 - - jne .LBB5_3 + - - 0.98 0.01 - 0.01 - - movabs rcx, 3074457345618258598 + - - 0.01 0.99 - - - - cmp rdx, rcx + - - - - - 1.00 - - ja .LBB5_4 + - - 0.99 0.01 - - - - mov rax, rdi + - - 0.01 0.98 - 0.01 - - test al, 3 + - - - - - 1.00 - - jne .LBB5_4 + - - 0.98 0.02 - - - - lea rcx, [rdx + 2*rdx] + - - 0.01 0.99 - - - - or rcx, 3 + - - - 1.00 - - - - add rcx, 9 + - - - 0.99 - 0.01 - - cmp rsi, rcx + - - - - - 1.00 - - jne .LBB5_4 - - 1.00 - - - 0.50 0.50 cmp word ptr [rax], -16192 - - - - - - 1.00 - - je .LBB5_4 + - - - - - 1.00 - - je .LBB5_5 - - - - - - - - xor eax, eax - - 1.00 - - - - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 index ba34b1855b..9054d9c7a1 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 +++ b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64 @@ -1,13 +1,13 @@ bench_try_ref_from_bytes_with_elems_dynamic_size: - movabs rax, 4611686018427387901 - cmp rdx, rax - seta cl + movabs rcx, 
4611686018427387901 + cmp rdx, rcx + ja .LBB5_3 mov rax, rdi - or dil, cl - test dil, 1 - jne .LBB5_3 lea rcx, [2*rdx + 4] cmp rsi, rcx + setne cl + or cl, al + test cl, 1 jne .LBB5_3 cmp word ptr [rax], -16192 je .LBB5_4 diff --git a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca index ae049c03df..66d1b87267 100644 --- a/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_bytes_with_elems_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 Instructions: 1500 -Total Cycles: 507 -Total uOps: 1700 +Total Cycles: 474 +Total uOps: 1600 Dispatch Width: 4 -uOps Per Cycle: 3.35 -IPC: 2.96 -Block RThroughput: 4.3 +uOps Per Cycle: 3.38 +IPC: 3.16 +Block RThroughput: 4.0 Instruction Info: @@ -18,15 +18,15 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.33 movabs rax, 4611686018427387901 - 1 1 0.33 cmp rdx, rax - 2 2 1.00 seta cl + 1 1 0.33 movabs rcx, 4611686018427387901 + 1 1 0.33 cmp rdx, rcx + 1 1 1.00 ja .LBB5_3 1 1 0.33 mov rax, rdi - 1 1 0.33 or dil, cl - 1 1 0.33 test dil, 1 - 1 1 1.00 jne .LBB5_3 1 1 0.50 lea rcx, [2*rdx + 4] 1 1 0.33 cmp rsi, rcx + 1 1 0.50 setne cl + 1 1 0.33 or cl, al + 1 1 0.33 test cl, 1 1 1 1.00 jne .LBB5_3 2 6 0.50 * cmp word ptr [rax], -16192 1 1 1.00 je .LBB5_4 @@ -48,22 +48,22 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 4.98 4.99 - 5.03 0.50 0.50 + - - 4.66 4.66 - 4.68 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - 0.99 - 0.01 - - movabs rax, 4611686018427387901 - - - 0.50 0.50 - - - - cmp rdx, rax - - - 1.96 - - 0.04 - - seta cl - - - 0.01 0.99 - - - - mov rax, rdi - - - 1.00 - - - - - or dil, cl - - - 0.99 0.01 - - - - test dil, 1 - - - - - - 1.00 - - jne .LBB5_3 - - - 0.01 0.99 - - - - lea rcx, [2*rdx + 4] - - - 0.02 0.49 - 0.49 - - cmp rsi, rcx + - - 0.33 0.66 - 0.01 - - 
movabs rcx, 4611686018427387901 + - - 1.00 - - - - - cmp rdx, rcx + - - - - - 1.00 - - ja .LBB5_3 + - - 0.66 0.01 - 0.33 - - mov rax, rdi + - - 0.33 0.67 - - - - lea rcx, [2*rdx + 4] + - - 0.01 0.99 - - - - cmp rsi, rcx + - - 0.66 - - 0.34 - - setne cl + - - - 1.00 - - - - or cl, al + - - 0.01 0.99 - - - - test cl, 1 - - - - - 1.00 - - jne .LBB5_3 - - - - 0.51 - 0.49 0.50 0.50 cmp word ptr [rax], -16192 + - - 0.99 0.01 - - 0.50 0.50 cmp word ptr [rax], -16192 - - - - - 1.00 - - je .LBB5_4 - - - - - - - - xor eax, eax - - - 0.49 0.51 - - - - mov rdx, rsi + - - 0.67 0.33 - - - - mov rdx, rsi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_prefix_dynamic_padding.x86-64 b/benches/try_ref_from_prefix_dynamic_padding.x86-64 index d832cb7ecf..3cb4e6b574 100644 --- a/benches/try_ref_from_prefix_dynamic_padding.x86-64 +++ b/benches/try_ref_from_prefix_dynamic_padding.x86-64 @@ -1,29 +1,31 @@ bench_try_ref_from_prefix_dynamic_padding: - xor edx, edx - mov eax, 0 - test dil, 3 - je .LBB5_1 - ret -.LBB5_1: movabs rax, 9223372036854775804 - and rsi, rax - cmp rsi, 9 - jae .LBB5_3 + and rax, rsi + cmp rax, 9 + jae .LBB5_2 mov edx, 1 - xor eax, eax + xor ecx, ecx + mov rax, rcx + ret +.LBB5_2: + xor edx, edx + mov ecx, 0 + test dil, 3 + je .LBB5_3 + mov rax, rcx ret .LBB5_3: - add rsi, -9 + add rax, -9 movabs rcx, -6148914691236517205 - mov rax, rsi mul rcx mov rax, rdx shr rax - movzx ecx, word ptr [rdi] - cmp cx, -16192 + movzx esi, word ptr [rdi] + cmp si, -16192 mov edx, 2 cmove rdx, rax - xor eax, eax - cmp ecx, 49344 - cmove rax, rdi + xor ecx, ecx + cmp esi, 49344 + cmove rcx, rdi + mov rax, rcx ret diff --git a/benches/try_ref_from_prefix_dynamic_padding.x86-64.mca b/benches/try_ref_from_prefix_dynamic_padding.x86-64.mca index 482112a39b..ef17cbfa30 100644 --- a/benches/try_ref_from_prefix_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_prefix_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2600 -Total Cycles: 843 -Total uOps: 
2900 +Instructions: 2800 +Total Cycles: 910 +Total uOps: 3100 Dispatch Width: 4 -uOps Per Cycle: 3.44 +uOps Per Cycle: 3.41 IPC: 3.08 -Block RThroughput: 7.3 +Block RThroughput: 7.8 Instruction Info: @@ -18,31 +18,33 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 0 0.25 xor edx, edx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 3 - 1 1 1.00 je .LBB5_1 - 1 1 1.00 U ret 1 1 0.33 movabs rax, 9223372036854775804 - 1 1 0.33 and rsi, rax - 1 1 0.33 cmp rsi, 9 - 1 1 1.00 jae .LBB5_3 + 1 1 0.33 and rax, rsi + 1 1 0.33 cmp rax, 9 + 1 1 1.00 jae .LBB5_2 1 1 0.33 mov edx, 1 - 1 0 0.25 xor eax, eax + 1 0 0.25 xor ecx, ecx + 1 1 0.33 mov rax, rcx + 1 1 1.00 U ret + 1 0 0.25 xor edx, edx + 1 1 0.33 mov ecx, 0 + 1 1 0.33 test dil, 3 + 1 1 1.00 je .LBB5_3 + 1 1 0.33 mov rax, rcx 1 1 1.00 U ret - 1 1 0.33 add rsi, -9 + 1 1 0.33 add rax, -9 1 1 0.33 movabs rcx, -6148914691236517205 - 1 1 0.33 mov rax, rsi 2 4 1.00 mul rcx 1 1 0.33 mov rax, rdx 1 1 0.50 shr rax - 1 5 0.50 * movzx ecx, word ptr [rdi] - 1 1 0.33 cmp cx, -16192 + 1 5 0.50 * movzx esi, word ptr [rdi] + 1 1 0.33 cmp si, -16192 1 1 0.33 mov edx, 2 2 2 0.67 cmove rdx, rax - 1 0 0.25 xor eax, eax - 1 1 0.33 cmp ecx, 49344 - 2 2 0.67 cmove rax, rdi + 1 0 0.25 xor ecx, ecx + 1 1 0.33 cmp esi, 49344 + 2 2 0.67 cmove rcx, rdi + 1 1 0.33 mov rax, rcx 1 1 1.00 U ret @@ -59,33 +61,35 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 8.33 8.33 - 8.34 0.50 0.50 + - - 9.00 9.00 - 9.00 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - - - - - xor edx, edx - - - 0.32 0.34 - 0.34 - - mov eax, 0 - - - 0.34 0.33 - 0.33 - - test dil, 3 - - - - - - 1.00 - - je .LBB5_1 + - - 0.03 0.96 - 0.01 - - movabs rax, 9223372036854775804 + - - 0.96 0.01 - 0.03 - - and rax, rsi + - - 0.96 0.02 - 0.02 - - cmp rax, 9 + - - - - - 1.00 - - jae .LBB5_2 + - - 0.99 - - 0.01 - - mov edx, 1 + - - - - - - - - xor ecx, ecx + - - 0.01 - - 
0.99 - - mov rax, rcx - - - - - 1.00 - - ret - - - 0.35 0.65 - - - - movabs rax, 9223372036854775804 - - - 0.96 0.03 - 0.01 - - and rsi, rax - - - 0.01 0.97 - 0.02 - - cmp rsi, 9 - - - - - - 1.00 - - jae .LBB5_3 - - - 0.67 0.01 - 0.32 - - mov edx, 1 - - - - - - - - - xor eax, eax + - - - - - - - - xor edx, edx + - - 0.01 0.03 - 0.96 - - mov ecx, 0 + - - 0.02 0.97 - 0.01 - - test dil, 3 + - - - - - 1.00 - - je .LBB5_3 + - - 0.02 0.98 - - - - mov rax, rcx - - - - - 1.00 - - ret - - - 0.02 0.34 - 0.64 - - add rsi, -9 - - - 0.33 0.66 - 0.01 - - movabs rcx, -6148914691236517205 - - - 0.66 0.34 - - - - mov rax, rsi + - - - 1.00 - - - - add rax, -9 + - - 1.00 - - - - - movabs rcx, -6148914691236517205 - - 1.00 1.00 - - - - mul rcx - - 0.01 0.99 - - - - mov rax, rdx - - 0.99 - - 0.01 - - shr rax - - - - - - - 0.50 0.50 movzx ecx, word ptr [rdi] - - - 0.33 0.03 - 0.64 - - cmp cx, -16192 - - - 0.01 0.31 - 0.68 - - mov edx, 2 + - - - - - - 0.50 0.50 movzx esi, word ptr [rdi] + - - 0.97 0.02 - 0.01 - - cmp si, -16192 + - - 0.01 0.02 - 0.97 - - mov edx, 2 - - 1.00 1.00 - - - - cmove rdx, rax - - - - - - - - - xor eax, eax - - - 0.33 0.33 - 0.34 - - cmp ecx, 49344 - - - 1.00 1.00 - - - - cmove rax, rdi + - - - - - - - - xor ecx, ecx + - - 0.01 0.01 - 0.98 - - cmp esi, 49344 + - - 1.00 1.00 - - - - cmove rcx, rdi + - - 0.01 0.99 - - - - mov rax, rcx - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_prefix_dynamic_size.x86-64 b/benches/try_ref_from_prefix_dynamic_size.x86-64 index be7f34b9f8..bca29f5523 100644 --- a/benches/try_ref_from_prefix_dynamic_size.x86-64 +++ b/benches/try_ref_from_prefix_dynamic_size.x86-64 @@ -1,14 +1,14 @@ bench_try_ref_from_prefix_dynamic_size: - xor edx, edx - mov eax, 0 - test dil, 1 - jne .LBB5_4 cmp rsi, 4 - jae .LBB5_3 + jae .LBB5_2 mov edx, 1 xor eax, eax ret -.LBB5_3: +.LBB5_2: + xor edx, edx + mov eax, 0 + test dil, 1 + jne .LBB5_4 add rsi, -4 shr rsi movzx ecx, word ptr [rdi] diff --git 
a/benches/try_ref_from_prefix_dynamic_size.x86-64.mca b/benches/try_ref_from_prefix_dynamic_size.x86-64.mca index 11706defe1..bdc62c5367 100644 --- a/benches/try_ref_from_prefix_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_prefix_dynamic_size.x86-64.mca @@ -18,15 +18,15 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 0 0.25 xor edx, edx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 1 - 1 1 1.00 jne .LBB5_4 1 1 0.33 cmp rsi, 4 - 1 1 1.00 jae .LBB5_3 + 1 1 1.00 jae .LBB5_2 1 1 0.33 mov edx, 1 1 0 0.25 xor eax, eax 1 1 1.00 U ret + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test dil, 1 + 1 1 1.00 jne .LBB5_4 1 1 0.33 add rsi, -4 1 1 0.50 shr rsi 1 5 0.50 * movzx ecx, word ptr [rdi] @@ -56,22 +56,22 @@ Resource pressure per iteration: Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - - - - - - - xor edx, edx - - - 0.30 0.37 - 0.33 - - mov eax, 0 - - - 0.35 0.32 - 0.33 - - test dil, 1 - - - - - - 1.00 - - jne .LBB5_4 - - - 0.32 0.33 - 0.35 - - cmp rsi, 4 - - - - - - 1.00 - - jae .LBB5_3 - - - 0.33 0.35 - 0.32 - - mov edx, 1 + - - - 0.35 - 0.65 - - cmp rsi, 4 + - - - - - 1.00 - - jae .LBB5_2 + - - 0.34 0.66 - - - - mov edx, 1 - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - 0.34 0.64 - 0.02 - - add rsi, -4 + - - - - - - - - xor edx, edx + - - 0.64 0.34 - 0.02 - - mov eax, 0 + - - 0.33 0.64 - 0.03 - - test dil, 1 + - - - - - 1.00 - - jne .LBB5_4 + - - 0.64 0.34 - 0.02 - - add rsi, -4 - - 1.00 - - - - - shr rsi - - - - - - 0.50 0.50 movzx ecx, word ptr [rdi] - - - 0.60 0.40 - - - - cmp ecx, 49344 - - - 0.05 0.95 - - - - mov edx, 2 + - - 0.32 0.38 - 0.30 - - cmp ecx, 49344 + - - 0.03 0.95 - 0.02 - - mov edx, 2 - - 1.00 1.00 - - - - cmove rdx, rsi - - - - - - - - xor eax, eax - - - 0.37 0.31 - 0.32 - - cmp cx, -16192 + - - 0.36 0.01 - 0.63 - - cmp cx, -16192 - - 1.00 1.00 - - - - cmove rax, rdi - - - - - 1.00 - - ret diff --git 
a/benches/try_ref_from_prefix_static_size.x86-64 b/benches/try_ref_from_prefix_static_size.x86-64 index 83212f776e..804d65c8d5 100644 --- a/benches/try_ref_from_prefix_static_size.x86-64 +++ b/benches/try_ref_from_prefix_static_size.x86-64 @@ -1,8 +1,9 @@ bench_try_ref_from_prefix_static_size: cmp rsi, 6 setb al - or al, dil - test al, 1 + mov ecx, edi + or cl, al + test cl, 1 jne .LBB5_2 movzx eax, word ptr [rdi] cmp eax, 49344 diff --git a/benches/try_ref_from_prefix_static_size.x86-64.mca b/benches/try_ref_from_prefix_static_size.x86-64.mca index 5d02b863a7..27fa1930fe 100644 --- a/benches/try_ref_from_prefix_static_size.x86-64.mca +++ b/benches/try_ref_from_prefix_static_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1200 -Total Cycles: 374 -Total uOps: 1300 +Instructions: 1300 +Total Cycles: 407 +Total uOps: 1400 Dispatch Width: 4 -uOps Per Cycle: 3.48 -IPC: 3.21 -Block RThroughput: 3.3 +uOps Per Cycle: 3.44 +IPC: 3.19 +Block RThroughput: 3.5 Instruction Info: @@ -20,8 +20,9 @@ Instruction Info: [1] [2] [3] [4] [5] [6] Instructions: 1 1 0.33 cmp rsi, 6 1 1 0.50 setb al - 1 1 0.33 or al, dil - 1 1 0.33 test al, 1 + 1 1 0.33 mov ecx, edi + 1 1 0.33 or cl, al + 1 1 0.33 test cl, 1 1 1 1.00 jne .LBB5_2 1 5 0.50 * movzx eax, word ptr [rdi] 1 1 0.33 cmp eax, 49344 @@ -45,18 +46,19 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 3.66 3.65 - 3.69 0.50 0.50 + - - 3.99 3.99 - 4.02 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.35 0.64 - 0.01 - - cmp rsi, 6 + - - 0.05 0.94 - 0.01 - - cmp rsi, 6 - - 1.00 - - - - - setb al - - - 0.02 0.66 - 0.32 - - or al, dil - - - 0.03 0.65 - 0.32 - - test al, 1 + - - 0.93 0.07 - - - - mov ecx, edi + - - 0.03 0.96 - 0.01 - - or cl, al + - - 0.03 0.02 - 0.95 - - test cl, 1 - - - - - 1.00 - - jne .LBB5_2 - - - - - - 0.50 0.50 movzx eax, word ptr [rdi] - - - 0.92 0.07 - 0.01 - - cmp eax, 49344 - - - 0.37 0.63 - - - - mov eax, 2 
+ - - 0.02 0.97 - 0.01 - - cmp eax, 49344 + - - 0.96 0.03 - 0.01 - - mov eax, 2 - - 0.97 1.00 - 0.03 - - cmove rax, rdi - - - - - 1.00 - - je .LBB5_3 - - - - - - - - xor eax, eax diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 index 80e66ba160..15273eeb08 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 +++ b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64 @@ -1,30 +1,28 @@ bench_try_ref_from_prefix_with_elems_dynamic_padding: + mov rcx, rdx + mov edx, 1 movabs rax, 3074457345618258598 - cmp rdx, rax - ja .LBB5_1 - xor ecx, ecx - mov eax, 0 - test dil, 3 - je .LBB5_3 - mov rdx, rcx - ret -.LBB5_3: - lea rax, [rdx + 2*rdx] + cmp rcx, rax + ja .LBB5_4 + lea rax, [rcx + 2*rcx] or rax, 3 add rax, 9 cmp rax, rsi - jbe .LBB5_4 -.LBB5_1: + jbe .LBB5_2 +.LBB5_4: xor eax, eax - mov edx, 1 +.LBB5_5: ret -.LBB5_4: +.LBB5_2: + xor edx, edx + mov eax, 0 + test dil, 3 + jne .LBB5_5 movzx esi, word ptr [rdi] cmp si, -16192 - mov ecx, 2 - cmove rcx, rdx + mov edx, 2 + cmove rdx, rcx xor eax, eax cmp esi, 49344 cmove rax, rdi - mov rdx, rcx ret diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca index 512e8ce643..4fc4306581 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_prefix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2600 -Total Cycles: 806 -Total uOps: 2800 +Instructions: 2400 +Total Cycles: 741 +Total uOps: 2600 Dispatch Width: 4 -uOps Per Cycle: 3.47 -IPC: 3.23 -Block RThroughput: 7.0 +uOps Per Cycle: 3.51 +IPC: 3.24 +Block RThroughput: 6.5 Instruction Info: @@ -18,31 +18,29 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.33 mov rcx, rdx + 1 1 0.33 mov edx, 1 1 1 0.33 movabs rax, 3074457345618258598 - 1 1 0.33 
cmp rdx, rax - 1 1 1.00 ja .LBB5_1 - 1 0 0.25 xor ecx, ecx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 3 - 1 1 1.00 je .LBB5_3 - 1 1 0.33 mov rdx, rcx - 1 1 1.00 U ret - 1 1 0.50 lea rax, [rdx + 2*rdx] + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_4 + 1 1 0.50 lea rax, [rcx + 2*rcx] 1 1 0.33 or rax, 3 1 1 0.33 add rax, 9 1 1 0.33 cmp rax, rsi - 1 1 1.00 jbe .LBB5_4 + 1 1 1.00 jbe .LBB5_2 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 1 1 1.00 U ret + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test dil, 3 + 1 1 1.00 jne .LBB5_5 1 5 0.50 * movzx esi, word ptr [rdi] 1 1 0.33 cmp si, -16192 - 1 1 0.33 mov ecx, 2 - 2 2 0.67 cmove rcx, rdx + 1 1 0.33 mov edx, 2 + 2 2 0.67 cmove rdx, rcx 1 0 0.25 xor eax, eax 1 1 0.33 cmp esi, 49344 2 2 0.67 cmove rax, rdi - 1 1 0.33 mov rdx, rcx 1 1 1.00 U ret @@ -59,33 +57,31 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 7.98 7.99 - 8.03 0.50 0.50 + - - 7.32 7.33 - 7.35 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.98 - - 0.02 - - movabs rax, 3074457345618258598 - - - - 1.00 - - - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - - - - - - - xor ecx, ecx - - - 0.99 0.01 - - - - mov eax, 0 - - - 0.01 0.96 - 0.03 - - test dil, 3 - - - - - - 1.00 - - je .LBB5_3 - - - 0.97 0.01 - 0.02 - - mov rdx, rcx - - - - - - 1.00 - - ret - - - 0.03 0.97 - - - - lea rax, [rdx + 2*rdx] - - - 0.03 0.97 - - - - or rax, 3 - - - 0.01 0.99 - - - - add rax, 9 + - - - 0.99 - 0.01 - - mov rcx, rdx + - - 0.66 0.02 - 0.32 - - mov edx, 1 + - - 0.35 0.32 - 0.33 - - movabs rax, 3074457345618258598 + - - - 0.99 - 0.01 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_4 + - - 0.99 0.01 - - - - lea rax, [rcx + 2*rcx] + - - 0.33 0.67 - - - - or rax, 3 + - - - 1.00 - - - - add rax, 9 - - - 1.00 - - - - cmp rax, rsi - - - - - - 1.00 - - jbe .LBB5_4 + - - - - - 1.00 - - jbe .LBB5_2 - - - - - - - - xor eax, eax - - - 0.98 0.01 - 0.01 - - mov edx, 1 - - - - - 1.00 - - ret 
+ - - - - - - - - xor edx, edx + - - 0.34 - - 0.66 - - mov eax, 0 + - - 0.99 - - 0.01 - - test dil, 3 + - - - - - 1.00 - - jne .LBB5_5 - - - - - - 0.50 0.50 movzx esi, word ptr [rdi] - - - 0.97 0.03 - - - - cmp si, -16192 - - - 0.98 0.01 - 0.01 - - mov ecx, 2 - - - 1.00 0.03 - 0.97 - - cmove rcx, rdx + - - 0.66 0.01 - 0.33 - - cmp si, -16192 + - - 0.67 0.32 - 0.01 - - mov edx, 2 + - - 1.00 0.99 - 0.01 - - cmove rdx, rcx - - - - - - - - xor eax, eax - - - 0.03 0.97 - - - - cmp esi, 49344 - - - 1.00 1.00 - - - - cmove rax, rdi - - - - 0.03 - 0.97 - - mov rdx, rcx + - - 0.33 0.33 - 0.34 - - cmp esi, 49344 + - - 1.00 0.68 - 0.32 - - cmove rax, rdi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 index c12e87c137..c1b444fde9 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 +++ b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64 @@ -1,20 +1,20 @@ bench_try_ref_from_prefix_with_elems_dynamic_size: - movabs rax, 4611686018427387901 - cmp rdx, rax - ja .LBB5_1 mov rcx, rdx - xor edx, edx - mov eax, 0 - test dil, 1 - jne .LBB5_5 + mov edx, 1 + movabs rax, 4611686018427387901 + cmp rcx, rax + ja .LBB5_3 lea rax, [2*rcx + 4] cmp rax, rsi jbe .LBB5_4 -.LBB5_1: +.LBB5_3: xor eax, eax - mov edx, 1 ret .LBB5_4: + xor edx, edx + mov eax, 0 + test dil, 1 + jne .LBB5_6 movzx esi, word ptr [rdi] cmp si, -16192 mov edx, 2 @@ -22,5 +22,5 @@ bench_try_ref_from_prefix_with_elems_dynamic_size: xor eax, eax cmp esi, 49344 cmove rax, rdi -.LBB5_5: +.LBB5_6: ret diff --git a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca index 6c3f1a1ec9..c7bcc8ae1d 100644 --- a/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_prefix_with_elems_dynamic_size.x86-64.mca @@ -18,20 +18,20 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] 
Instructions: - 1 1 0.33 movabs rax, 4611686018427387901 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 1 1 0.33 mov rcx, rdx - 1 0 0.25 xor edx, edx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test dil, 1 - 1 1 1.00 jne .LBB5_5 + 1 1 0.33 mov edx, 1 + 1 1 0.33 movabs rax, 4611686018427387901 + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_3 1 1 0.50 lea rax, [2*rcx + 4] 1 1 0.33 cmp rax, rsi 1 1 1.00 jbe .LBB5_4 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 1 1 1.00 U ret + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test dil, 1 + 1 1 1.00 jne .LBB5_6 1 5 0.50 * movzx esi, word ptr [rdi] 1 1 0.33 cmp si, -16192 1 1 0.33 mov edx, 2 @@ -55,29 +55,29 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.65 6.66 - 6.69 0.50 0.50 + - - 6.66 6.66 - 6.68 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.66 0.33 - 0.01 - - movabs rax, 4611686018427387901 - - - 0.02 0.66 - 0.32 - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - 0.66 0.33 - 0.01 - - mov rcx, rdx - - - - - - - - - xor edx, edx - - - 0.33 0.01 - 0.66 - - mov eax, 0 - - - 0.34 0.65 - 0.01 - - test dil, 1 - - - - - - 1.00 - - jne .LBB5_5 - - - 0.65 0.35 - - - - lea rax, [2*rcx + 4] - - - - 1.00 - - - - cmp rax, rsi + - - 0.01 0.98 - 0.01 - - mov rcx, rdx + - - 0.67 0.01 - 0.32 - - mov edx, 1 + - - 0.33 0.33 - 0.34 - - movabs rax, 4611686018427387901 + - - - 0.99 - 0.01 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_3 + - - 0.99 0.01 - - - - lea rax, [2*rcx + 4] + - - 0.33 0.67 - - - - cmp rax, rsi - - - - - 1.00 - - jbe .LBB5_4 - - - - - - - - xor eax, eax - - - 0.34 0.01 - 0.65 - - mov edx, 1 - - - - - 1.00 - - ret + - - - - - - - - xor edx, edx + - - 0.34 0.02 - 0.64 - - mov eax, 0 + - - 0.33 0.66 - 0.01 - - test dil, 1 + - - - - - 1.00 - - jne .LBB5_6 - - - - - - 0.50 0.50 movzx esi, word ptr [rdi] - - - 0.65 0.34 - 0.01 - - cmp si, -16192 - - - 0.66 0.34 - - - - mov edx, 2 + - - 0.66 0.34 - - - - cmp si, -16192 + - - 0.33 
0.67 - - - - mov edx, 2 - - 1.00 0.99 - 0.01 - - cmove rdx, rcx - - - - - - - - xor eax, eax - - - 0.34 0.66 - - - - cmp esi, 49344 - - - 1.00 0.99 - 0.01 - - cmove rax, rdi + - - 0.67 0.32 - 0.01 - - cmp esi, 49344 + - - 1.00 0.67 - 0.33 - - cmove rax, rdi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_suffix_dynamic_padding.x86-64 b/benches/try_ref_from_suffix_dynamic_padding.x86-64 index b3e9244428..b265188697 100644 --- a/benches/try_ref_from_suffix_dynamic_padding.x86-64 +++ b/benches/try_ref_from_suffix_dynamic_padding.x86-64 @@ -1,11 +1,11 @@ bench_try_ref_from_suffix_dynamic_padding: - lea eax, [rsi + rdi] - test al, 3 - jne .LBB5_1 movabs rax, 9223372036854775804 and rax, rsi cmp rax, 9 - jae .LBB5_3 + jb .LBB5_1 + lea ecx, [rsi + rdi] + test cl, 3 + je .LBB5_3 .LBB5_1: xor eax, eax ret diff --git a/benches/try_ref_from_suffix_dynamic_padding.x86-64.mca b/benches/try_ref_from_suffix_dynamic_padding.x86-64.mca index d56ae56d85..ad9399513b 100644 --- a/benches/try_ref_from_suffix_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_suffix_dynamic_padding.x86-64.mca @@ -1,11 +1,11 @@ Iterations: 100 Instructions: 2300 -Total Cycles: 791 +Total Cycles: 797 Total uOps: 2600 Dispatch Width: 4 -uOps Per Cycle: 3.29 -IPC: 2.91 +uOps Per Cycle: 3.26 +IPC: 2.89 Block RThroughput: 6.5 @@ -18,13 +18,13 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.50 lea eax, [rsi + rdi] - 1 1 0.33 test al, 3 - 1 1 1.00 jne .LBB5_1 1 1 0.33 movabs rax, 9223372036854775804 1 1 0.33 and rax, rsi 1 1 0.33 cmp rax, 9 - 1 1 1.00 jae .LBB5_3 + 1 1 1.00 jb .LBB5_1 + 1 1 0.50 lea ecx, [rsi + rdi] + 1 1 0.33 test cl, 3 + 1 1 1.00 je .LBB5_3 1 0 0.25 xor eax, eax 1 1 1.00 U ret 1 1 0.33 add rax, -9 @@ -56,30 +56,30 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 7.70 7.58 - 7.72 0.50 0.50 + - - 7.67 7.62 - 7.71 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] 
Instructions: - - - 0.26 0.74 - - - - lea eax, [rsi + rdi] - - - 0.19 0.28 - 0.53 - - test al, 3 - - - - - - 1.00 - - jne .LBB5_1 - - - 0.93 0.06 - 0.01 - - movabs rax, 9223372036854775804 - - - 0.81 0.14 - 0.05 - - and rax, rsi - - - 0.55 0.43 - 0.02 - - cmp rax, 9 - - - - - - 1.00 - - jae .LBB5_3 + - - 0.60 0.24 - 0.16 - - movabs rax, 9223372036854775804 + - - 0.58 0.17 - 0.25 - - and rax, rsi + - - 0.33 0.60 - 0.07 - - cmp rax, 9 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.30 0.70 - - - - lea ecx, [rsi + rdi] + - - 0.13 0.57 - 0.30 - - test cl, 3 + - - - - - 1.00 - - je .LBB5_3 - - - - - - - - xor eax, eax - - - - - 1.00 - - ret - - - 0.42 0.56 - 0.02 - - add rax, -9 - - - 0.67 0.30 - 0.03 - - movabs rcx, -6148914691236517205 + - - 0.72 0.21 - 0.07 - - add rax, -9 + - - 0.69 0.23 - 0.08 - - movabs rcx, -6148914691236517205 - - 1.00 1.00 - - - - mul rcx - - - 0.71 - - 0.29 - - shr rdx - - - 0.32 0.68 - - - - lea rcx, [rdx + 2*rdx] - - - 0.57 0.04 - 0.39 - - sub rsi, rcx - - - 0.28 0.67 - 0.05 - - or rcx, -4 - - - 0.29 0.29 - 0.42 - - add rsi, rdi - - - 0.02 0.98 - - - - lea rdi, [rcx + rsi] - - - 0.02 0.41 - 0.57 - - add rdi, -8 + - - 0.60 - - 0.40 - - shr rdx + - - 0.50 0.50 - - - - lea rcx, [rdx + 2*rdx] + - - 0.44 0.25 - 0.31 - - sub rsi, rcx + - - 0.52 0.34 - 0.14 - - or rcx, -4 + - - 0.28 0.46 - 0.26 - - add rsi, rdi + - - 0.06 0.94 - - - - lea rdi, [rcx + rsi] + - - - 0.37 - 0.63 - - add rdi, -8 - - - - - - - - xor eax, eax - - - 0.57 0.01 - 0.42 0.50 0.50 cmp word ptr [rcx + rsi - 8], -16192 - - - 0.09 0.99 - 0.92 - - cmove rax, rdi + - - 0.58 0.06 - 0.36 0.50 0.50 cmp word ptr [rcx + rsi - 8], -16192 + - - 0.34 0.98 - 0.68 - - cmove rax, rdi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_suffix_dynamic_size.x86-64 b/benches/try_ref_from_suffix_dynamic_size.x86-64 index d51f7817e5..f175802bae 100644 --- a/benches/try_ref_from_suffix_dynamic_size.x86-64 +++ b/benches/try_ref_from_suffix_dynamic_size.x86-64 @@ -1,13 +1,9 @@ 
bench_try_ref_from_suffix_dynamic_size: - lea eax, [rsi + rdi] cmp rsi, 4 - setb cl - or cl, al - test cl, 1 - je .LBB5_2 - xor eax, eax - ret -.LBB5_2: + jb .LBB5_1 + lea eax, [rsi + rdi] + test al, 1 + jne .LBB5_1 lea rdx, [rsi - 4] shr rdx and esi, 1 @@ -16,3 +12,6 @@ bench_try_ref_from_suffix_dynamic_size: cmp word ptr [rdi + rsi], -16192 cmove rax, rcx ret +.LBB5_1: + xor eax, eax + ret diff --git a/benches/try_ref_from_suffix_dynamic_size.x86-64.mca b/benches/try_ref_from_suffix_dynamic_size.x86-64.mca index 6cf7f8e493..37b19a1fab 100644 --- a/benches/try_ref_from_suffix_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_suffix_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 1600 -Total Cycles: 510 -Total uOps: 1800 +Instructions: 1500 +Total Cycles: 476 +Total uOps: 1700 Dispatch Width: 4 -uOps Per Cycle: 3.53 -IPC: 3.14 -Block RThroughput: 4.5 +uOps Per Cycle: 3.57 +IPC: 3.15 +Block RThroughput: 4.3 Instruction Info: @@ -18,14 +18,11 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.50 lea eax, [rsi + rdi] 1 1 0.33 cmp rsi, 4 - 1 1 0.50 setb cl - 1 1 0.33 or cl, al - 1 1 0.33 test cl, 1 - 1 1 1.00 je .LBB5_2 - 1 0 0.25 xor eax, eax - 1 1 1.00 U ret + 1 1 1.00 jb .LBB5_1 + 1 1 0.50 lea eax, [rsi + rdi] + 1 1 0.33 test al, 1 + 1 1 1.00 jne .LBB5_1 1 1 0.50 lea rdx, [rsi - 4] 1 1 0.50 shr rdx 1 1 0.33 and esi, 1 @@ -34,6 +31,8 @@ Instruction Info: 2 6 0.50 * cmp word ptr [rdi + rsi], -16192 2 2 0.67 cmove rax, rcx 1 1 1.00 U ret + 1 0 0.25 xor eax, eax + 1 1 1.00 U ret Resources: @@ -49,23 +48,22 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 4.99 5.00 - 5.01 0.50 0.50 + - - 4.66 4.66 - 4.68 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.98 0.02 - - - - lea eax, [rsi + rdi] - - - - 0.98 - 0.02 - - cmp rsi, 4 - - - 1.00 - - - - - setb cl - - - 0.01 0.99 - - - - or cl, al - - - 0.01 0.07 - 0.92 - - 
test cl, 1 - - - - - - 1.00 - - je .LBB5_2 + - - 0.58 0.41 - 0.01 - - cmp rsi, 4 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.41 0.59 - - - - lea eax, [rsi + rdi] + - - 0.28 0.72 - - - - test al, 1 + - - - - - 1.00 - - jne .LBB5_1 + - - 0.70 0.30 - - - - lea rdx, [rsi - 4] + - - 0.68 - - 0.32 - - shr rdx + - - 0.61 0.32 - 0.07 - - and esi, 1 + - - 0.28 0.72 - - - - lea rcx, [rdi + rsi] - - - - - - - - xor eax, eax + - - 0.12 0.60 - 0.28 0.50 0.50 cmp word ptr [rdi + rsi], -16192 + - - 1.00 1.00 - - - - cmove rax, rcx - - - - - 1.00 - - ret - - - 0.93 0.07 - - - - lea rdx, [rsi - 4] - - - 0.93 - - 0.07 - - shr rdx - - - 0.06 0.93 - 0.01 - - and esi, 1 - - - 0.07 0.93 - - - - lea rcx, [rdi + rsi] - - - - - - - - xor eax, eax - - - - 0.01 - 0.99 0.50 0.50 cmp word ptr [rdi + rsi], -16192 - - - 1.00 1.00 - - - - cmove rax, rcx - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_suffix_static_size.x86-64 b/benches/try_ref_from_suffix_static_size.x86-64 index cd39f70931..e917f89bbb 100644 --- a/benches/try_ref_from_suffix_static_size.x86-64 +++ b/benches/try_ref_from_suffix_static_size.x86-64 @@ -1,16 +1,15 @@ bench_try_ref_from_suffix_static_size: - lea eax, [rsi + rdi] cmp rsi, 6 - setb cl - or cl, al - test cl, 1 - je .LBB5_2 - xor eax, eax - ret -.LBB5_2: + jb .LBB5_1 + lea eax, [rsi + rdi] + test al, 1 + jne .LBB5_1 lea rcx, [rdi + rsi] add rcx, -6 xor eax, eax cmp word ptr [rdi + rsi - 6], -16192 cmove rax, rcx ret +.LBB5_1: + xor eax, eax + ret diff --git a/benches/try_ref_from_suffix_static_size.x86-64.mca b/benches/try_ref_from_suffix_static_size.x86-64.mca index 087d1e7ed9..1227e4103d 100644 --- a/benches/try_ref_from_suffix_static_size.x86-64.mca +++ b/benches/try_ref_from_suffix_static_size.x86-64.mca @@ -1,11 +1,11 @@ Iterations: 100 -Instructions: 1400 -Total Cycles: 443 -Total uOps: 1600 +Instructions: 1300 +Total Cycles: 410 +Total uOps: 1500 Dispatch Width: 4 -uOps Per Cycle: 3.61 -IPC: 3.16 +uOps Per Cycle: 3.66 +IPC: 3.17 Block RThroughput: 4.0 @@ 
-18,20 +18,19 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: - 1 1 0.50 lea eax, [rsi + rdi] 1 1 0.33 cmp rsi, 6 - 1 1 0.50 setb cl - 1 1 0.33 or cl, al - 1 1 0.33 test cl, 1 - 1 1 1.00 je .LBB5_2 - 1 0 0.25 xor eax, eax - 1 1 1.00 U ret + 1 1 1.00 jb .LBB5_1 + 1 1 0.50 lea eax, [rsi + rdi] + 1 1 0.33 test al, 1 + 1 1 1.00 jne .LBB5_1 1 1 0.50 lea rcx, [rdi + rsi] 1 1 0.33 add rcx, -6 1 0 0.25 xor eax, eax 2 6 0.50 * cmp word ptr [rdi + rsi - 6], -16192 2 2 0.67 cmove rax, rcx 1 1 1.00 U ret + 1 0 0.25 xor eax, eax + 1 1 1.00 U ret Resources: @@ -47,21 +46,20 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 4.33 4.33 - 4.34 0.50 0.50 + - - 3.98 3.98 - 4.04 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.32 0.68 - - - - lea eax, [rsi + rdi] - - - 0.05 0.94 - 0.01 - - cmp rsi, 6 - - - 1.00 - - - - - setb cl - - - 0.95 0.05 - - - - or cl, al - - - 0.95 0.02 - 0.03 - - test cl, 1 - - - - - - 1.00 - - je .LBB5_2 + - - 0.03 0.96 - 0.01 - - cmp rsi, 6 + - - - - - 1.00 - - jb .LBB5_1 + - - 0.95 0.05 - - - - lea eax, [rsi + rdi] + - - 0.06 0.94 - - - - test al, 1 + - - - - - 1.00 - - jne .LBB5_1 + - - 0.94 0.06 - - - - lea rcx, [rdi + rsi] + - - 0.05 0.95 - - - - add rcx, -6 - - - - - - - - xor eax, eax + - - 0.95 0.04 - 0.01 0.50 0.50 cmp word ptr [rdi + rsi - 6], -16192 + - - 1.00 0.98 - 0.02 - - cmove rax, rcx - - - - - 1.00 - - ret - - - 0.04 0.96 - - - - lea rcx, [rdi + rsi] - - - 0.02 0.97 - 0.01 - - add rcx, -6 - - - - - - - - xor eax, eax - - - 0.03 0.66 - 0.31 0.50 0.50 cmp word ptr [rdi + rsi - 6], -16192 - - - 0.97 0.05 - 0.98 - - cmove rax, rcx - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 index c7530d8b68..91dc7251d3 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 +++ 
b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64 @@ -1,32 +1,31 @@ bench_try_ref_from_suffix_with_elems_dynamic_padding: + mov rcx, rdx + mov edx, 1 movabs rax, 3074457345618258598 - cmp rdx, rax - ja .LBB5_1 - lea r8d, [rsi + rdi] - xor ecx, ecx - mov eax, 0 - test r8b, 3 - je .LBB5_3 - mov rdx, rcx - ret -.LBB5_3: - lea rax, [rdx + 2*rdx] + cmp rcx, rax + ja .LBB5_4 + lea rax, [rcx + 2*rcx] or rax, 3 add rax, 9 - sub rsi, rax - jae .LBB5_4 -.LBB5_1: + mov r8, rsi + sub r8, rax + jae .LBB5_2 +.LBB5_4: xor eax, eax - mov edx, 1 +.LBB5_5: ret -.LBB5_4: - lea r8, [rdi + rsi] - movzx esi, word ptr [rdi + rsi] - cmp si, -16192 - mov ecx, 2 - cmove rcx, rdx +.LBB5_2: + add esi, edi + xor edx, edx + mov eax, 0 + test sil, 3 + jne .LBB5_5 + lea rsi, [rdi + r8] + movzx edi, word ptr [rdi + r8] + cmp di, -16192 + mov edx, 2 + cmove rdx, rcx xor eax, eax - cmp esi, 49344 - cmove rax, r8 - mov rdx, rcx + cmp edi, 49344 + cmove rax, rsi ret diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca index be736c00c2..198346b5fb 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca +++ b/benches/try_ref_from_suffix_with_elems_dynamic_padding.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2800 -Total Cycles: 878 -Total uOps: 3000 +Instructions: 2700 +Total Cycles: 1304 +Total uOps: 2900 Dispatch Width: 4 -uOps Per Cycle: 3.42 -IPC: 3.19 -Block RThroughput: 7.5 +uOps Per Cycle: 2.22 +IPC: 2.07 +Block RThroughput: 7.3 Instruction Info: @@ -18,33 +18,32 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.33 mov rcx, rdx + 1 1 0.33 mov edx, 1 1 1 0.33 movabs rax, 3074457345618258598 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea r8d, [rsi + rdi] - 1 0 0.25 xor ecx, ecx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test r8b, 3 - 1 1 1.00 je .LBB5_3 - 1 1 0.33 mov rdx, rcx - 1 1 1.00 U ret - 1 1 0.50 lea 
rax, [rdx + 2*rdx] + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_4 + 1 1 0.50 lea rax, [rcx + 2*rcx] 1 1 0.33 or rax, 3 1 1 0.33 add rax, 9 - 1 1 0.33 sub rsi, rax - 1 1 1.00 jae .LBB5_4 + 1 1 0.33 mov r8, rsi + 1 1 0.33 sub r8, rax + 1 1 1.00 jae .LBB5_2 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 1 1 1.00 U ret - 1 1 0.50 lea r8, [rdi + rsi] - 1 5 0.50 * movzx esi, word ptr [rdi + rsi] - 1 1 0.33 cmp si, -16192 - 1 1 0.33 mov ecx, 2 - 2 2 0.67 cmove rcx, rdx + 1 1 0.33 add esi, edi + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test sil, 3 + 1 1 1.00 jne .LBB5_5 + 1 1 0.50 lea rsi, [rdi + r8] + 1 5 0.50 * movzx edi, word ptr [rdi + r8] + 1 1 0.33 cmp di, -16192 + 1 1 0.33 mov edx, 2 + 2 2 0.67 cmove rdx, rcx 1 0 0.25 xor eax, eax - 1 1 0.33 cmp esi, 49344 - 2 2 0.67 cmove rax, r8 - 1 1 0.33 mov rdx, rcx + 1 1 0.33 cmp edi, 49344 + 2 2 0.67 cmove rax, rsi 1 1 1.00 U ret @@ -61,35 +60,34 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 8.65 8.65 - 8.70 0.50 0.50 + - - 8.01 8.49 - 8.50 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.67 0.30 - 0.03 - - movabs rax, 3074457345618258598 - - - 0.01 0.99 - - - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - 0.99 0.01 - - - - lea r8d, [rsi + rdi] - - - - - - - - - xor ecx, ecx - - - 0.35 0.62 - 0.03 - - mov eax, 0 - - - 0.99 0.01 - - - - test r8b, 3 - - - - - - 1.00 - - je .LBB5_3 - - - 0.68 0.30 - 0.02 - - mov rdx, rcx - - - - - - 1.00 - - ret - - - 0.07 0.93 - - - - lea rax, [rdx + 2*rdx] - - - 0.06 0.35 - 0.59 - - or rax, 3 - - - 0.02 0.07 - 0.91 - - add rax, 9 - - - 0.01 0.04 - 0.95 - - sub rsi, rax - - - - - - 1.00 - - jae .LBB5_4 + - - 0.48 0.50 - 0.02 - - mov rcx, rdx + - - 0.02 0.52 - 0.46 - - mov edx, 1 + - - 0.49 0.51 - - - - movabs rax, 3074457345618258598 + - - 0.51 0.48 - 0.01 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_4 + - - 0.48 0.52 - - - - lea rax, [rcx + 2*rcx] + - - 0.52 0.48 - - - - or rax, 3 
+ - - 0.52 0.47 - 0.01 - - add rax, 9 + - - 0.48 0.52 - - - - mov r8, rsi + - - 0.51 0.01 - 0.48 - - sub r8, rax + - - - - - 1.00 - - jae .LBB5_2 - - - - - - - - xor eax, eax - - - 0.92 0.01 - 0.07 - - mov edx, 1 - - - - - 1.00 - - ret - - - - 1.00 - - - - lea r8, [rdi + rsi] - - - - - - - 0.50 0.50 movzx esi, word ptr [rdi + rsi] - - - 0.01 0.99 - - - - cmp si, -16192 - - - 0.88 0.04 - 0.08 - - mov ecx, 2 - - - 1.00 0.99 - 0.01 - - cmove rcx, rdx + - - 0.01 0.50 - 0.49 - - add esi, edi + - - - - - - - - xor edx, edx + - - 0.04 0.95 - 0.01 - - mov eax, 0 + - - 0.01 0.50 - 0.49 - - test sil, 3 + - - - - - 1.00 - - jne .LBB5_5 + - - 0.50 0.50 - - - - lea rsi, [rdi + r8] + - - - - - - 0.50 0.50 movzx edi, word ptr [rdi + r8] + - - 0.97 0.02 - 0.01 - - cmp di, -16192 + - - 0.48 0.51 - 0.01 - - mov edx, 2 + - - 0.99 0.51 - 0.50 - - cmove rdx, rcx - - - - - - - - xor eax, eax - - - 0.99 0.01 - - - - cmp esi, 49344 - - - 1.00 1.00 - - - - cmove rax, r8 - - - - 0.99 - 0.01 - - mov rdx, rcx + - - 0.02 0.48 - 0.50 - - cmp edi, 49344 + - - 0.98 0.51 - 0.51 - - cmove rax, rsi - - - - - 1.00 - - ret diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 index 952eb12de8..ee0c7db854 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 +++ b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64 @@ -1,28 +1,29 @@ bench_try_ref_from_suffix_with_elems_dynamic_size: + mov rcx, rdx + mov edx, 1 movabs rax, 4611686018427387901 - cmp rdx, rax - ja .LBB5_1 - lea r8d, [rsi + rdi] - xor ecx, ecx - mov eax, 0 - test r8b, 1 - jne .LBB5_5 - lea rax, [2*rdx + 4] - sub rsi, rax + cmp rcx, rax + ja .LBB5_3 + lea rax, [2*rcx + 4] + mov r8, rsi + sub r8, rax jae .LBB5_4 -.LBB5_1: +.LBB5_3: xor eax, eax - mov edx, 1 ret .LBB5_4: - lea r8, [rdi + rsi] - movzx esi, word ptr [rdi + rsi] - cmp si, -16192 - mov ecx, 2 - cmove rcx, rdx + add esi, edi + xor edx, edx + mov eax, 0 + test sil, 1 + jne 
.LBB5_6 + lea rsi, [rdi + r8] + movzx edi, word ptr [rdi + r8] + cmp di, -16192 + mov edx, 2 + cmove rdx, rcx xor eax, eax - cmp esi, 49344 - cmove rax, r8 -.LBB5_5: - mov rdx, rcx + cmp edi, 49344 + cmove rax, rsi +.LBB5_6: ret diff --git a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca index d4f78f67a2..7eb924c596 100644 --- a/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca +++ b/benches/try_ref_from_suffix_with_elems_dynamic_size.x86-64.mca @@ -1,12 +1,12 @@ Iterations: 100 -Instructions: 2400 -Total Cycles: 1107 -Total uOps: 2600 +Instructions: 2500 +Total Cycles: 1105 +Total uOps: 2700 Dispatch Width: 4 -uOps Per Cycle: 2.35 -IPC: 2.17 -Block RThroughput: 6.5 +uOps Per Cycle: 2.44 +IPC: 2.26 +Block RThroughput: 6.8 Instruction Info: @@ -18,29 +18,30 @@ Instruction Info: [6]: HasSideEffects (U) [1] [2] [3] [4] [5] [6] Instructions: + 1 1 0.33 mov rcx, rdx + 1 1 0.33 mov edx, 1 1 1 0.33 movabs rax, 4611686018427387901 - 1 1 0.33 cmp rdx, rax - 1 1 1.00 ja .LBB5_1 - 1 1 0.50 lea r8d, [rsi + rdi] - 1 0 0.25 xor ecx, ecx - 1 1 0.33 mov eax, 0 - 1 1 0.33 test r8b, 1 - 1 1 1.00 jne .LBB5_5 - 1 1 0.50 lea rax, [2*rdx + 4] - 1 1 0.33 sub rsi, rax + 1 1 0.33 cmp rcx, rax + 1 1 1.00 ja .LBB5_3 + 1 1 0.50 lea rax, [2*rcx + 4] + 1 1 0.33 mov r8, rsi + 1 1 0.33 sub r8, rax 1 1 1.00 jae .LBB5_4 1 0 0.25 xor eax, eax - 1 1 0.33 mov edx, 1 1 1 1.00 U ret - 1 1 0.50 lea r8, [rdi + rsi] - 1 5 0.50 * movzx esi, word ptr [rdi + rsi] - 1 1 0.33 cmp si, -16192 - 1 1 0.33 mov ecx, 2 - 2 2 0.67 cmove rcx, rdx + 1 1 0.33 add esi, edi + 1 0 0.25 xor edx, edx + 1 1 0.33 mov eax, 0 + 1 1 0.33 test sil, 1 + 1 1 1.00 jne .LBB5_6 + 1 1 0.50 lea rsi, [rdi + r8] + 1 5 0.50 * movzx edi, word ptr [rdi + r8] + 1 1 0.33 cmp di, -16192 + 1 1 0.33 mov edx, 2 + 2 2 0.67 cmove rdx, rcx 1 0 0.25 xor eax, eax - 1 1 0.33 cmp esi, 49344 - 2 2 0.67 cmove rax, r8 - 1 1 0.33 mov rdx, rcx + 1 1 0.33 cmp edi, 49344 
+ 2 2 0.67 cmove rax, rsi 1 1 1.00 U ret @@ -57,31 +58,32 @@ Resources: Resource pressure per iteration: [0] [1] [2] [3] [4] [5] [6.0] [6.1] - - - 6.99 7.00 - 8.01 0.50 0.50 + - - 7.50 7.52 - 7.98 0.50 0.50 Resource pressure by instruction: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions: - - - 0.02 0.95 - 0.03 - - movabs rax, 4611686018427387901 - - - 0.93 0.04 - 0.03 - - cmp rdx, rax - - - - - - 1.00 - - ja .LBB5_1 - - - 0.96 0.04 - - - - lea r8d, [rsi + rdi] - - - - - - - - - xor ecx, ecx - - - 0.95 0.02 - 0.03 - - mov eax, 0 - - - 0.95 0.05 - - - - test r8b, 1 - - - - - - 1.00 - - jne .LBB5_5 - - - 0.06 0.94 - - - - lea rax, [2*rdx + 4] - - - 0.93 0.07 - - - - sub rsi, rax + - - 0.47 0.52 - 0.01 - - mov rcx, rdx + - - 0.50 0.49 - 0.01 - - mov edx, 1 + - - 0.49 0.49 - 0.02 - - movabs rax, 4611686018427387901 + - - 0.48 0.51 - 0.01 - - cmp rcx, rax + - - - - - 1.00 - - ja .LBB5_3 + - - 0.51 0.49 - - - - lea rax, [2*rcx + 4] + - - 0.49 0.51 - - - - mov r8, rsi + - - 0.48 0.52 - - - - sub r8, rax - - - - - 1.00 - - jae .LBB5_4 - - - - - - - - xor eax, eax - - - 0.03 0.95 - 0.02 - - mov edx, 1 - - - - - 1.00 - - ret - - - 0.97 0.03 - - - - lea r8, [rdi + rsi] - - - - - - - 0.50 0.50 movzx esi, word ptr [rdi + rsi] - - - 0.03 0.97 - - - - cmp si, -16192 - - - 0.05 0.94 - 0.01 - - mov ecx, 2 - - - 0.06 0.98 - 0.96 - - cmove rcx, rdx + - - 0.47 0.47 - 0.06 - - add esi, edi + - - - - - - - - xor edx, edx + - - 0.51 0.49 - - - - mov eax, 0 + - - 0.47 0.47 - 0.06 - - test sil, 1 + - - - - - 1.00 - - jne .LBB5_6 + - - 0.52 0.48 - - - - lea rsi, [rdi + r8] + - - - - - - 0.50 0.50 movzx edi, word ptr [rdi + r8] + - - 0.50 0.04 - 0.46 - - cmp di, -16192 + - - 0.49 0.50 - 0.01 - - mov edx, 2 + - - 0.54 0.52 - 0.94 - - cmove rdx, rcx - - - - - - - - xor eax, eax - - - 0.97 0.03 - - - - cmp esi, 49344 - - - 0.06 0.96 - 0.98 - - cmove rax, r8 - - - 0.02 0.03 - 0.95 - - mov rdx, rcx + - - 0.04 0.49 - 0.47 - - cmp edi, 49344 + - - 0.54 0.53 - 0.93 - - cmove rax, rsi - - - - - 1.00 
- - ret diff --git a/src/layout.rs b/src/layout.rs index 19ad5ca85f..e2b322e8a6 100644 --- a/src/layout.rs +++ b/src/layout.rs @@ -638,37 +638,7 @@ impl DstLayout { addr.checked_add(bytes_len).is_some(), "`addr` + `bytes_len` > usize::MAX" ); - - // Alignment checks go in their own block to avoid introducing variables - // into the top-level scope. - { - // We check alignment for `addr` (for prefix casts) or `addr + - // bytes_len` (for suffix casts). For a prefix cast, the correctness - // of this check is trivial - `addr` is the address the object will - // live at. - // - // For a suffix cast, we know that all valid sizes for the type are - // a multiple of the alignment (and by safety precondition, we know - // `DstLayout` may only describe valid Rust types). Thus, a - // validly-sized instance which lives at a validly-aligned address - // must also end at a validly-aligned address. Thus, if the end - // address for a suffix cast (`addr + bytes_len`) is not aligned, - // then no valid start address will be aligned either. - let offset = match cast_type { - CastType::Prefix => 0, - CastType::Suffix => bytes_len, - }; - - // Addition is guaranteed not to overflow because `offset <= - // bytes_len`, and `addr + bytes_len <= usize::MAX` is a - // precondition of this method. Modulus is guaranteed not to divide - // by 0 because `align` is non-zero. - #[allow(clippy::arithmetic_side_effects)] - if (addr + offset) % self.align.get() != 0 { - return Err(MetadataCastError::Alignment); - } - } - + let (elems, self_bytes) = match size_info { SizeInfo::Sized { size } => { if size > bytes_len { @@ -682,7 +652,7 @@ impl DstLayout { // multiple of the alignment, or will be larger than // `bytes_len`. let max_total_bytes = - util::round_down_to_next_multiple_of_alignment(bytes_len, self.align); + util::round_down_to_next_multiple_of_alignment(bytes_len, self.align); // Calculate the maximum number of bytes that could be consumed // by the trailing slice. 
// @@ -693,7 +663,7 @@ impl DstLayout { // `bytes_len` too small even for 0 trailing slice elements. None => return Err(MetadataCastError::Size), }; - + // Calculate the number of elements that fit in // `max_slice_and_padding_bytes`; any remaining bytes will be // considered padding. @@ -728,10 +698,40 @@ impl DstLayout { // `self_bytes` up to `max_total_bytes`. #[allow(clippy::arithmetic_side_effects)] let self_bytes = - without_padding + util::padding_needed_for(without_padding, self.align); + without_padding + util::padding_needed_for(without_padding, self.align); (elems, self_bytes) } }; + + // Alignment checks go in their own block to avoid introducing variables + // into the top-level scope. + { + // We check alignment for `addr` (for prefix casts) or `addr + + // bytes_len` (for suffix casts). For a prefix cast, the correctness + // of this check is trivial - `addr` is the address the object will + // live at. + // + // For a suffix cast, we know that all valid sizes for the type are + // a multiple of the alignment (and by safety precondition, we know + // `DstLayout` may only describe valid Rust types). Thus, a + // validly-sized instance which lives at a validly-aligned address + // must also end at a validly-aligned address. Thus, if the end + // address for a suffix cast (`addr + bytes_len`) is not aligned, + // then no valid start address will be aligned either. + let offset = match cast_type { + CastType::Prefix => 0, + CastType::Suffix => bytes_len, + }; + + // Addition is guaranteed not to overflow because `offset <= + // bytes_len`, and `addr + bytes_len <= usize::MAX` is a + // precondition of this method. Modulus is guaranteed not to divide + // by 0 because `align` is non-zero. + #[allow(clippy::arithmetic_side_effects)] + if (addr + offset) % self.align.get() != 0 { + return Err(MetadataCastError::Alignment); + } + } __const_debug_assert!(self_bytes <= bytes_len);