Check elision of bounds checking for simple loops
When checking unsafe usage for https://github.com/image-rs/imageproc/issues/371 I found a few cases where benchmarks showed a speedup from using unsafe getters despite the accesses having the following form:
for y in 0..image.height() {
for x in 0..image.width() {
...
let p = image.unsafe_get_pixel(x, y);
...
}
}
Check the generated assembly from using the current accessor functions, starting from a trivial loop, e.g.
/// Adds `d` to every pixel of `image` in place, walking the image row by row
/// through the safe `get_pixel` / `put_pixel` accessors.
///
/// The sum uses plain `+` on `u8`, so overflow handling follows the build's
/// overflow-check setting.
pub fn trivial_loop(image: &mut GrayImage, d: u8) {
    for row in 0..image.height() {
        for col in 0..image.width() {
            // Read the single luma channel, then write the shifted value back.
            let current = image.get_pixel(col, row)[0];
            let updated = Luma([current + d]);
            image.put_pixel(col, row, updated);
        }
    }
}
/// Adds `d` to every pixel of `image` in place, walking the image row by row
/// through the `unsafe_get_pixel` / `unsafe_put_pixel` accessors.
///
/// The sum uses plain `+` on `u8`, so overflow handling follows the build's
/// overflow-check setting.
pub fn trivial_loop_unsafe(image: &mut GrayImage, d: u8) {
    for row in 0..image.height() {
        for col in 0..image.width() {
            // SAFETY: the loop ranges guarantee `col < image.width()` and
            // `row < image.height()`. NOTE(review): presumably that is the
            // whole precondition of the unsafe accessors — confirm against
            // their documentation.
            unsafe {
                let current = image.unsafe_get_pixel(col, row)[0];
                image.unsafe_put_pixel(col, row, Luma([current + d]));
            }
        }
    }
}
As one can see from this playground, the compiler does not elide all bounds checks and is prepared to panic in the non-unsafe loop. Why, we don't know. Comment out the safe version to see that the unsafe variant will never panic.
Oh, cool. I didn't realise you could use image from the playground now.
Here are two more variants; unlike the unsafe version, neither of them gets SIMD-optimized.
/// Adds `d` to every pixel of `image` in place, visiting pixels through
/// `enumerate_pixels_mut` and ignoring the coordinates it yields.
pub fn enumerate_loop(image: &mut GrayImage, d: u8) {
    for (_col, _row, pixel) in image.enumerate_pixels_mut() {
        // Only the single luma channel is read and rewritten.
        let shifted = pixel[0] + d;
        *pixel = Luma([shifted]);
    }
}
/// Adds `d` to every sample of `image` in place by indexing the underlying
/// sample buffer from `0` up to `width * height`.
///
/// Each access goes through slice indexing on `**image`.
pub fn samples_loop(image: &mut GrayImage, d: u8) {
    // Widen each dimension to `usize` *before* multiplying: the product of two
    // `u32` values can exceed `u32::MAX`, which would panic in debug builds
    // and silently wrap (covering only part of the buffer) in release builds.
    let samples = image.width() as usize * image.height() as usize;
    for i in 0..samples {
        let p = (**image)[i];
        (**image)[i] = p + d;
    }
}
Using cargo asm...
Safe loop:
imageproc::gradients::trivial_loop (src/gradients.rs:10):
push rbp
mov rbp, rsp
sub rsp, 96
mov r8d, dword, ptr, [rdi, +, 28]
test r8, r8
je LBB162_12
xor r9d, r9d
mov edx, dword, ptr, [rdi, +, 24]
test rdx, rdx
jne LBB162_4
LBB162_2:
inc r9
cmp r9, r8
je LBB162_12
mov edx, dword, ptr, [rdi, +, 24]
test rdx, rdx
je LBB162_2
LBB162_4:
mov eax, dword, ptr, [rdi, +, 28]
cmp r9, rax
jae LBB162_5
lea r10, [rdx, -, 1]
xor ecx, ecx
LBB162_8:
imul rdx, r9
lea r11, [rcx, +, rdx]
mov rax, qword, ptr, [rdi, +, 16]
cmp rax, r11
jbe LBB162_13
add rdx, qword, ptr, [rdi]
add byte, ptr, [rcx, +, rdx], sil
cmp r10, rcx
je LBB162_2
inc rcx
mov edx, dword, ptr, [rdi, +, 24]
mov eax, dword, ptr, [rdi, +, 28]
cmp rcx, rdx
jae LBB162_6
cmp r9, rax
jb LBB162_8
jmp LBB162_6
LBB162_12:
add rsp, 96
pop rbp
ret
LBB162_13:
lea rdi, [rdx, +, rcx, +, 1]
mov rsi, rax
call core::slice::slice_index_len_fail
LBB162_5:
xor ecx, ecx
LBB162_6:
mov dword, ptr, [rbp, -, 8], ecx
mov dword, ptr, [rbp, -, 4], r9d
mov dword, ptr, [rbp, -, 16], edx
mov dword, ptr, [rbp, -, 12], eax
lea rax, [rbp, -, 8]
mov qword, ptr, [rbp, -, 48], rax
lea rax, [rip, +, __ZN59_$LT$$LP$T10$C$$u20$T11$RP$$u20$as$u20$core..fmt..Debug$GT$3fmt17h80913d96e83c0fd7E]
mov qword, ptr, [rbp, -, 40], rax
lea rcx, [rbp, -, 16]
mov qword, ptr, [rbp, -, 32], rcx
mov qword, ptr, [rbp, -, 24], rax
lea rax, [rip, +, l_anon.51f5f939b630edf9f4882189126992de.28]
mov qword, ptr, [rbp, -, 96], rax
mov qword, ptr, [rbp, -, 88], 2
mov qword, ptr, [rbp, -, 80], 0
lea rax, [rbp, -, 48]
mov qword, ptr, [rbp, -, 64], rax
mov qword, ptr, [rbp, -, 56], 2
lea rsi, [rip, +, l_anon.51f5f939b630edf9f4882189126992de.31]
lea rdi, [rbp, -, 96]
call std::panicking::begin_panic_fmt
Unsafe loop:
imageproc::gradients::trivial_loop_unsafe (src/gradients.rs:20):
push rbp
mov rbp, rsp
push r15
push r14
push r12
push rbx
mov r10d, dword, ptr, [rdi, +, 28]
test r10, r10
je LBB163_14
mov ecx, dword, ptr, [rdi, +, 24]
test rcx, rcx
je LBB163_14
mov r14, qword, ptr, [rdi]
mov r15d, ecx
and r15d, -32
lea r8, [r15, -, 32]
mov rdx, r8
shr rdx, 5
inc rdx
movzx eax, sil
movd xmm0, eax
pxor xmm1, xmm1
pshufb xmm0, xmm1
mov r9d, edx
and r9d, 1
lea rax, [r14, +, 48]
mov r11, r9
sub r11, rdx
xor r12d, r12d
mov rdx, r14
cmp ecx, 32
jae LBB163_5
jmp LBB163_4
LBB163_13:
inc r12
add rax, rcx
add rdx, rcx
cmp r12, r10
je LBB163_14
cmp ecx, 32
jae LBB163_5
LBB163_4:
xor edi, edi
jmp LBB163_12
LBB163_5:
test r8, r8
je LBB163_6
mov rbx, r11
xor edi, edi
LBB163_8:
movdqu xmm1, xmmword, ptr, [rax, +, rdi, -, 48]
movdqu xmm2, xmmword, ptr, [rax, +, rdi, -, 32]
movdqu xmm3, xmmword, ptr, [rax, +, rdi, -, 16]
movdqu xmm4, xmmword, ptr, [rax, +, rdi]
paddb xmm1, xmm0
paddb xmm2, xmm0
movdqu xmmword, ptr, [rax, +, rdi, -, 48], xmm1
movdqu xmmword, ptr, [rax, +, rdi, -, 32], xmm2
paddb xmm3, xmm0
paddb xmm4, xmm0
movdqu xmmword, ptr, [rax, +, rdi, -, 16], xmm3
movdqu xmmword, ptr, [rax, +, rdi], xmm4
add rdi, 64
add rbx, 2
jne LBB163_8
test r9, r9
je LBB163_11
LBB163_10:
mov rbx, r12
imul rbx, rcx
add rbx, rdi
movdqu xmm1, xmmword, ptr, [r14, +, rbx]
movdqu xmm2, xmmword, ptr, [r14, +, rbx, +, 16]
paddb xmm1, xmm0
paddb xmm2, xmm0
movdqu xmmword, ptr, [r14, +, rbx], xmm1
movdqu xmmword, ptr, [r14, +, rbx, +, 16], xmm2
LBB163_11:
mov rdi, r15
cmp r15, rcx
je LBB163_13
LBB163_12:
add byte, ptr, [rdx, +, rdi], sil
inc rdi
cmp rcx, rdi
jne LBB163_12
jmp LBB163_13
LBB163_6:
xor edi, edi
test r9, r9
jne LBB163_10
jmp LBB163_11
LBB163_14:
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
Luma::from_slice is not marked as inline, but changing this makes no difference.
I found a very manual but safe version whose main loop is optimized as nicely as the unsafe one — not a big surprise, though.
/// Adds `d` to every sample of `image` in place by slicing the underlying
/// sample buffer once to `width * height` elements and iterating over it.
pub fn samples_loop(image: &mut GrayImage, d: u8) {
    // Both dimensions are widened to `usize` before the multiplication.
    let sample_count = image.width() as usize * image.height() as usize;
    // Reborrow the image as its flat sample buffer.
    let buffer = &mut **image;
    // Slicing up front leaves a single range check before the loop.
    for sample in buffer[..sample_count].iter_mut() {
        *sample += d;
    }
}