imageproc Check elision of bounds checking for simple loops

When checking unsafe usage for https://github.com/image-rs/imageproc/issues/371 I found a few cases where benchmarks showed a speedup from using unsafe getters despite the accesses having the following form:

    for y in 0..image.height() {
        for x in 0..image.width() {
            ...
            let p = image.unsafe_get_pixel(x, y);
            ...
        }
    }

Check the generated assembly from using the current accessor functions, starting from a trivial loop, e.g.


/// Adds `d` to every pixel of `image` in place using the checked accessors.
///
/// Walks the image row by row (`y` outer, `x` inner), reading each pixel with
/// `get_pixel` and writing the sum back with `put_pixel`. This is the safe
/// baseline whose generated assembly is compared against
/// `trivial_loop_unsafe`.
///
/// `p + d` is a plain `u8` addition, so it overflows once a pixel value plus
/// `d` exceeds 255 (a panic when overflow checks are enabled, a wrap-around
/// otherwise).
pub fn trivial_loop(image: &mut GrayImage, d: u8) {
    for y in 0..image.height() {
        for x in 0..image.width() {
            // Read the single channel of the pixel at (x, y).
            let p = image.get_pixel(x, y)[0];
            image.put_pixel(x, y, Luma([p + d]));
        }
    }
}

/// Adds `d` to every pixel of `image` in place using the `unsafe_*` accessors.
///
/// Same traversal as `trivial_loop` (`y` outer, `x` inner), but each pixel is
/// read with `unsafe_get_pixel` and written with `unsafe_put_pixel`, so the
/// whole loop nest sits inside a single `unsafe` block.
///
/// `p + d` is a plain `u8` addition, so it overflows once a pixel value plus
/// `d` exceeds 255 (a panic when overflow checks are enabled, a wrap-around
/// otherwise).
pub fn trivial_loop_unsafe(image: &mut GrayImage, d: u8) {
    // SAFETY: `x` is drawn from `0..image.width()` and `y` from
    // `0..image.height()`, so every coordinate is below the dimensions the
    // image reports. NOTE(review): presumably that is the full contract of
    // the `unsafe_*` accessors — confirm against their documentation.
    unsafe {
        for y in 0..image.height() {
            for x in 0..image.width() {
                // Read the single channel of the pixel at (x, y).
                let p = image.unsafe_get_pixel(x, y)[0];
                image.unsafe_put_pixel(x, y, Luma([p + d]));
            }
        }
    }
}

Sep 01 '19 10:09 theotherphil

As one can see from this playground, the compiler does not elide all bounds checks and still emits a panic path for the safe loop. Why, we don't know. Comment out the safe version to see that the unsafe variant will never panic.

Sep 01 '19 12:09 197g

Oh, cool. I didn't realise you could use image from the playground now.

Sep 01 '19 12:09 theotherphil

Two more variants; unlike the unsafe version, neither of them gets SIMD-optimized.


/// Adds `d` to every pixel of `image` in place via `enumerate_pixels_mut`.
///
/// Only the mutable pixel reference from each yielded tuple is used; the
/// first two tuple fields (presumably the pixel's coordinates — confirm
/// against the iterator's documentation) are ignored.
///
/// `p[0] + d` is a plain `u8` addition and is therefore subject to the usual
/// integer-overflow behaviour.
pub fn enumerate_loop(image: &mut GrayImage, d: u8) {
    for (_, _, p) in image.enumerate_pixels_mut() {
        // Overwrite the pixel with a new single-channel value.
        *p = Luma([p[0] + d]);
    }
}

/// Adds `d` to every sample of `image` in place by indexing the raw buffer.
///
/// The number of samples is taken as `width * height`, and each sample is
/// read and written back through an index into the dereferenced image, so
/// every access is an individually bounds-checked index operation.
///
/// `p + d` is a plain `u8` addition and is therefore subject to the usual
/// integer-overflow behaviour.
pub fn samples_loop(image: &mut GrayImage, d: u8) {
    // Widen each dimension to `usize` *before* multiplying. Multiplying first
    // and casting afterwards performs the product in the narrower dimension
    // type, which can overflow for very large images and make the loop cover
    // only part of the buffer (or panic when overflow checks are enabled).
    let samples = image.width() as usize * image.height() as usize;
    for i in 0..samples {
        let p = (**image)[i];
        (**image)[i] = p + d;
    }
}

Sep 01 '19 12:09 197g

Using cargo asm...

Safe loop:

imageproc::gradients::trivial_loop (src/gradients.rs:10):
 push    rbp
 mov     rbp, rsp
 sub     rsp, 96
 mov     r8d, dword, ptr, [rdi, +, 28]
 test    r8, r8
 je      LBB162_12
 xor     r9d, r9d
 mov     edx, dword, ptr, [rdi, +, 24]
 test    rdx, rdx
 jne     LBB162_4
LBB162_2:
 inc     r9
 cmp     r9, r8
 je      LBB162_12
 mov     edx, dword, ptr, [rdi, +, 24]
 test    rdx, rdx
 je      LBB162_2
LBB162_4:
 mov     eax, dword, ptr, [rdi, +, 28]
 cmp     r9, rax
 jae     LBB162_5
 lea     r10, [rdx, -, 1]
 xor     ecx, ecx
LBB162_8:
 imul    rdx, r9
 lea     r11, [rcx, +, rdx]
 mov     rax, qword, ptr, [rdi, +, 16]
 cmp     rax, r11
 jbe     LBB162_13
 add     rdx, qword, ptr, [rdi]
 add     byte, ptr, [rcx, +, rdx], sil
 cmp     r10, rcx
 je      LBB162_2
 inc     rcx
 mov     edx, dword, ptr, [rdi, +, 24]
 mov     eax, dword, ptr, [rdi, +, 28]
 cmp     rcx, rdx
 jae     LBB162_6
 cmp     r9, rax
 jb      LBB162_8
 jmp     LBB162_6
LBB162_12:
 add     rsp, 96
 pop     rbp
 ret
LBB162_13:
 lea     rdi, [rdx, +, rcx, +, 1]
 mov     rsi, rax
 call    core::slice::slice_index_len_fail
LBB162_5:
 xor     ecx, ecx
LBB162_6:
 mov     dword, ptr, [rbp, -, 8], ecx
 mov     dword, ptr, [rbp, -, 4], r9d
 mov     dword, ptr, [rbp, -, 16], edx
 mov     dword, ptr, [rbp, -, 12], eax
 lea     rax, [rbp, -, 8]
 mov     qword, ptr, [rbp, -, 48], rax
 lea     rax, [rip, +, __ZN59_$LT$$LP$T10$C$$u20$T11$RP$$u20$as$u20$core..fmt..Debug$GT$3fmt17h80913d96e83c0fd7E]
 mov     qword, ptr, [rbp, -, 40], rax
 lea     rcx, [rbp, -, 16]
 mov     qword, ptr, [rbp, -, 32], rcx
 mov     qword, ptr, [rbp, -, 24], rax
 lea     rax, [rip, +, l_anon.51f5f939b630edf9f4882189126992de.28]
 mov     qword, ptr, [rbp, -, 96], rax
 mov     qword, ptr, [rbp, -, 88], 2
 mov     qword, ptr, [rbp, -, 80], 0
 lea     rax, [rbp, -, 48]
 mov     qword, ptr, [rbp, -, 64], rax
 mov     qword, ptr, [rbp, -, 56], 2
 lea     rsi, [rip, +, l_anon.51f5f939b630edf9f4882189126992de.31]
 lea     rdi, [rbp, -, 96]
 call    std::panicking::begin_panic_fmt

Unsafe loop:

imageproc::gradients::trivial_loop_unsafe (src/gradients.rs:20):
 push    rbp
 mov     rbp, rsp
 push    r15
 push    r14
 push    r12
 push    rbx
 mov     r10d, dword, ptr, [rdi, +, 28]
 test    r10, r10
 je      LBB163_14
 mov     ecx, dword, ptr, [rdi, +, 24]
 test    rcx, rcx
 je      LBB163_14
 mov     r14, qword, ptr, [rdi]
 mov     r15d, ecx
 and     r15d, -32
 lea     r8, [r15, -, 32]
 mov     rdx, r8
 shr     rdx, 5
 inc     rdx
 movzx   eax, sil
 movd    xmm0, eax
 pxor    xmm1, xmm1
 pshufb  xmm0, xmm1
 mov     r9d, edx
 and     r9d, 1
 lea     rax, [r14, +, 48]
 mov     r11, r9
 sub     r11, rdx
 xor     r12d, r12d
 mov     rdx, r14
 cmp     ecx, 32
 jae     LBB163_5
 jmp     LBB163_4
LBB163_13:
 inc     r12
 add     rax, rcx
 add     rdx, rcx
 cmp     r12, r10
 je      LBB163_14
 cmp     ecx, 32
 jae     LBB163_5
LBB163_4:
 xor     edi, edi
 jmp     LBB163_12
LBB163_5:
 test    r8, r8
 je      LBB163_6
 mov     rbx, r11
 xor     edi, edi
LBB163_8:
 movdqu  xmm1, xmmword, ptr, [rax, +, rdi, -, 48]
 movdqu  xmm2, xmmword, ptr, [rax, +, rdi, -, 32]
 movdqu  xmm3, xmmword, ptr, [rax, +, rdi, -, 16]
 movdqu  xmm4, xmmword, ptr, [rax, +, rdi]
 paddb   xmm1, xmm0
 paddb   xmm2, xmm0
 movdqu  xmmword, ptr, [rax, +, rdi, -, 48], xmm1
 movdqu  xmmword, ptr, [rax, +, rdi, -, 32], xmm2
 paddb   xmm3, xmm0
 paddb   xmm4, xmm0
 movdqu  xmmword, ptr, [rax, +, rdi, -, 16], xmm3
 movdqu  xmmword, ptr, [rax, +, rdi], xmm4
 add     rdi, 64
 add     rbx, 2
 jne     LBB163_8
 test    r9, r9
 je      LBB163_11
LBB163_10:
 mov     rbx, r12
 imul    rbx, rcx
 add     rbx, rdi
 movdqu  xmm1, xmmword, ptr, [r14, +, rbx]
 movdqu  xmm2, xmmword, ptr, [r14, +, rbx, +, 16]
 paddb   xmm1, xmm0
 paddb   xmm2, xmm0
 movdqu  xmmword, ptr, [r14, +, rbx], xmm1
 movdqu  xmmword, ptr, [r14, +, rbx, +, 16], xmm2
LBB163_11:
 mov     rdi, r15
 cmp     r15, rcx
 je      LBB163_13
LBB163_12:
 add     byte, ptr, [rdx, +, rdi], sil
 inc     rdi
 cmp     rcx, rdi
 jne     LBB163_12
 jmp     LBB163_13
LBB163_6:
 xor     edi, edi
 test    r9, r9
 jne     LBB163_10
 jmp     LBB163_11
LBB163_14:
 pop     rbx
 pop     r12
 pop     r14
 pop     r15
 pop     rbp
 ret

Sep 01 '19 13:09 theotherphil

Luma::from_slice is not marked as inline, but changing this makes no difference.

Sep 01 '19 13:09 theotherphil

Found a very manual but safe version whose main loop is optimized as nicely as the unsafe one — not a very big surprise, though.

/// Adds `d` to every sample of `image` in place by iterating over a sub-slice
/// of the raw buffer.
///
/// The sample count is computed once, the buffer is sliced to that length
/// once, and the loop then walks the resulting sub-slice by mutable
/// reference instead of indexing each element.
///
/// `*p + d` is a plain `u8` addition and is therefore subject to the usual
/// integer-overflow behaviour.
pub fn samples_loop(image: &mut GrayImage, d: u8) {
    // Each dimension is cast to `usize` before multiplying, so the product is
    // computed in `usize`.
    let samples = image.width() as usize*image.height() as usize;
    // Double dereference to reach the underlying sample buffer (presumably a
    // `[u8]` slice); this shadows the `image` parameter.
    let image = &mut **image;
    // Slicing to `..samples` checks the length once, up front; the iterator
    // then visits each element without a per-element index.
    for p in &mut image[..samples] {
        *p = *p + d;
    }
}

Sep 01 '19 14:09 197g