Add overloaded function for count equals 1 in stack.h
Add overloaded functions for the count == 1 case in stack.h to eliminate the redundant multiplication by 1.
Why?
Why?
In some practical applications I have tested, the value of count is frequently 1. When count is 1, multiplication related to count is redundant. Meanwhile, it can also avoid loading 1 from memory.
Why?
In some practical applications I have tested, the value of count is frequently 1. When count is 1, multiplication related to count is redundant. Meanwhile, it can also avoid loading 1 from memory.
These functions are all inline so the compiler will remove the multiplication after optimization.
Something like this https://godbolt.org/z/9YvPq1Y3h
Reviewing the gdb disassembly, it appears that the compiler has not optimized away the multiplication when count equals 1.
My compiler is gcc9.4.0.
Is that with optimization enabled?
Is that with optimization enabled?
Yes, I use release build with -O3 enabled for testing.
I tried to write a minimal reproduction as follows, but could not find any such multiplication on macOS (clang).
#include "rapidjson/allocators.h"
#include "rapidjson/internal/stack.h"
// Codegen probe: pushes one char onto the stack without passing an explicit
// count, and hands back whatever pointer Push<char>() returns.
char* test_push(rapidjson::internal::Stack<rapidjson::CrtAllocator>& s) {
    char* const pushed = s.Push<char>();
    return pushed;
}
// Codegen probe: same as a plain push, but through PushUnsafe<char>() with no
// explicit count; the returned pointer is forwarded to the caller unchanged.
char* test_push_unsafe(rapidjson::internal::Stack<rapidjson::CrtAllocator>& s) {
    char* const pushed = s.PushUnsafe<char>();
    return pushed;
}
// Codegen probe: asks the stack to reserve room via Reserve<char>() with no
// explicit count. Nothing is returned; only the emitted code is of interest.
void test_reserve(
    rapidjson::internal::Stack<rapidjson::CrtAllocator>& s)
{
    s.Reserve<char>();
}
g++ -I include -D NDEBUG -S -o- -O3 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -fverbose-asm test.cpp
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 13, 0 sdk_version 13, 3
.globl __Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
.p2align 2
; Demangled: foo(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = reference to the Stack object.
; Leaf function: no frame, no call, no capacity check, no multiply.
; It reads the pointer stored at offset 24 of the object, writes that pointer
; plus the immediate 1 back to offset 24, and stores the byte 97 ('a') through
; the old pointer.
; NOTE(review): offset 24 is presumably the stack-top pointer; confirm against
; the Stack member layout. The C++ source of foo is not part of this listing.
__Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
; x8 = old pointer at [obj + 24]; [obj + 24] = x8 + 1
ldr x8, [x0, #24]
add x9, x8, #1
str x9, [x0, #24]
; *old pointer = 97
mov w9, #97
strb w9, [x8]
ret
; -- End function
.globl __Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
.p2align 2
; Demangled: bar(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = reference to the Stack object.
; Out: w0 = the byte just below the pointer at [obj + 24], sign-extended.
; Fast path (LBB1_1): taken when [obj + 32] - [obj + 24] is > 0 (signed);
; bumps the pointer at offset 24 by the immediate 1 and stores 97 ('a').
; Slow path (LBB1_2): calls Stack::Expand<char>(unsigned long) with x1 = 1,
; reloads the pointer at offset 24 and rejoins the fast path.
; No multiply instruction appears in either path.
; NOTE(review): offsets 24/32 are presumably the stack-top / stack-end
; pointers; confirm against the Stack member layout. The C++ source of bar is
; not part of this listing.
__Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
stp x29, x30, [sp, #16] ; 16-byte Folded Spill
add x29, sp, #16
; x19 keeps the object reference alive across the possible call below.
mov x19, x0
; x8 = [obj + 24], x9 = [obj + 32]; branch to the slow path if x9 - x8 <= 0.
ldp x8, x9, [x0, #24]
sub x9, x9, x8
cmp x9, #0
b.le LBB1_2
LBB1_1:
; [obj + 24] = x8 + 1; *x8 = 97
add x9, x8, #1
str x9, [x19, #24]
mov w9, #97
strb w9, [x8]
; Reload the updated pointer and return the signed byte at pointer - 1.
ldr x8, [x19, #24]
ldursb w0, [x8, #-1]
ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
ret
LBB1_2:
; Expand<char>(obj, 1), then re-read [obj + 24] because the call rewrites it.
mov x0, x19
mov w1, #1
bl __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
ldr x8, [x19, #24]
b LBB1_1
; -- End function
.globl __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm ; -- Begin function _ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
.weak_def_can_be_hidden __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
.p2align 2
; Demangled: void rapidjson::internal::Stack<rapidjson::CrtAllocator>::Expand<char>(unsigned long)
; In:  x0 = the Stack object, x1 = the unsigned long argument.
; Register roles after the prologue:
;   x19 = object, x20 = old buffer pointer ([obj + 16]), x21 = argument then
;   new size in bytes, x22 = bytes in use ([obj + 24] - buffer base).
; Picks a new size, resizes the buffer at [obj + 16] with realloc (or frees it
; when the new size is 0), then rewrites the three pointers at offsets 16, 24
; and 32. No multiply instruction appears anywhere in this function.
; NOTE(review): offsets 0/8/16/24/32/40 presumably map to the allocator,
; owned allocator, buffer, top, end and initial capacity members; confirm
; against the Stack member layout.
__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm: ; @_ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
; %bb.0:
stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
stp x20, x19, [sp, #16] ; 16-byte Folded Spill
stp x29, x30, [sp, #32] ; 16-byte Folded Spill
add x29, sp, #32
mov x21, x1
mov x19, x0
; x20 = current buffer pointer; a null buffer takes the LBB2_2 path.
ldr x20, [x0, #16]
cbz x20, LBB2_2
; %bb.1:
; Buffer exists: cap = [obj + 32] - buffer; candidate size x8 = cap + (cap + 1) / 2.
ldr x8, [x19, #32]
sub x8, x8, x20
add x9, x8, #1
add x8, x8, x9, lsr #1
mov x9, x20
b LBB2_5
LBB2_2:
; No buffer yet: if the pointer at [obj + 0] is null, call operator new(1) and
; store the result at both [obj + 0] and [obj + 8].
ldr x8, [x19]
cbnz x8, LBB2_4
; %bb.3:
mov w0, #1
bl __Znwm
stp x0, x0, [x19]
LBB2_4:
; Base for the in-use computation is 0; candidate size x8 = [obj + 40].
mov x9, #0
ldr x8, [x19, #40]
LBB2_5:
; x22 = [obj + 24] - base (bytes in use); x9 = in-use + argument.
ldr x10, [x19, #24]
sub x22, x10, x9
add x9, x22, x21
; x21 = the larger (unsigned) of the candidate size and in-use + argument.
cmp x8, x9
mov x0, x20
csel x21, x8, x9, hi
cbz x21, LBB2_7
; %bb.6:
; Non-zero size: realloc(old buffer, new size).
mov x1, x21
bl _realloc
b LBB2_8
LBB2_7:
; Zero size: free(old buffer) and continue with a null buffer.
bl _free
mov x0, #0
LBB2_8:
; [obj + 16] = new buffer, [obj + 24] = new buffer + in-use,
; [obj + 32] = new buffer + new size.
add x8, x0, x22
stp x0, x8, [x19, #16]
add x8, x0, x21
str x8, [x19, #32]
ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
ret
; -- End function
.subsections_via_symbols
➜ rapidjson git:(master) ✗ g++ -I include -D NDEBUG -S -o- -O3 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -fverbose-asm test.cpp
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 13, 0 sdk_version 13, 3
.globl __Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
.p2align 2
; Demangled: test_push(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = reference to the Stack object.
; Out: x0 = the pointer that was stored at [obj + 24] before it was bumped.
; Fast path (%bb.1): taken when [obj + 32] - [obj + 24] is > 0 (signed); the
; pointer at offset 24 is advanced by the immediate 1.
; Slow path (LBB0_2): calls Stack::Expand<char>(unsigned long) with x1 = 1,
; then performs the same bump on the reloaded pointer.
; The count of 1 shows up only as immediates (#1); there is no multiply.
; NOTE(review): offsets 24/32 are presumably the stack-top / stack-end
; pointers; confirm against the Stack member layout.
__Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill
stp x29, x30, [sp, #16] ; 16-byte Folded Spill
add x29, sp, #16
; x19 keeps the object reference alive across the possible call below.
mov x19, x0
; x0 = [obj + 24], x8 = [obj + 32]; branch to the slow path if x8 - x0 <= 0.
ldp x0, x8, [x0, #24]
sub x8, x8, x0
cmp x8, #0
b.le LBB0_2
; %bb.1:
; [obj + 24] = x0 + 1; x0 (the old pointer) is the return value.
add x8, x0, #1
str x8, [x19, #24]
ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
ret
LBB0_2:
; Expand<char>(obj, 1), then bump the freshly reloaded pointer.
mov x0, x19
mov w1, #1
bl __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
ldr x0, [x19, #24]
add x8, x0, #1
str x8, [x19, #24]
ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
ret
; -- End function
.globl __Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
.p2align 2
; Demangled: test_push_unsafe(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = reference to the Stack object.
; Out: x0 = the pointer that was stored at [obj + 24] before it was bumped.
; Leaf function with no capacity check and no call: the pointer at offset 24
; is advanced by the immediate 1 and its previous value is returned.
; No multiply instruction is emitted.
; NOTE(review): offset 24 is presumably the stack-top pointer; confirm against
; the Stack member layout.
__Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
ldr x8, [x0, #24]
add x9, x8, #1
str x9, [x0, #24]
; Return the pre-increment pointer.
mov x0, x8
ret
; -- End function
.globl __Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
.p2align 2
; Demangled: test_reserve(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = reference to the Stack object.
; Returns immediately when [obj + 32] - [obj + 24] is > 0 (signed).
; Otherwise branches (without link) to Stack::Expand<char>(unsigned long) with
; x1 = 1; x0 still holds the object, and Expand returns directly to this
; function's caller.
; No multiply instruction is emitted.
; NOTE(review): offsets 24/32 are presumably the stack-top / stack-end
; pointers; confirm against the Stack member layout.
__Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
; x9 = [obj + 24], x8 = [obj + 32]; branch if x8 - x9 <= 0.
ldp x9, x8, [x0, #24]
sub x8, x8, x9
cmp x8, #0
b.le LBB2_2
; %bb.1:
ret
LBB2_2:
mov w1, #1
b __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
; -- End function
.globl __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm ; -- Begin function _ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
.weak_def_can_be_hidden __ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
.p2align 2
; Demangled: void rapidjson::internal::Stack<rapidjson::CrtAllocator>::Expand<char>(unsigned long)
; In:  x0 = the Stack object, x1 = the unsigned long argument.
; Register roles after the prologue:
;   x19 = object, x20 = old buffer pointer ([obj + 16]), x21 = argument then
;   new size in bytes, x22 = bytes in use ([obj + 24] - buffer base).
; Picks a new size, resizes the buffer at [obj + 16] with realloc (or frees it
; when the new size is 0), then rewrites the three pointers at offsets 16, 24
; and 32. No multiply instruction appears anywhere in this function.
; NOTE(review): offsets 0/8/16/24/32/40 presumably map to the allocator,
; owned allocator, buffer, top, end and initial capacity members; confirm
; against the Stack member layout.
__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm: ; @_ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
; %bb.0:
stp x22, x21, [sp, #-48]! ; 16-byte Folded Spill
stp x20, x19, [sp, #16] ; 16-byte Folded Spill
stp x29, x30, [sp, #32] ; 16-byte Folded Spill
add x29, sp, #32
mov x21, x1
mov x19, x0
; x20 = current buffer pointer; a null buffer takes the LBB3_2 path.
ldr x20, [x0, #16]
cbz x20, LBB3_2
; %bb.1:
; Buffer exists: cap = [obj + 32] - buffer; candidate size x8 = cap + (cap + 1) / 2.
ldr x8, [x19, #32]
sub x8, x8, x20
add x9, x8, #1
add x8, x8, x9, lsr #1
mov x9, x20
b LBB3_5
LBB3_2:
; No buffer yet: if the pointer at [obj + 0] is null, call operator new(1) and
; store the result at both [obj + 0] and [obj + 8].
ldr x8, [x19]
cbnz x8, LBB3_4
; %bb.3:
mov w0, #1
bl __Znwm
stp x0, x0, [x19]
LBB3_4:
; Base for the in-use computation is 0; candidate size x8 = [obj + 40].
mov x9, #0
ldr x8, [x19, #40]
LBB3_5:
; x22 = [obj + 24] - base (bytes in use); x9 = in-use + argument.
ldr x10, [x19, #24]
sub x22, x10, x9
add x9, x22, x21
; x21 = the larger (unsigned) of the candidate size and in-use + argument.
cmp x8, x9
mov x0, x20
csel x21, x8, x9, hi
cbz x21, LBB3_7
; %bb.6:
; Non-zero size: realloc(old buffer, new size).
mov x1, x21
bl _realloc
b LBB3_8
LBB3_7:
; Zero size: free(old buffer) and continue with a null buffer.
bl _free
mov x0, #0
LBB3_8:
; [obj + 16] = new buffer, [obj + 24] = new buffer + in-use,
; [obj + 32] = new buffer + new size.
add x8, x0, x22
stp x0, x8, [x19, #16]
add x8, x0, x21
str x8, [x19, #32]
ldp x29, x30, [sp, #32] ; 16-byte Folded Reload
ldp x20, x19, [sp, #16] ; 16-byte Folded Reload
ldp x22, x21, [sp], #48 ; 16-byte Folded Reload
ret
; -- End function
.subsections_via_symbols