rapidjson icon indicating copy to clipboard operation
rapidjson copied to clipboard

Add overloaded function for count equals 1 in stack.h

Open violet73 opened this issue 2 years ago • 9 comments

Add an overloaded function in stack.h for the case where count equals 1, to eliminate the redundant multiplication by 1.

violet73 avatar Aug 18 '23 06:08 violet73

CLA assistant check
All committers have signed the CLA.

tencent-adm avatar Aug 18 '23 06:08 tencent-adm

Why?

miloyip avatar Aug 18 '23 06:08 miloyip

Why?

In some practical applications I have tested, the value of count is frequently 1. When count is 1, the multiplication by count is redundant. The overload also avoids loading the constant 1 from memory.

violet73 avatar Aug 18 '23 06:08 violet73

Why?

In some practical applications I have tested, the value of count is frequently 1. When count is 1, the multiplication by count is redundant. The overload also avoids loading the constant 1 from memory.

These functions are all inline so the compiler will remove the multiplication after optimization.

miloyip avatar Aug 18 '23 06:08 miloyip

Something like this https://godbolt.org/z/9YvPq1Y3h

miloyip avatar Aug 18 '23 07:08 miloyip

Judging from the gdb disassembly, the compiler does not appear to have optimized away the multiplication when count equals 1.

ecedc45dfd0d5e512323adf6b9ea617 aa3dd21d91dd4f3787f96f9a9d8917f

My compiler is gcc9.4.0.

violet73 avatar Aug 18 '23 07:08 violet73

Is that with optimization enabled?

miloyip avatar Aug 18 '23 08:08 miloyip

Is that with optimization enabled?

Yes, I use release build with -O3 enabled for testing.

violet73 avatar Aug 18 '23 08:08 violet73

I tried to write a minimal reproduction, shown below, but could not find any such multiplications on macOS (clang).

#include "rapidjson/allocators.h"
#include "rapidjson/internal/stack.h"

// Minimal wrapper around Stack::Push<char>() so the compiler emits a
// standalone function whose generated code can be inspected.
// Calls Push<char>() on `s` with no explicit argument and returns the
// resulting char*.
// NOTE(review): the count argument presumably defaults to 1 (the generated
// listing passes #1 to Expand<char>) -- confirm against stack.h.
char* test_push(rapidjson::internal::Stack<rapidjson::CrtAllocator>& s) {
    return s.Push<char>();
}

// Minimal wrapper around Stack::PushUnsafe<char>() so the compiler emits a
// standalone function whose generated code can be inspected.
// Calls PushUnsafe<char>() on `s` with no explicit argument and returns the
// resulting char*.
// NOTE(review): presumably the variant of Push that skips the capacity
// check -- confirm against stack.h.
char* test_push_unsafe(rapidjson::internal::Stack<rapidjson::CrtAllocator>& s) {
    return s.PushUnsafe<char>();
}

// Minimal wrapper around Stack::Reserve<char>() so the compiler emits a
// standalone function whose generated code can be inspected.
// Calls Reserve<char>() on `s` with no explicit argument; returns nothing.
// NOTE(review): the count argument presumably defaults to 1 -- confirm
// against stack.h.
void test_reserve(rapidjson::internal::Stack<rapidjson::CrtAllocator>& s) {
    s.Reserve<char>();
}
g++ -I include -D NDEBUG -S -o- -O3 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -fverbose-asm test.cpp
	.section	__TEXT,__text,regular,pure_instructions
	.build_version macos, 13, 0	sdk_version 13, 3
	.globl	__Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
	.p2align	2
; Demangled: foo(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = address of the Stack object (first argument register).
; Leaf routine, no stack frame: reads the 8-byte pointer at [x0, #24], stores
; pointer + 1 back to the same field, and writes the byte 97 (ASCII 'a') at the
; old pointer value. The listing contains no multiply and no call.
; NOTE(review): offset 24 is presumably the stack-top pointer (stackTop_) --
; confirm against stack.h. The C++ source of foo is not shown with this listing.
__Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z3fooRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
; x8 = old pointer; write back old pointer + 1.
	ldr	x8, [x0, #24]
	add	x9, x8, #1
	str	x9, [x0, #24]
; Store the byte 97 ('a') at the old pointer.
	mov	w9, #97
	strb	w9, [x8]
	ret
                                        ; -- End function
	.globl	__Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
	.p2align	2
; Demangled: bar(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = address of the Stack object.
; Out: w0 = sign-extended byte read back from [pointer - 1] after the push.
; Fast path (LBB1_1): bump the pointer at [this, #24] by one, store 97 ('a') at
; the old pointer, then reload the byte just before the new pointer.
; Slow path (LBB1_2): taken when ([this, #32] - [this, #24]) <= 0 (signed);
; calls Expand<char>(this, 1) and rejoins the fast path.
; NOTE(review): offsets 24 / 32 are presumably stackTop_ / stackEnd_ -- confirm
; against stack.h. The C++ source of bar is not shown with this listing.
__Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z3barRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
; Save callee-saved x19/x20 and the frame record; x19 keeps `this` across the call.
	stp	x20, x19, [sp, #-32]!           ; 16-byte Folded Spill
	stp	x29, x30, [sp, #16]             ; 16-byte Folded Spill
	add	x29, sp, #16
	mov	x19, x0
; x8 = [this, #24], x9 = [this, #32]; branch to the slow path if x9 - x8 <= 0.
	ldp	x8, x9, [x0, #24]
	sub	x9, x9, x8
	cmp	x9, #0
	b.le	LBB1_2
LBB1_1:
; [this, #24] = x8 + 1; store 'a' at the old pointer x8.
	add	x9, x8, #1
	str	x9, [x19, #24]
	mov	w9, #97
	strb	w9, [x8]
; Reload the pointer and return the signed byte at [pointer - 1].
	ldr	x8, [x19, #24]
	ldursb	w0, [x8, #-1]
	ldp	x29, x30, [sp, #16]             ; 16-byte Folded Reload
	ldp	x20, x19, [sp], #32             ; 16-byte Folded Reload
	ret
LBB1_2:
; Expand<char>(this, 1): the count is passed as the immediate 1.
	mov	x0, x19
	mov	w1, #1
	bl	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
; Reload [this, #24] after the call, then continue on the fast path.
	ldr	x8, [x19, #24]
	b	LBB1_1
                                        ; -- End function
	.globl	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm ; -- Begin function _ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
	.weak_def_can_be_hidden	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
	.p2align	2
; Demangled: void rapidjson::internal::Stack<rapidjson::CrtAllocator>::Expand<char>(unsigned long)
; In:  x0 = this, x1 = count.
; Register roles after the prologue: x19 = this, x20 = old buffer ([this, #16]),
; x21 = count (later the new capacity), x22 = bytes currently in use.
; Computes a new capacity, resizes the buffer with realloc (or free when the new
; capacity is 0), then rewrites the three pointers at [this, #16], [this, #24]
; and [this, #32]. count is added to the used size directly; no multiply
; instruction appears in this listing.
; NOTE(review): offsets 0/8/16/24/32/40 are presumably allocator_/ownAllocator_/
; stack_/stackTop_/stackEnd_/initialCapacity_ -- confirm against stack.h.
__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm: ; @_ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
; %bb.0:
	stp	x22, x21, [sp, #-48]!           ; 16-byte Folded Spill
	stp	x20, x19, [sp, #16]             ; 16-byte Folded Spill
	stp	x29, x30, [sp, #32]             ; 16-byte Folded Spill
	add	x29, sp, #32
	mov	x21, x1
	mov	x19, x0
; x20 = current buffer pointer; a null buffer takes the first-allocation path.
	ldr	x20, [x0, #16]
	cbz	x20, LBB2_2
; %bb.1:
; Buffer exists: cap = [this, #32] - buffer; x8 = cap + ((cap + 1) >> 1); x9 = buffer.
	ldr	x8, [x19, #32]
	sub	x8, x8, x20
	add	x9, x8, #1
	add	x8, x8, x9, lsr #1
	mov	x9, x20
	b	LBB2_5
LBB2_2:
; No buffer yet: if [this, #0] is null, call operator new(1) and store the
; result into both [this, #0] and [this, #8].
	ldr	x8, [x19]
	cbnz	x8, LBB2_4
; %bb.3:
	mov	w0, #1
	bl	__Znwm
	stp	x0, x0, [x19]
LBB2_4:
; x9 = 0 (null buffer base); x8 = value at [this, #40] as the candidate capacity.
	mov	x9, #0
	ldr	x8, [x19, #40]
LBB2_5:
; x22 = [this, #24] - base (bytes in use); needed = x22 + count.
; x21 = unsigned max(candidate capacity x8, needed); x0 = old buffer for realloc/free.
	ldr	x10, [x19, #24]
	sub	x22, x10, x9
	add	x9, x22, x21
	cmp	x8, x9
	mov	x0, x20
	csel	x21, x8, x9, hi
	cbz	x21, LBB2_7
; %bb.6:
; Non-zero capacity: realloc(old buffer, new capacity). The result is used
; below without a null check in this listing.
	mov	x1, x21
	bl	_realloc
	b	LBB2_8
LBB2_7:
; Zero capacity: free(old buffer) and continue with a null buffer pointer.
	bl	_free
	mov	x0, #0
LBB2_8:
; [this, #16] = new buffer; [this, #24] = new buffer + bytes in use;
; [this, #32] = new buffer + new capacity.
	add	x8, x0, x22
	stp	x0, x8, [x19, #16]
	add	x8, x0, x21
	str	x8, [x19, #32]
	ldp	x29, x30, [sp, #32]             ; 16-byte Folded Reload
	ldp	x20, x19, [sp, #16]             ; 16-byte Folded Reload
	ldp	x22, x21, [sp], #48             ; 16-byte Folded Reload
	ret
                                        ; -- End function
.subsections_via_symbols
➜  rapidjson git:(master) ✗ g++ -I include -D NDEBUG -S -o- -O3 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -fverbose-asm test.cpp
	.section	__TEXT,__text,regular,pure_instructions
	.build_version macos, 13, 0	sdk_version 13, 3
	.globl	__Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
	.p2align	2
; Demangled: test_push(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = address of the Stack object.
; Out: x0 = pointer value held at [this, #24] before it was bumped by one.
; Fast path (%bb.1): enough room, so just add 1 to the pointer at [this, #24].
; Slow path (LBB0_2): taken when ([this, #32] - [this, #24]) <= 0 (signed);
; calls Expand<char>(this, 1) first, then performs the same bump.
; No multiply instruction appears; the count shows up only as the immediate 1.
; NOTE(review): offsets 24 / 32 are presumably stackTop_ / stackEnd_ -- confirm
; against stack.h.
__Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z9test_pushRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
; Save callee-saved x19/x20 and the frame record; x19 keeps `this` across the call.
	stp	x20, x19, [sp, #-32]!           ; 16-byte Folded Spill
	stp	x29, x30, [sp, #16]             ; 16-byte Folded Spill
	add	x29, sp, #16
	mov	x19, x0
; x0 = [this, #24], x8 = [this, #32]; branch to the slow path if x8 - x0 <= 0.
	ldp	x0, x8, [x0, #24]
	sub	x8, x8, x0
	cmp	x8, #0
	b.le	LBB0_2
; %bb.1:
; [this, #24] = old pointer + 1; the old pointer stays in x0 as the return value.
	add	x8, x0, #1
	str	x8, [x19, #24]
	ldp	x29, x30, [sp, #16]             ; 16-byte Folded Reload
	ldp	x20, x19, [sp], #32             ; 16-byte Folded Reload
	ret
LBB0_2:
; Expand<char>(this, 1), then reload the pointer and bump it as on the fast path.
	mov	x0, x19
	mov	w1, #1
	bl	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
	ldr	x0, [x19, #24]
	add	x8, x0, #1
	str	x8, [x19, #24]
	ldp	x29, x30, [sp, #16]             ; 16-byte Folded Reload
	ldp	x20, x19, [sp], #32             ; 16-byte Folded Reload
	ret
                                        ; -- End function
	.globl	__Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
	.p2align	2
; Demangled: test_push_unsafe(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = address of the Stack object.
; Out: x0 = pointer value held at [this, #24] before it was bumped by one.
; Leaf routine, no capacity comparison and no call: load the pointer, store
; pointer + 1, return the old pointer. No multiply instruction appears.
; NOTE(review): offset 24 is presumably stackTop_ -- confirm against stack.h.
__Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z16test_push_unsafeRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
	ldr	x8, [x0, #24]
	add	x9, x8, #1
	str	x9, [x0, #24]
; Return the pre-increment pointer.
	mov	x0, x8
	ret
                                        ; -- End function
	.globl	__Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE ; -- Begin function _Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
	.p2align	2
; Demangled: test_reserve(rapidjson::internal::Stack<rapidjson::CrtAllocator>&)
; In:  x0 = address of the Stack object.
; Returns immediately when ([this, #32] - [this, #24]) > 0 (signed); otherwise
; branches (without link) to Expand<char> with x0 still holding `this` and
; x1 = 1, so Expand returns directly to this function's caller.
; No multiply instruction appears; the count shows up only as the immediate 1.
; NOTE(review): offsets 24 / 32 are presumably stackTop_ / stackEnd_ -- confirm
; against stack.h.
__Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE: ; @_Z12test_reserveRN9rapidjson8internal5StackINS_12CrtAllocatorEEE
; %bb.0:
; x9 = [this, #24], x8 = [this, #32]; remaining = x8 - x9.
	ldp	x9, x8, [x0, #24]
	sub	x8, x8, x9
	cmp	x8, #0
	b.le	LBB2_2
; %bb.1:
	ret
LBB2_2:
; Not enough room: jump to Expand<char>(this, 1) instead of call + ret.
	mov	w1, #1
	b	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
                                        ; -- End function
	.globl	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm ; -- Begin function _ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
	.weak_def_can_be_hidden	__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
	.p2align	2
; Demangled: void rapidjson::internal::Stack<rapidjson::CrtAllocator>::Expand<char>(unsigned long)
; In:  x0 = this, x1 = count.
; Register roles after the prologue: x19 = this, x20 = old buffer ([this, #16]),
; x21 = count (later the new capacity), x22 = bytes currently in use.
; Computes a new capacity, resizes the buffer with realloc (or free when the new
; capacity is 0), then rewrites the three pointers at [this, #16], [this, #24]
; and [this, #32]. count is added to the used size directly; no multiply
; instruction appears in this listing.
; NOTE(review): offsets 0/8/16/24/32/40 are presumably allocator_/ownAllocator_/
; stack_/stackTop_/stackEnd_/initialCapacity_ -- confirm against stack.h.
__ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm: ; @_ZN9rapidjson8internal5StackINS_12CrtAllocatorEE6ExpandIcEEvm
; %bb.0:
	stp	x22, x21, [sp, #-48]!           ; 16-byte Folded Spill
	stp	x20, x19, [sp, #16]             ; 16-byte Folded Spill
	stp	x29, x30, [sp, #32]             ; 16-byte Folded Spill
	add	x29, sp, #32
	mov	x21, x1
	mov	x19, x0
; x20 = current buffer pointer; a null buffer takes the first-allocation path.
	ldr	x20, [x0, #16]
	cbz	x20, LBB3_2
; %bb.1:
; Buffer exists: cap = [this, #32] - buffer; x8 = cap + ((cap + 1) >> 1); x9 = buffer.
	ldr	x8, [x19, #32]
	sub	x8, x8, x20
	add	x9, x8, #1
	add	x8, x8, x9, lsr #1
	mov	x9, x20
	b	LBB3_5
LBB3_2:
; No buffer yet: if [this, #0] is null, call operator new(1) and store the
; result into both [this, #0] and [this, #8].
	ldr	x8, [x19]
	cbnz	x8, LBB3_4
; %bb.3:
	mov	w0, #1
	bl	__Znwm
	stp	x0, x0, [x19]
LBB3_4:
; x9 = 0 (null buffer base); x8 = value at [this, #40] as the candidate capacity.
	mov	x9, #0
	ldr	x8, [x19, #40]
LBB3_5:
; x22 = [this, #24] - base (bytes in use); needed = x22 + count.
; x21 = unsigned max(candidate capacity x8, needed); x0 = old buffer for realloc/free.
	ldr	x10, [x19, #24]
	sub	x22, x10, x9
	add	x9, x22, x21
	cmp	x8, x9
	mov	x0, x20
	csel	x21, x8, x9, hi
	cbz	x21, LBB3_7
; %bb.6:
; Non-zero capacity: realloc(old buffer, new capacity). The result is used
; below without a null check in this listing.
	mov	x1, x21
	bl	_realloc
	b	LBB3_8
LBB3_7:
; Zero capacity: free(old buffer) and continue with a null buffer pointer.
	bl	_free
	mov	x0, #0
LBB3_8:
; [this, #16] = new buffer; [this, #24] = new buffer + bytes in use;
; [this, #32] = new buffer + new capacity.
	add	x8, x0, x22
	stp	x0, x8, [x19, #16]
	add	x8, x0, x21
	str	x8, [x19, #32]
	ldp	x29, x30, [sp, #32]             ; 16-byte Folded Reload
	ldp	x20, x19, [sp, #16]             ; 16-byte Folded Reload
	ldp	x22, x21, [sp], #48             ; 16-byte Folded Reload
	ret
                                        ; -- End function
.subsections_via_symbols

miloyip avatar Aug 18 '23 10:08 miloyip