snmalloc icon indicating copy to clipboard operation
snmalloc copied to clipboard

WIP: Hack to optimise based on known sizes

Open mjp41 opened this issue 4 years ago • 1 comment

If the caller knows the allocation sizes, and knows that the allocation is thread-local, then we can make some significant optimisations. This is a quick hack to show what would need to change.

mjp41 avatar Feb 05 '21 12:02 mjp41

On x86-64, the code would look something like this:

00000000000055a0 <free_local_small>:
    55a0:	48 89 fa             	mov    %rdi,%rdx
    55a3:	48 89 fe             	mov    %rdi,%rsi
    55a6:	48 81 e6 00 00 f0 ff 	and    $0xfffffffffff00000,%rsi
    55ad:	89 d0                	mov    %edx,%eax
    55af:	c1 e8 0e             	shr    $0xe,%eax
    55b2:	83 e0 3f             	and    $0x3f,%eax
    55b5:	48 c1 e0 04          	shl    $0x4,%rax
    55b9:	66 83 44 06 60 ff    	addw   $0xffff,0x60(%rsi,%rax,1)
    55bf:	74 0e                	je     55cf <free_local_small+0x2f>
    55c1:	48 8b 4c 06 58       	mov    0x58(%rsi,%rax,1),%rcx
    55c6:	48 89 54 06 58       	mov    %rdx,0x58(%rsi,%rax,1)
    55cb:	48 89 0a             	mov    %rcx,(%rdx)
    55ce:	c3                   	retq   
;; SLOW PATH
    55cf:	48 89 d1             	mov    %rdx,%rcx
    55d2:	48 81 e1 00 c0 ff ff 	and    $0xffffffffffffc000,%rcx
    55d9:	48 8b 3d e8 c9 20 00 	mov    0x20c9e8(%rip),%rdi        # 211fc8 <.got+0x10>
    55e0:	64 48 8b 3f          	mov    %fs:(%rdi),%rdi
    55e4:	48 81 e1 ff 3f f0 ff 	and    $0xfffffffffff03fff,%rcx
    55eb:	0f b6 4c 01 66       	movzbl 0x66(%rcx,%rax,1),%ecx
    55f0:	e9 4b e6 ff ff       	jmpq   3c40 <_ZN8snmalloc9AllocatorIXadL_ZNS_20needs_initialisationEPvEEXadL_ZNS_21init_thread_allocatorENS_12function_refIFS1_S1_EEEEENS_24MemoryProviderStateMixinINS_8PALLinuxEEENS_15DefaultChunkMapINS_21GlobalPagemapTemplateINS_11FlatPagemapILm20EhEEEEEELb1EE27small_dealloc_offseted_slowEPNS_9SuperslabES1_m>
    55f5:	66 2e 0f 1f 84 00 00 	nopw   %cs:0x0(%rax,%rax,1)
    55fc:	00 00 00 
    55ff:	90                   	nop

and

0000000000005650 <malloc_small_64>:
    5650:	48 8b 05 71 c9 20 00 	mov    0x20c971(%rip),%rax        # 211fc8 <.got+0x10>
    5657:	64 48 8b 38          	mov    %fs:(%rax),%rdi
    565b:	48 8b 47 18          	mov    0x18(%rdi),%rax
    565f:	48 85 c0             	test   %rax,%rax
    5662:	74 08                	je     566c <malloc_small_64+0x1c>
    5664:	48 8b 08             	mov    (%rax),%rcx
    5667:	48 89 4f 18          	mov    %rcx,0x18(%rdi)
    566b:	c3                   	retq   
;; SLOW PATH
    566c:	48 8b 87 80 0c 00 00 	mov    0xc80(%rdi),%rax
    5673:	48 39 87 88 0c 00 00 	cmp    %rax,0xc88(%rdi)
    567a:	75 0f                	jne    568b <malloc_small_64+0x3b>
    567c:	be 03 00 00 00       	mov    $0x3,%esi
    5681:	ba 40 00 00 00       	mov    $0x40,%edx
    5686:	e9 25 ea ff ff       	jmpq   40b0 <_ZN8snmalloc9AllocatorIXadL_ZNS_20needs_initialisationEPvEEXadL_ZNS_21init_thread_allocatorENS_12function_refIFS1_S1_EEEEENS_24MemoryProviderStateMixinINS_8PALLinuxEEENS_15DefaultChunkMapINS_21GlobalPagemapTemplateINS_11FlatPagemapILm20EhEEEEEELb1EE26small_alloc_next_free_listILNS_7ZeroMemE0ELNS_12AllowReserveE1EEES1_mm>
    568b:	be 03 00 00 00       	mov    $0x3,%esi
    5690:	ba 40 00 00 00       	mov    $0x40,%edx
    5695:	e9 b6 ea ff ff       	jmpq   4150 <_ZN8snmalloc9AllocatorIXadL_ZNS_20needs_initialisationEPvEEXadL_ZNS_21init_thread_allocatorENS_12function_refIFS1_S1_EEEEENS_24MemoryProviderStateMixinINS_8PALLinuxEEENS_15DefaultChunkMapINS_21GlobalPagemapTemplateINS_11FlatPagemapILm20EhEEEEEELb1EE19small_alloc_mq_slowILNS_7ZeroMemE0ELNS_12AllowReserveE1EEES1_mm>
    569a:	66 0f 1f 44 00 00    	nopw   0x0(%rax,%rax,1)

This is not intended for merging, but I have pushed it so we can adapt the idea later.

mjp41 avatar Feb 05 '21 12:02 mjp41