snmalloc
snmalloc copied to clipboard
WIP: Hack to optimise based on known sizes
If the caller knows the sizes, and knows that the allocation is thread-local, then we can make some significant optimisations. This is a brief hack to show where the code would need changing.
On x86-64, the generated code would look something like:
00000000000055a0 <free_local_small>:
55a0: 48 89 fa mov %rdi,%rdx
55a3: 48 89 fe mov %rdi,%rsi
55a6: 48 81 e6 00 00 f0 ff and $0xfffffffffff00000,%rsi
55ad: 89 d0 mov %edx,%eax
55af: c1 e8 0e shr $0xe,%eax
55b2: 83 e0 3f and $0x3f,%eax
55b5: 48 c1 e0 04 shl $0x4,%rax
55b9: 66 83 44 06 60 ff addw $0xffff,0x60(%rsi,%rax,1)
55bf: 74 0e je 55cf <free_local_small+0x2f>
55c1: 48 8b 4c 06 58 mov 0x58(%rsi,%rax,1),%rcx
55c6: 48 89 54 06 58 mov %rdx,0x58(%rsi,%rax,1)
55cb: 48 89 0a mov %rcx,(%rdx)
55ce: c3 retq
;; SLOW PATH
55cf: 48 89 d1 mov %rdx,%rcx
55d2: 48 81 e1 00 c0 ff ff and $0xffffffffffffc000,%rcx
55d9: 48 8b 3d e8 c9 20 00 mov 0x20c9e8(%rip),%rdi # 211fc8 <.got+0x10>
55e0: 64 48 8b 3f mov %fs:(%rdi),%rdi
55e4: 48 81 e1 ff 3f f0 ff and $0xfffffffffff03fff,%rcx
55eb: 0f b6 4c 01 66 movzbl 0x66(%rcx,%rax,1),%ecx
55f0: e9 4b e6 ff ff jmpq 3c40 <_ZN8snmalloc9AllocatorIXadL_ZNS_20needs_initialisationEPvEEXadL_ZNS_21init_thread_allocatorENS_12function_refIFS1_S1_EEEEENS_24MemoryProviderStateMixinINS_8PALLinuxEEENS_15DefaultChunkMapINS_21GlobalPagemapTemplateINS_11FlatPagemapILm20EhEEEEEELb1EE27small_dealloc_offseted_slowEPNS_9SuperslabES1_m>
55f5: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
55fc: 00 00 00
55ff: 90 nop
and
0000000000005650 <malloc_small_64>:
5650: 48 8b 05 71 c9 20 00 mov 0x20c971(%rip),%rax # 211fc8 <.got+0x10>
5657: 64 48 8b 38 mov %fs:(%rax),%rdi
565b: 48 8b 47 18 mov 0x18(%rdi),%rax
565f: 48 85 c0 test %rax,%rax
5662: 74 08 je 566c <malloc_small_64+0x1c>
5664: 48 8b 08 mov (%rax),%rcx
5667: 48 89 4f 18 mov %rcx,0x18(%rdi)
566b: c3 retq
;; SLOW PATH
566c: 48 8b 87 80 0c 00 00 mov 0xc80(%rdi),%rax
5673: 48 39 87 88 0c 00 00 cmp %rax,0xc88(%rdi)
567a: 75 0f jne 568b <malloc_small_64+0x3b>
567c: be 03 00 00 00 mov $0x3,%esi
5681: ba 40 00 00 00 mov $0x40,%edx
5686: e9 25 ea ff ff jmpq 40b0 <_ZN8snmalloc9AllocatorIXadL_ZNS_20needs_initialisationEPvEEXadL_ZNS_21init_thread_allocatorENS_12function_refIFS1_S1_EEEEENS_24MemoryProviderStateMixinINS_8PALLinuxEEENS_15DefaultChunkMapINS_21GlobalPagemapTemplateINS_11FlatPagemapILm20EhEEEEEELb1EE26small_alloc_next_free_listILNS_7ZeroMemE0ELNS_12AllowReserveE1EEES1_mm>
568b: be 03 00 00 00 mov $0x3,%esi
5690: ba 40 00 00 00 mov $0x40,%edx
5695: e9 b6 ea ff ff jmpq 4150 <_ZN8snmalloc9AllocatorIXadL_ZNS_20needs_initialisationEPvEEXadL_ZNS_21init_thread_allocatorENS_12function_refIFS1_S1_EEEEENS_24MemoryProviderStateMixinINS_8PALLinuxEEENS_15DefaultChunkMapINS_21GlobalPagemapTemplateINS_11FlatPagemapILm20EhEEEEEELb1EE19small_alloc_mq_slowILNS_7ZeroMemE0ELNS_12AllowReserveE1EEES1_mm>
569a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
This is not intended for merging, but I have pushed it so that we can adapt the idea later.