Safety guarantees regarding code motion?
Hi team,
I'm working with the rust-lang team on a concern that would overlap with cpu_features, and am specifically here because this project has been around for a long time, is well regarded, and looks to be very mature and well-designed!
My question is fairly simple - have you run into any issues due to a combination of inlined functions and code motion? Are there any steps the project itself takes or any guarantees that make you feel sufficiently confident that compiler optimizations can't cause instructions to be reordered out of the confines of the protective if statement?
For example, if you had the following contrived code sample:
#include "cpuinfo_x86.h"
static const X86Features features = GetX86Info().features;
void Compute(int m, int n) {
int x, y;
if (features.bmi2) {
x = _bzhi_u32(m, n);
y = x + 1;
} else {
y = m + n;
}
printf("%d\n", y);
}
Is there anything to prevent a (smart but not too smart) optimizing compiler from noticing that 1) _bzhi_u32() is a known, inlinable function with zero observable side effects, 2) x may be calculated without affecting the else branch, and 3) features.bmi2 is always tested, and then transforming the code into something like:
void Compute(int m, int n) {
...
bzhi x, m, n;
incr x;
add y, m, n;
test features.bmi2;
cmov y, x;
...
}
(or just keeping the jmp but calculating bzhi beforehand)
The project README doesn't mention that the burden is on the developer to ensure that any code using the detected features inside the if (features.foo) { ... } branch cannot be inlined (to protect against code motion). So I'm wondering whether there is any section of the C or C++ specification, or of the GCC/LLVM internal APIs, that documents that code motion cannot happen across a non-inlined condition, or anything else that would prevent a (compliant) compiler from generating code that could execute an unsupported instruction.
Hi @mqudsi, thanks for the issue! I think we should consider this case and add an appropriate approach to the documentation. I have finished investigating the issue. First of all, the behaviour depends on the compiler and the compiler version, so I don't think it is possible to account for every compiler optimization; one option is to turn optimization off for specific code: https://docs.microsoft.com/en-us/cpp/preprocessor/optimize?view=msvc-170 https://stackoverflow.com/questions/34902857/clang-do-not-optimize-a-specific-function
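As a rough sketch of that per-function approach (ComputeBmi2 is a hypothetical name; the exact pragma/attribute spellings differ per compiler):
#include <immintrin.h>

/* Sketch only: isolate the BMI2 path in a function that is neither inlined
   nor optimized, so the intrinsic cannot be hoisted across the runtime check. */
#if defined(_MSC_VER)
#pragma optimize("", off)                  /* MSVC: disable optimization here */
static __declspec(noinline) int ComputeBmi2(int m, int n) {
  return _bzhi_u32(m, n) + 1;
}
#pragma optimize("", on)
#elif defined(__clang__)
/* clang: keep the call and skip optimization; target("bmi2") lets the
   intrinsic compile even if the TU is not built with -mbmi2. */
__attribute__((noinline, optnone, target("bmi2")))
static int ComputeBmi2(int m, int n) {
  return _bzhi_u32(m, n) + 1;
}
#else
/* GCC: the per-function equivalents are optimize("O0") and target("bmi2"). */
__attribute__((noinline, optimize("O0"), target("bmi2")))
static int ComputeBmi2(int m, int n) {
  return _bzhi_u32(m, n) + 1;
}
#endif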
Theoretical solutions
Feature detection via another process
I found this approach in a mimalloc commit: we can define macros at build time from another process's computations and then select the appropriate code with preprocessor directives: https://github.com/jserv/mimalloc/blob/detect-cache-line-size/CMakeLists.txt#L206-L228
The advantage is that all of the logic is resolved at compile time.
The disadvantage is that you need to write an additional program to produce OUTPUT_VARIABLE (a sketch of such a helper follows the code below).
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i386|i686|x86_64|amd64|AMD64)")
execute_process(COMMAND some_program_to_parse_cpu_features -n "features.bmi2"
OUTPUT_VARIABLE CPU_FEATURES_BMI2
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
void Compute(int m, int n) {
int x, y;
#if CPU_FEATURES_BMI2
x = _bzhi_u32(m, n);
y = x + 1;
#else
y = m + n;
#endif
printf("%d\n", y);
}
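For completeness, a minimal sketch of that helper (some_program_to_parse_cpu_features is just the placeholder name from the CMake snippet above):
/* Hypothetical helper: prints "1" or "0" for the requested feature so that
   CMake can capture it in OUTPUT_VARIABLE and turn it into a define. */
#include <stdio.h>
#include <string.h>
#include "cpuinfo_x86.h"

int main(int argc, char **argv) {
  const X86Features features = GetX86Info().features;
  /* Only "features.bmi2" is handled here; a real tool would map more names. */
  if (argc > 2 && strcmp(argv[1], "-n") == 0 &&
      strcmp(argv[2], "features.bmi2") == 0) {
    printf("%d\n", features.bmi2);
    return 0;
  }
  return 1;
}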
Allocating functions at runtime
The next solution I found in the IKVM.NET blog post How to Hack Your Own JIT Intrinsic and in https://github.com/damageboy/ReJit
You need to write assembly files with and without BMI2 support from a template:
compute_with_bmi2.asm, compute_default.asm
BITS 64
[MAP all]
trampoline_param_size EQU 4
trampoline_size EQU 5
jit:
; Push the stuff we need to preserve by ABI
push rbx
; Store the return address in RBX
mov rbx,qword [rsp+08]
push rsi
push rdi
mov rdx,rcx
; Block copy all the LEA insn
lea rsi, [rel start]
lea rdi, [rbx-trampoline_size]
mov rcx, end - start
rep movsb
mov rcx,rbx
sub rcx,rdi
mov al,0x90
rep stosb
; Pop in reverse
pop rdi
pop rsi
sub rbx,trampoline_size
mov qword [rsp+08h],rbx
pop rbx
mov rcx,rdx
ret
start:
; Compute logic
end:
HackJit.cs
public static class HackJit {
private static X86Features features = X86Info.GetX86Info().Features;
private static byte[] GetOpcodes(string i)
{
var assembly = Assembly.GetExecutingAssembly();
var jitStream = assembly.GetManifestResourceStream("ProjectName.HackJit" + i + ".o");
var bytes = new byte[jitStream.Length];
jitStream.Read(bytes, 0, bytes.Length);
return bytes;
}
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)]
public static void Compute(int x, int y)
{
// Reserve some space to overwrite
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
}
public static void Init()
{
var candidates = typeof(HackJit).GetMethods(BindingFlags.Static | BindingFlags.Public);
foreach (var method in candidates) {
var mh = method.MethodHandle;
RuntimeHelpers.PrepareMethod(mh);
var p = mh.GetFunctionPointer();
var asmFile = features.bmi2 ? "compute_with_bmi2" : "compute_default";
var code = GetOpcodes(asmFile);
if (IntPtr.Size == 8)
Marshal.Copy(code, 0, p, code.Length);
else
throw new NotImplementedException();
}
}
}
You can also do this in C/C++: https://stackoverflow.com/questions/10456245/using-c-with-assembly-to-allocate-and-create-new-functions-at-runtime
int ImAcDeclFunc(int a, int b)
{
return a + b;
}
int main(void)
{
return 0;
}
00000000004004c4 <ImAcDeclFunc>:
ImAcDeclFunc():
4004c4: 55 push %rbp
4004c5: 48 89 e5 mov %rsp,%rbp
4004c8: 89 7d fc mov %edi,-0x4(%rbp)
4004cb: 89 75 f8 mov %esi,-0x8(%rbp)
4004ce: 8b 45 f8 mov -0x8(%rbp),%eax
4004d1: 8b 55 fc mov -0x4(%rbp),%edx
4004d4: 8d 04 02 lea (%rdx,%rax,1),%eax
4004d7: c9 leaveq
4004d8: c3 retq
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <sys/mman.h>
#include <unistd.h>
int main(void)
{
char func[] = {0x55, 0x48, 0x89, 0xe5, 0x89, 0x7d, 0xfc,
0x89, 0x75, 0xf8, 0x8b, 0x45, 0xf8,
0x8b, 0x55, 0xfc, 0x8d, 0x04, 0x02,
0xc9, 0xc3};
int (* func_copy)(int,int) = mmap(NULL, sizeof(func),
PROT_WRITE | PROT_READ | PROT_EXEC,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
memcpy(func_copy, func, sizeof(func));
printf("1 + 2 = %d\n", func_copy(1,2));
munmap(func_copy, sizeof(func));
return EXIT_SUCCESS;
}
I guess there is some library for extracting opcodes in C/C++. The disadvantage of this approach is that you need to write code for each specific architecture.
GNU lightning
Also, you can generate code at runtime via GNU lightning, see: https://www.gnu.org/software/lightning https://www.gnu.org/software/lightning/manual/html_node/Fibonacci.html#Fibonacci
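For a flavour of what that looks like, this is essentially the small "incr" example from the GNU lightning manual (the cpu_features check would just decide which body you emit):
#include <stdio.h>
#include <lightning.h>

static jit_state_t *_jit;

typedef int (*pifi)(int);  /* pointer to int function of int */

int main(int argc, char *argv[]) {
  jit_node_t *in;
  pifi incr;

  init_jit(argv[0]);
  _jit = jit_new_state();

  jit_prolog();                 /* function prologue            */
  in = jit_arg();               /* fetch the incoming argument  */
  jit_getarg(JIT_R0, in);       /* R0 = arg                     */
  jit_addi(JIT_R0, JIT_R0, 1);  /* R0 = R0 + 1                  */
  jit_retr(JIT_R0);             /* return R0                    */

  incr = jit_emit();            /* generate the machine code    */
  jit_clear_state();

  printf("%d + 1 = %d\n", 5, incr(5));

  jit_destroy_state();
  finish_jit();
  return 0;
}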
AsmJit
https://asmjit.com/ https://github.com/asmjit/asmjit
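A minimal AsmJit sketch (roughly the library's getting-started example; again, the feature check would only select which instructions get emitted):
#include <asmjit/asmjit.h>
#include <stdio.h>

using namespace asmjit;

typedef int (*Func)(void);           // signature of the generated function

int main() {
  JitRuntime rt;                     // runtime that owns the generated code

  CodeHolder code;
  code.init(rt.environment());       // match the host environment

  x86::Assembler a(&code);           // emit raw x86 instructions
  a.mov(x86::eax, 1);                // mov eax, 1
  a.ret();                           // ret

  Func fn;
  Error err = rt.add(&fn, &code);    // relocate and make executable
  if (err) return 1;

  printf("generated function returned %d\n", fn());
  rt.release(fn);                    // free the generated code
  return 0;
}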
cc: @gchatelet, @Mizux
@toor1245 that's some extensive research! Thanks for taking the time to compile it all into one post.
I think it's important to distinguish between workarounds that cause the application to effectively hardcode its support for a CPU instruction and those that allow runtime differentiation (which, as I understand it, is the raison d'être of the cpu_features library in the first place). To that end, I think the first approach is out (and it's already possible if you just use the compiler-provided defines like __BMI2__ or __RDRND__ that are set when you use -march=... or -mbmi2, although those aren't necessary if you are writing your own assembly).
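For concreteness, the compile-time-only variant looks roughly like this (the choice is then baked into the binary at build time rather than detected at runtime):
#include <stdio.h>
#if defined(__BMI2__)        /* predefined by GCC/clang under -mbmi2 / -march=... */
#include <immintrin.h>
#endif

void Compute(int m, int n) {
  int y;
#if defined(__BMI2__)
  y = _bzhi_u32(m, n) + 1;   /* only compiled when the whole TU targets BMI2 */
#else
  y = m + n;
#endif
  printf("%d\n", y);
}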
I might be wrong, but there is probably a generic alternative available here to prevent inlining and code motion, by simply changing the shape of the feature detection (and by making sure the cpu_features library's own code lives in a separate compilation unit and isn't inlined or optimized against with LTO or similar).
E.g., instead of using if (features.bmi2) { .. }, the library could expose (C++ only, though?) an alternative API that would be more resistant to code motion, taking a callback/closure that is conditionally evaluated if the feature is present:
int x, ones;
if (features.run_if(X86Features::popcnt, [&]() { ones = _popcnt32(x); })) {
  // feature was present; the callback computed `ones`
} else {
  ones = /* calculate it manually */;
}
With an interface along those lines, the opportunities for the compiler to reorder code across the conditional are greatly restricted (again, I'm not sure if they're eliminated altogether depending on how smart the compiler is and whether or not the cpu_features library is optimized with the rest of the code as one unit).
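To make that a bit more concrete, here's a very rough sketch (all names invented; noinline stands in for "defined in a separate, non-LTO compilation unit"):
#include <immintrin.h>

/* Sketch only. The feature test and the dispatch live inside a function the
   optimizer cannot see through, so the callback body cannot be hoisted above
   the check. In a real build RunIf would sit in its own translation unit. */
__attribute__((noinline))
static bool RunIf(bool present, void (*body)(void *ctx), void *ctx) {
  if (present) {
    body(ctx);
    return true;
  }
  return false;
}

/* target("popcnt") lets GCC/clang compile the intrinsic without -mpopcnt. */
__attribute__((target("popcnt")))
static void PopcntBody(void *ctx) {
  unsigned *v = static_cast<unsigned *>(ctx);
  *v = static_cast<unsigned>(_popcnt32(static_cast<int>(*v)));
}

int CountOnes(unsigned x, bool have_popcnt) {
  unsigned ones = x;
  if (!RunIf(have_popcnt, PopcntBody, &ones)) {
    ones = 0;
    for (unsigned v = x; v != 0; v >>= 1) ones += v & 1;  /* portable fallback */
  }
  return static_cast<int>(ones);
}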
@mqudsi, I started comparing the generated assembly for your Compute example across the compilers cpu_features supports (latest clang, MSVC, and GCC), but I can't reproduce the code motion you describe.
x86-64 gcc 12.1 / clang 14.0.0, flags: -march=skylake -Ofast
.LC0:
.string "%d\n"
Compute:
lea eax, [rdi+rsi]
test BYTE PTR features[rip], 2
je .L8
bzhi edi, edi, esi
lea eax, [rdi+1]
.L8:
mov esi, eax
mov edi, OFFSET FLAT:.LC0
xor eax, eax
jmp printf
ref: https://gcc.godbolt.org/z/6WrY7GMM6 (gcc) ref: https://gcc.godbolt.org/z/xoP3qnbPh (clang)
x86-64 msvc v19.latest flags: /O2
Compute PROC ; COMDAT
test BYTE PTR features, 2
je SHORT $LN2@Compute
bzhi edx, ecx, edx
inc edx
lea rcx, OFFSET FLAT:??_C@_03PMGGPEJJ@?$CFd?6@
jmp printf
$LN2@Compute:
add edx, ecx
lea rcx, OFFSET FLAT:??_C@_03PMGGPEJJ@?$CFd?6@
jmp printf
Compute ENDP
ref: https://gcc.godbolt.org/z/oEG5xh4cj (msvc)
I have also tested your proposed callback-based example:
#include <stddef.h>
#include <x86intrin.h>      /* __popcntq; built with popcnt enabled, e.g. -mpopcnt */
#include "cpuinfo_x86.h"

void ComputeRunIf(void *params, void *output) {
int *x_ptr = (int *) params;
*((int *) output) = __popcntq(*x_ptr);
}
int __attribute__((noinline)) RunIf(int is_supported, void (*func)(void *, void *), void *params, void *output) {
if (is_supported) {
func(params, output);
return 1;
}
output = NULL;
return 0;
}
static X86Features features;
int main() {
features = GetX86Info().features;
int x = 10;
int ones;
int status = RunIf(features.bmi2, ComputeRunIf, &x, &ones);
if(!status) {
ones = 0; /* calculate it manually */
}
return 0;
}
ComputeRunIf:
movsx rax, DWORD PTR [rdi]
popcnt rax, rax
mov DWORD PTR [rsi], eax
ret
GetCpuidLeaf:
push rbx
mov eax, edi
mov ecx, esi
cpuid
mov rsi, rdx
sal rbx, 32
mov eax, eax
sal rsi, 32
mov edx, ecx
or rax, rbx
or rdx, rsi
pop rbx
ret
GetX86Info:
push rbx
mov eax, 7
xor ecx, ecx
cpuid
mov eax, DWORD PTR kEmptyX86Info[rip]
shr ebx, 7
and ebx, 2
and eax, -3
or eax, ebx
pop rbx
ret
RunIf:
test edi, edi
jne .L17
mov eax, edi
ret
.L17:
sub rsp, 8
mov r8, rsi
mov rdi, rdx
mov rsi, rcx
call r8
mov eax, 1
add rsp, 8
ret
main:
push rbx
mov eax, 7
xor ecx, ecx
sub rsp, 16
cpuid
shr ebx, 8
mov edi, ebx
sal edi, 7
sar dil, 7
lea rcx, [rsp+12]
lea rdx, [rsp+8]
movsx edi, dil
mov esi, OFFSET FLAT:ComputeRunIf
mov DWORD PTR [rsp+8], 10
call RunIf
add rsp, 16
xor eax, eax
pop rbx
ret
kEmptyX86Info:
.zero 4
ref: https://gcc.godbolt.org/z/YWTsWME6a (gcc) ref: https://gcc.godbolt.org/z/W3PrMYn1c (msvc)
So we could probably introduce something like that, but we need to make sure everything works in real-world cases. Could you please provide the compiler version and flags you used when testing the Compute function, or an example of code that produces the code motion, for further research?