Safety guarantees regarding code motion?
Hi team,
I'm working with the rust-lang team on a concern that would overlap with cpu_features, and am specifically here because this project has been around for a long time, is well regarded, and looks to be very mature and well-designed!
My question is fairly simple - have you run into any issues due to a combination of inlined functions and code motion? Are there any steps the project itself takes or any guarantees that make you feel sufficiently confident that compiler optimizations can't cause instructions to be reordered out of the confines of the protective if statement?
For example, if you had the following contrived code sample:
#include "cpuinfo_x86.h"
static const X86Features features = GetX86Info().features;
void Compute(int m, int n) {
int x, y;
if (features.bmi2) {
x = _bzhi_u32(m, n);
y = x + 1;
} else {
y = m + n;
}
printf("%d\n", y);
}
Is there anything to prevent a (smart but not too smart) optimizing compiler from noticing that 1) _bzhi_u32() is a known, inlinable function with zero observable side effects, 2) x may be calculated without affecting the else branch, and 3) features.bmi2 is always tested, and then transforming the code into something like:
void Compute(int m, int n) {
...
bzhi x, m, n;
incr x;
add y, m, n;
test features.bmi2;
cmov y, x;
...
}
(or just keeping the jmp but calculating bzhi beforehand)
The project README doesn't mention that the burden is on the developer to ensure that any code using the detected features inside the if (features.foo) { ... } branch cannot be inlined (to protect against code motion). So I'm wondering whether there is any section of the C or C++ specification, or of the GCC/LLVM internal APIs, that documents that code motion cannot happen across a non-inlined condition, or anything else that would prevent a (compliant) compiler from generating code that could execute an unsupported instruction.
Hi @mqudsi, thanks for the issue! I think we should consider this case and add an appropriate approach to the documentation. I have finished investigating the issue. First of all, the behaviour depends on the compiler and the compiler version, so I don't think it is possible to account for every compiler optimization; one option is to turn optimization off for specific code: https://docs.microsoft.com/en-us/cpp/preprocessor/optimize?view=msvc-170 https://stackoverflow.com/questions/34902857/clang-do-not-optimize-a-specific-function
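As a rough sketch of that per-function approach (ComputeBmi2 is a hypothetical name; the exact pragma/attribute spellings differ per compiler):
#include <immintrin.h>

/* Sketch only: isolate the BMI2 path in a function that is neither inlined
   nor optimized, so the intrinsic cannot be hoisted across the runtime check. */
#if defined(_MSC_VER)
#pragma optimize("", off)                  /* MSVC: disable optimization here */
static __declspec(noinline) int ComputeBmi2(int m, int n) {
  return _bzhi_u32(m, n) + 1;
}
#pragma optimize("", on)
#elif defined(__clang__)
/* clang: keep the call and skip optimization; target("bmi2") lets the
   intrinsic compile even if the TU is not built with -mbmi2. */
__attribute__((noinline, optnone, target("bmi2")))
static int ComputeBmi2(int m, int n) {
  return _bzhi_u32(m, n) + 1;
}
#else
/* GCC: the per-function equivalents are optimize("O0") and target("bmi2"). */
__attribute__((noinline, optimize("O0"), target("bmi2")))
static int ComputeBmi2(int m, int n) {
  return _bzhi_u32(m, n) + 1;
}
#endif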
Theoretical solutions
Feature detection via another process
I found this approach in a mimalloc commit: we can define macros at build time from another process's computations and then select the appropriate code with preprocessor directives: https://github.com/jserv/mimalloc/blob/detect-cache-line-size/CMakeLists.txt#L206-L228
The advantage is that all of the logic is resolved at compile time.
The disadvantage is that you need to write an additional program to produce OUTPUT_VARIABLE (a sketch of such a helper follows the code below).
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i386|i686|x86_64|amd64|AMD64)")
execute_process(COMMAND some_program_to_parse_cpu_features -n "features.bmi2"
OUTPUT_VARIABLE CPU_FEATURES_BMI2
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()
void Compute(int m, int n) {
int x, y;
#if CPU_FEATURES_BMI2
x = _bzhi_u32(m, n);
y = x + 1;
#else
y = m + n;
#endif
printf("%d\n", y);
}
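For completeness, a minimal sketch of that helper (some_program_to_parse_cpu_features is just the placeholder name from the CMake snippet above):
/* Hypothetical helper: prints "1" or "0" for the requested feature so that
   CMake can capture it in OUTPUT_VARIABLE and turn it into a define. */
#include <stdio.h>
#include <string.h>
#include "cpuinfo_x86.h"

int main(int argc, char **argv) {
  const X86Features features = GetX86Info().features;
  /* Only "features.bmi2" is handled here; a real tool would map more names. */
  if (argc > 2 && strcmp(argv[1], "-n") == 0 &&
      strcmp(argv[2], "features.bmi2") == 0) {
    printf("%d\n", features.bmi2);
    return 0;
  }
  return 1;
}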
Allocating functions at runtime
The next solution I found in the IKVM.NET blog post How to Hack Your Own JIT Intrinsic and in https://github.com/damageboy/ReJit
You need to write assembly files with and without BMI2 support from a template:
compute_with_bmi2.asm, compute_default.asm
BITS 64
[MAP all]
trampoline_param_size EQU 4
trampoline_size EQU 5
jit:
; Push the stuff we need to preserve by ABI
push rbx
; Store the return address in RBX
mov rbx,qword [rsp+08]
push rsi
push rdi
mov rdx,rcx
; Block copy all the LEA insn
lea rsi, [rel start]
lea rdi, [rbx-trampoline_size]
mov rcx, end - start
rep movsb
mov rcx,rbx
sub rcx,rdi
mov al,0x90
rep stosb
; Pop in reverse
pop rdi
pop rsi
sub rbx,trampoline_size
mov qword [rsp+08h],rbx
pop rbx
mov rcx,rdx
ret
start:
; Compute logic
end:
HackJit.cs
public static class HackJit {
private static X86Features features = X86Info.GetX86Info().Features;
private static byte[] GetOpcodes(string i)
{
var assembly = Assembly.GetExecutingAssembly();
var jitStream = assembly.GetManifestResourceStream("ProjectName.HackJit" + i + ".o");
var bytes = new byte[jitStream.Length];
jitStream.Read(bytes, 0, bytes.Length);
return bytes;
}
[MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)]
public static void Compute(int x, int y)
{
// Reserve some space to overwrite
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
Compute(x, y);
}
public static void Init()
{
var candidates = typeof(HackJit).GetMethods(BindingFlags.Static | BindingFlags.Public);
foreach (var method in candidates) {
var mh = method.MethodHandle;
RuntimeHelpers.PrepareMethod(mh);
var p = mh.GetFunctionPointer();
var asmFile = features.bmi2 ? "compute_with_bmi2" : "compute_default";
var code = GetOpcodes(asmFile);
if (IntPtr.Size == 8)
Marshal.Copy(code, 0, p, code.Length);
else
throw new NotImplementedException();
}
}
}
You can also do this in C/C++: https://stackoverflow.com/questions/10456245/using-c-with-assembly-to-allocate-and-create-new-functions-at-runtime
int ImAcDeclFunc(int a, int b)
{
return a + b;
}
int main(void)
{
return 0;
}
00000000004004c4 <ImAcDeclFunc>:
ImAcDeclFunc():
4004c4: 55 push %rbp
4004c5: 48 89 e5 mov %rsp,%rbp
4004c8: 89 7d fc mov %edi,-0x4(%rbp)
4004cb: 89 75 f8 mov %esi,-0x8(%rbp)
4004ce: 8b 45 f8 mov -0x8(%rbp),%eax
4004d1: 8b 55 fc mov -0x4(%rbp),%edx
4004d4: 8d 04 02 lea (%rdx,%rax,1),%eax
4004d7: c9 leaveq
4004d8: c3 retq
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <malloc.h>
#include <sys/mman.h>
#include <unistd.h>
int main(void)
{
char func[] = {0x55, 0x48, 0x89, 0xe5, 0x89, 0x7d, 0xfc,
0x89, 0x75, 0xf8, 0x8b, 0x45, 0xf8,
0x8b, 0x55, 0xfc, 0x8d, 0x04, 0x02,
0xc9, 0xc3};
int (* func_copy)(int,int) = mmap(NULL, sizeof(func),
PROT_WRITE | PROT_READ | PROT_EXEC,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
memcpy(func_copy, func, sizeof(func));
printf("1 + 2 = %d\n", func_copy(1,2));
munmap(func_copy, sizeof(func));
return EXIT_SUCCESS;
}
I guess there is some library for extracting opcodes in C/C++. The disadvantage of this approach is that you need to write code for each specific architecture.
GNU lightning
Also, you can generate code at runtime via GNU lightning, see: https://www.gnu.org/software/lightning https://www.gnu.org/software/lightning/manual/html_node/Fibonacci.html#Fibonacci
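For a flavour of what that looks like, this is essentially the small "incr" example from the GNU lightning manual (the cpu_features check would just decide which body you emit):
#include <stdio.h>
#include <lightning.h>

static jit_state_t *_jit;

typedef int (*pifi)(int);  /* pointer to int function of int */

int main(int argc, char *argv[]) {
  jit_node_t *in;
  pifi incr;

  init_jit(argv[0]);
  _jit = jit_new_state();

  jit_prolog();                 /* function prologue            */
  in = jit_arg();               /* fetch the incoming argument  */
  jit_getarg(JIT_R0, in);       /* R0 = arg                     */
  jit_addi(JIT_R0, JIT_R0, 1);  /* R0 = R0 + 1                  */
  jit_retr(JIT_R0);             /* return R0                    */

  incr = jit_emit();            /* generate the machine code    */
  jit_clear_state();

  printf("%d + 1 = %d\n", 5, incr(5));

  jit_destroy_state();
  finish_jit();
  return 0;
}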
AsmJit
https://asmjit.com/ https://github.com/asmjit/asmjit
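A minimal AsmJit sketch (roughly the library's getting-started example; again, the feature check would only select which instructions get emitted):
#include <asmjit/asmjit.h>
#include <stdio.h>

using namespace asmjit;

typedef int (*Func)(void);           // signature of the generated function

int main() {
  JitRuntime rt;                     // runtime that owns the generated code

  CodeHolder code;
  code.init(rt.environment());       // match the host environment

  x86::Assembler a(&code);           // emit raw x86 instructions
  a.mov(x86::eax, 1);                // mov eax, 1
  a.ret();                           // ret

  Func fn;
  Error err = rt.add(&fn, &code);    // relocate and make executable
  if (err) return 1;

  printf("generated function returned %d\n", fn());
  rt.release(fn);                    // free the generated code
  return 0;
}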
cc: @gchatelet, @Mizux
@toor1245 that's some extensive research! Thanks for taking the time to compile it all into one post.
I think it's important to distinguish between workarounds that cause the application to effectively hardcode its support for a CPU instruction and those that allow runtime differentiation (which, as I understand it, is the raison d'être of the cpu_features library in the first place). To that end, I think the first approach is out (and it's already possible if you just use the compiler-provided defines like __BMI2__ or __RDRND__ that are set when you use -march=... or -mbmi2, although those aren't necessary if you are writing your own assembly).
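For concreteness, the compile-time-only variant looks roughly like this (the choice is then baked into the binary at build time rather than detected at runtime):
#include <stdio.h>
#if defined(__BMI2__)        /* predefined by GCC/clang under -mbmi2 / -march=... */
#include <immintrin.h>
#endif

void Compute(int m, int n) {
  int y;
#if defined(__BMI2__)
  y = _bzhi_u32(m, n) + 1;   /* only compiled when the whole TU targets BMI2 */
#else
  y = m + n;
#endif
  printf("%d\n", y);
}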
I might be wrong, but there is probably a generic alternative available here to prevent inlining and code motion, by simply changing the shape of the feature detection (and by making sure the cpu_features library's own code lives in a separate compilation unit and isn't inlined or optimized against with LTO or similar).
E.g., instead of using if (features.bmi2) { .. }, the library could expose (C++ only, though?) an alternative API that would be more resistant to code motion, taking a callback/closure that is conditionally evaluated if the feature is present:
int x, ones;
if (features.run_if(X86Features::popcnt, [&]() { ones = _popcnt32(x); })) {
  // feature was present; the callback computed `ones`
} else {
  ones = /* calculate it manually */;
}
With an interface along those lines, the opportunities for the compiler to reorder code across the conditional are greatly restricted (again, I'm not sure if they're eliminated altogether depending on how smart the compiler is and whether or not the cpu_features library is optimized with the rest of the code as one unit).
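To make that a bit more concrete, here's a very rough sketch (all names invented; noinline stands in for "defined in a separate, non-LTO compilation unit"):
#include <immintrin.h>

/* Sketch only. The feature test and the dispatch live inside a function the
   optimizer cannot see through, so the callback body cannot be hoisted above
   the check. In a real build RunIf would sit in its own translation unit. */
__attribute__((noinline))
static bool RunIf(bool present, void (*body)(void *ctx), void *ctx) {
  if (present) {
    body(ctx);
    return true;
  }
  return false;
}

/* target("popcnt") lets GCC/clang compile the intrinsic without -mpopcnt. */
__attribute__((target("popcnt")))
static void PopcntBody(void *ctx) {
  unsigned *v = static_cast<unsigned *>(ctx);
  *v = static_cast<unsigned>(_popcnt32(static_cast<int>(*v)));
}

int CountOnes(unsigned x, bool have_popcnt) {
  unsigned ones = x;
  if (!RunIf(have_popcnt, PopcntBody, &ones)) {
    ones = 0;
    for (unsigned v = x; v != 0; v >>= 1) ones += v & 1;  /* portable fallback */
  }
  return static_cast<int>(ones);
}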
@mqudsi, I started comparing the generated assembly for your Compute example across the compilers cpu_features supports (latest clang, MSVC, and GCC), but I can't reproduce the code motion you describe.
x86-64 gcc 12.1 / clang 14.0.0, flags: -march=skylake -Ofast
.LC0:
.string "%d\n"
Compute:
lea eax, [rdi+rsi]
test BYTE PTR features[rip], 2
je .L8
bzhi edi, edi, esi
lea eax, [rdi+1]
.L8:
mov esi, eax
mov edi, OFFSET FLAT:.LC0
xor eax, eax
jmp printf
ref: https://gcc.godbolt.org/z/6WrY7GMM6 (gcc) ref: https://gcc.godbolt.org/z/xoP3qnbPh (clang)
x86-64 msvc v19.latest flags: /O2
Compute PROC ; COMDAT
test BYTE PTR features, 2
je SHORT $LN2@Compute
bzhi edx, ecx, edx
inc edx
lea rcx, OFFSET FLAT:??_C@_03PMGGPEJJ@?$CFd?6@
jmp printf
$LN2@Compute:
add edx, ecx
lea rcx, OFFSET FLAT:??_C@_03PMGGPEJJ@?$CFd?6@
jmp printf
Compute ENDP
ref: https://gcc.godbolt.org/z/oEG5xh4cj (msvc)
I have also tested your proposed callback-based example:
#include <stddef.h>
#include <x86intrin.h>      /* __popcntq; built with popcnt enabled, e.g. -mpopcnt */
#include "cpuinfo_x86.h"

void ComputeRunIf(void *params, void *output) {
int *x_ptr = (int *) params;
*((int *) output) = __popcntq(*x_ptr);
}
int __attribute__((noinline)) RunIf(int is_supported, void (*func)(void *, void *), void *params, void *output) {
if (is_supported) {
func(params, output);
return 1;
}
output = NULL;
return 0;
}
static X86Features features;
int main() {
features = GetX86Info().features;
int x = 10;
int ones;
int status = RunIf(features.bmi2, ComputeRunIf, &x, &ones);
if(!status) {
ones = 0; /* calculate it manually */
}
return 0;
}
ComputeRunIf:
movsx rax, DWORD PTR [rdi]
popcnt rax, rax
mov DWORD PTR [rsi], eax
ret
GetCpuidLeaf:
push rbx
mov eax, edi
mov ecx, esi
cpuid
mov rsi, rdx
sal rbx, 32
mov eax, eax
sal rsi, 32
mov edx, ecx
or rax, rbx
or rdx, rsi
pop rbx
ret
GetX86Info:
push rbx
mov eax, 7
xor ecx, ecx
cpuid
mov eax, DWORD PTR kEmptyX86Info[rip]
shr ebx, 7
and ebx, 2
and eax, -3
or eax, ebx
pop rbx
ret
RunIf:
test edi, edi
jne .L17
mov eax, edi
ret
.L17:
sub rsp, 8
mov r8, rsi
mov rdi, rdx
mov rsi, rcx
call r8
mov eax, 1
add rsp, 8
ret
main:
push rbx
mov eax, 7
xor ecx, ecx
sub rsp, 16
cpuid
shr ebx, 8
mov edi, ebx
sal edi, 7
sar dil, 7
lea rcx, [rsp+12]
lea rdx, [rsp+8]
movsx edi, dil
mov esi, OFFSET FLAT:ComputeRunIf
mov DWORD PTR [rsp+8], 10
call RunIf
add rsp, 16
xor eax, eax
pop rbx
ret
kEmptyX86Info:
.zero 4
ref: https://gcc.godbolt.org/z/YWTsWME6a (gcc) ref: https://gcc.godbolt.org/z/W3PrMYn1c (msvc)
So we could probably introduce something like that, but we need to make sure everything works in real-world cases. Could you please provide the compiler version and flags you used when testing the Compute function, or an example of code that produces the code motion, for further research?