zig
zig copied to clipboard
Really bad soft_float memcpy codegen (ReleaseFast)
Zig Version
0.11.0-dev.3395+1e7dcaa3a
Steps to Reproduce and Observed Behavior
I could be doing something wrong, but here are the steps to reproduce:
Your standard build.zig (I only changed target features):
const std = @import("std");
// Although this function looks imperative, note that its job is to
// declaratively construct a build graph that will be executed by an external
// runner.
/// Reproduction build script for the reported codegen issue: it builds
/// src/main.zig for x86_64 with `soft_float` forced ON and every hardware
/// float/SIMD feature (x87, SSE, SSE2, AVX, AVX2) subtracted, which is the
/// configuration that makes compiler-rt's memcpy fall back to the byte-copy
/// loop shown in the disassembly below.
/// NOTE(review): written against 0.11.0-dev APIs — `std.zig.CrossTarget` and
/// `@enumToInt` were later renamed (`std.Target.Query`, `@intFromEnum`).
pub fn build(b: *std.Build) void {
// Standard target options allows the person running `zig build` to choose
// what target to build for. Here we do not override the defaults, which
// means any target is allowed, and the default is native. Other options
// for restricting supported target set are available.
// The target is fixed to x86_64 (instead of b.standardTargetOptions) so the
// CPU feature set below can be edited directly.
var target = std.zig.CrossTarget{
.cpu_arch = .x86_64,
};
// Force software floating point and strip all hardware FP/vector features.
// `cpu_features_add` enables a feature; `cpu_features_sub` disables one.
target.cpu_features_add.addFeature(@enumToInt(std.Target.x86.Feature.soft_float));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.x87));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.sse));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.sse2));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.avx));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.avx2));
// Standard optimization options allow the person running `zig build` to select
// between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
// set a preferred release mode, allowing the user to decide how to optimize.
// (The repro is built with -Doptimize=ReleaseFast; see the steps below.)
const optimize = b.standardOptimizeOption(.{});
const exe = b.addExecutable(.{
.name = "memcpy",
// In this case the main source file is merely a path, however, in more
// complicated build scripts, this could be a generated file.
.root_source_file = .{ .path = "src/main.zig" },
.target = target,
.optimize = optimize,
});
// This declares intent for the executable to be installed into the
// standard location when the user invokes the "install" step (the default
// step when running `zig build`).
b.installArtifact(exe);
// This *creates* a Run step in the build graph, to be executed when another
// step is evaluated that depends on it. The next line below will establish
// such a dependency.
const run_cmd = b.addRunArtifact(exe);
// By making the run step depend on the install step, it will be run from the
// installation directory rather than directly from within the cache directory.
// This is not necessary, however, if the application depends on other installed
// files, this ensures they will be present and in the expected location.
run_cmd.step.dependOn(b.getInstallStep());
// This allows the user to pass arguments to the application in the build
// command itself, like this: `zig build run -- arg1 arg2 etc`
if (b.args) |args| {
run_cmd.addArgs(args);
}
// This creates a build step. It will be visible in the `zig build --help` menu,
// and can be selected like this: `zig build run`
// This will evaluate the `run` step rather than the default, which is "install".
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
// Creates a step for unit testing. This only builds the test executable
// but does not run it.
const unit_tests = b.addTest(.{
.root_source_file = .{ .path = "src/main.zig" },
.target = target,
.optimize = optimize,
});
const run_unit_tests = b.addRunArtifact(unit_tests);
// Similar to creating the run step earlier, this exposes a `test` step to
// the `zig build --help` menu, providing a way for the user to request
// running the unit tests.
const test_step = b.step("test", "Run unit tests");
test_step.dependOn(&run_unit_tests.step);
}
Your src/main.zig
const std = @import("std");
/// Minimal repro entry point: copies the bytes of the third CLI argument
/// (argv[2]) over a same-length prefix of the second (argv[1]).
/// The lengths are runtime-known, so `@memcpy` lowers to a call into the
/// memcpy routine under investigation rather than being elided.
pub fn main() !void {
    const argv = try std.process.argsAlloc(std.heap.page_allocator);
    const source = argv[2];
    const destination = argv[1][0..source.len];
    @memcpy(destination, source);
}
test "simple test" {
    // std.testing.allocator fails the test on leaks, so removing the
    // deinit below is detected automatically.
    var numbers = std.ArrayList(i32).init(std.testing.allocator);
    defer numbers.deinit();
    try numbers.append(42);
    try std.testing.expectEqual(@as(i32, 42), numbers.pop());
}
Then:
zig build -Doptimize=ReleaseFast
objdump -dxS -Mintel zig-out/bin/memcpy
Observing the output, you can see the binary is quite bloated (although the calls that I made might be pulling in a lot of dependencies), but memcpy in particular has really poor codegen.
0000000000213ec0 <memcpy>:
213ec0: 48 89 f8 mov rax,rdi
213ec3: 48 85 d2 test rdx,rdx
213ec6: 0f 84 c5 00 00 00 je 213f91 <memcpy+0xd1>
213ecc: 0f b6 0e movzx ecx,BYTE PTR [rsi]
213ecf: 88 08 mov BYTE PTR [rax],cl
213ed1: 48 89 d1 mov rcx,rdx
213ed4: 48 83 c1 ff add rcx,0xffffffffffffffff
213ed8: 0f 84 b3 00 00 00 je 213f91 <memcpy+0xd1>
213ede: 48 83 c2 fe add rdx,0xfffffffffffffffe
213ee2: 48 89 cf mov rdi,rcx
213ee5: 48 83 e7 07 and rdi,0x7
213ee9: 74 2b je 213f16 <memcpy+0x56>
213eeb: 45 31 c0 xor r8d,r8d
213eee: 66 90 xchg ax,ax
213ef0: 46 0f b6 4c 06 01 movzx r9d,BYTE PTR [rsi+r8*1+0x1]
213ef6: 46 88 4c 00 01 mov BYTE PTR [rax+r8*1+0x1],r9b
213efb: 49 83 c0 01 add r8,0x1
213eff: 4c 39 c7 cmp rdi,r8
213f02: 75 ec jne 213ef0 <memcpy+0x30>
213f04: 4c 29 c1 sub rcx,r8
213f07: 4a 8d 3c 00 lea rdi,[rax+r8*1]
213f0b: 4c 01 c6 add rsi,r8
213f0e: 48 83 fa 07 cmp rdx,0x7
213f12: 73 0b jae 213f1f <memcpy+0x5f>
213f14: eb 7b jmp 213f91 <memcpy+0xd1>
213f16: 48 89 c7 mov rdi,rax
213f19: 48 83 fa 07 cmp rdx,0x7
213f1d: 72 72 jb 213f91 <memcpy+0xd1>
213f1f: 31 d2 xor edx,edx
213f21: 66 2e 0f 1f 84 00 00 cs nop WORD PTR [rax+rax*1+0x0]
213f28: 00 00 00
213f2b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
213f30: 44 0f b6 44 16 01 movzx r8d,BYTE PTR [rsi+rdx*1+0x1]
213f36: 44 88 44 17 01 mov BYTE PTR [rdi+rdx*1+0x1],r8b
213f3b: 44 0f b6 44 16 02 movzx r8d,BYTE PTR [rsi+rdx*1+0x2]
213f41: 44 88 44 17 02 mov BYTE PTR [rdi+rdx*1+0x2],r8b
213f46: 44 0f b6 44 16 03 movzx r8d,BYTE PTR [rsi+rdx*1+0x3]
213f4c: 44 88 44 17 03 mov BYTE PTR [rdi+rdx*1+0x3],r8b
213f51: 44 0f b6 44 16 04 movzx r8d,BYTE PTR [rsi+rdx*1+0x4]
213f57: 44 88 44 17 04 mov BYTE PTR [rdi+rdx*1+0x4],r8b
213f5c: 44 0f b6 44 16 05 movzx r8d,BYTE PTR [rsi+rdx*1+0x5]
213f62: 44 88 44 17 05 mov BYTE PTR [rdi+rdx*1+0x5],r8b
213f67: 44 0f b6 44 16 06 movzx r8d,BYTE PTR [rsi+rdx*1+0x6]
213f6d: 44 88 44 17 06 mov BYTE PTR [rdi+rdx*1+0x6],r8b
213f72: 44 0f b6 44 16 07 movzx r8d,BYTE PTR [rsi+rdx*1+0x7]
213f78: 44 88 44 17 07 mov BYTE PTR [rdi+rdx*1+0x7],r8b
213f7d: 44 0f b6 44 16 08 movzx r8d,BYTE PTR [rsi+rdx*1+0x8]
213f83: 44 88 44 17 08 mov BYTE PTR [rdi+rdx*1+0x8],r8b
213f88: 48 83 c2 08 add rdx,0x8
213f8c: 48 39 d1 cmp rcx,rdx
213f8f: 75 9f jne 213f30 <memcpy+0x70>
213f91: c3 ret
213f92: cc int3
213f93: cc int3
213f94: cc int3
213f95: cc int3
213f96: cc int3
213f97: cc int3
213f98: cc int3
213f99: cc int3
213f9a: cc int3
213f9b: cc int3
213f9c: cc int3
213f9d: cc int3
213f9e: cc int3
213f9f: cc int3
Expected Behavior
Using rep movsb or similar would be great. I wonder whether functions like this, which are required to be fast, could be hand-coded using inline assembly.