zig
zig copied to clipboard
Really bad soft_float memcpy codegen (ReleaseFast)
Zig Version
0.11.0-dev.3395+1e7dcaa3a
Steps to Reproduce and Observed Behavior
I could be doing something wrong, but here are the steps to reproduce:
Your standard build.zig (I only changed target features):
const std = @import("std");
// Although this function looks imperative, note that its job is to
// declaratively construct a build graph that will be executed by an external
// runner.
/// Reproduction build script for the reported codegen issue: it builds
/// src/main.zig for x86_64 with `soft_float` forced ON and every hardware
/// float/SIMD feature (x87, SSE, SSE2, AVX, AVX2) subtracted, which is the
/// configuration that makes compiler-rt's memcpy fall back to the byte-copy
/// loop shown in the disassembly below.
/// NOTE(review): written against 0.11.0-dev APIs — `std.zig.CrossTarget` and
/// `@enumToInt` were later renamed (`std.Target.Query`, `@intFromEnum`).
pub fn build(b: *std.Build) void {
// Standard target options allows the person running `zig build` to choose
// what target to build for. Here we do not override the defaults, which
// means any target is allowed, and the default is native. Other options
// for restricting supported target set are available.
// The target is fixed to x86_64 (instead of b.standardTargetOptions) so the
// CPU feature set below can be edited directly.
var target = std.zig.CrossTarget{
.cpu_arch = .x86_64,
};
// Force software floating point and strip all hardware FP/vector features.
// `cpu_features_add` enables a feature; `cpu_features_sub` disables one.
target.cpu_features_add.addFeature(@enumToInt(std.Target.x86.Feature.soft_float));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.x87));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.sse));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.sse2));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.avx));
target.cpu_features_sub.addFeature(@enumToInt(std.Target.x86.Feature.avx2));
// Standard optimization options allow the person running `zig build` to select
// between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
// set a preferred release mode, allowing the user to decide how to optimize.
// (The repro is built with -Doptimize=ReleaseFast; see the steps below.)
const optimize = b.standardOptimizeOption(.{});
const exe = b.addExecutable(.{
.name = "memcpy",
// In this case the main source file is merely a path, however, in more
// complicated build scripts, this could be a generated file.
.root_source_file = .{ .path = "src/main.zig" },
.target = target,
.optimize = optimize,
});
// This declares intent for the executable to be installed into the
// standard location when the user invokes the "install" step (the default
// step when running `zig build`).
b.installArtifact(exe);
// This *creates* a Run step in the build graph, to be executed when another
// step is evaluated that depends on it. The next line below will establish
// such a dependency.
const run_cmd = b.addRunArtifact(exe);
// By making the run step depend on the install step, it will be run from the
// installation directory rather than directly from within the cache directory.
// This is not necessary, however, if the application depends on other installed
// files, this ensures they will be present and in the expected location.
run_cmd.step.dependOn(b.getInstallStep());
// This allows the user to pass arguments to the application in the build
// command itself, like this: `zig build run -- arg1 arg2 etc`
if (b.args) |args| {
run_cmd.addArgs(args);
}
// This creates a build step. It will be visible in the `zig build --help` menu,
// and can be selected like this: `zig build run`
// This will evaluate the `run` step rather than the default, which is "install".
const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
// Creates a step for unit testing. This only builds the test executable
// but does not run it.
const unit_tests = b.addTest(.{
.root_source_file = .{ .path = "src/main.zig" },
.target = target,
.optimize = optimize,
});
const run_unit_tests = b.addRunArtifact(unit_tests);
// Similar to creating the run step earlier, this exposes a `test` step to
// the `zig build --help` menu, providing a way for the user to request
// running the unit tests.
const test_step = b.step("test", "Run unit tests");
test_step.dependOn(&run_unit_tests.step);
}
Your src/main.zig
const std = @import("std");
/// Minimal repro entry point: copies the bytes of the third CLI argument
/// (argv[2]) over a same-length prefix of the second (argv[1]).
/// The lengths are runtime-known, so `@memcpy` lowers to a call into the
/// memcpy routine under investigation rather than being elided.
pub fn main() !void {
    const argv = try std.process.argsAlloc(std.heap.page_allocator);
    const source = argv[2];
    const destination = argv[1][0..source.len];
    @memcpy(destination, source);
}
test "simple test" {
    // std.testing.allocator fails the test on leaks, so removing the
    // deinit below is detected automatically.
    var numbers = std.ArrayList(i32).init(std.testing.allocator);
    defer numbers.deinit();
    try numbers.append(42);
    try std.testing.expectEqual(@as(i32, 42), numbers.pop());
}
Then:
zig build -Doptimize=ReleaseFast
objdump -dxS -Mintel zig-out/bin/memcpy
Observing the output, you can see the binary is quite bloated (although the calls that I made might be pulling in a lot of dependencies), but memcpy in particular has really poor codegen.
0000000000213ec0 <memcpy>:
213ec0: 48 89 f8 mov rax,rdi
213ec3: 48 85 d2 test rdx,rdx
213ec6: 0f 84 c5 00 00 00 je 213f91 <memcpy+0xd1>
213ecc: 0f b6 0e movzx ecx,BYTE PTR [rsi]
213ecf: 88 08 mov BYTE PTR [rax],cl
213ed1: 48 89 d1 mov rcx,rdx
213ed4: 48 83 c1 ff add rcx,0xffffffffffffffff
213ed8: 0f 84 b3 00 00 00 je 213f91 <memcpy+0xd1>
213ede: 48 83 c2 fe add rdx,0xfffffffffffffffe
213ee2: 48 89 cf mov rdi,rcx
213ee5: 48 83 e7 07 and rdi,0x7
213ee9: 74 2b je 213f16 <memcpy+0x56>
213eeb: 45 31 c0 xor r8d,r8d
213eee: 66 90 xchg ax,ax
213ef0: 46 0f b6 4c 06 01 movzx r9d,BYTE PTR [rsi+r8*1+0x1]
213ef6: 46 88 4c 00 01 mov BYTE PTR [rax+r8*1+0x1],r9b
213efb: 49 83 c0 01 add r8,0x1
213eff: 4c 39 c7 cmp rdi,r8
213f02: 75 ec jne 213ef0 <memcpy+0x30>
213f04: 4c 29 c1 sub rcx,r8
213f07: 4a 8d 3c 00 lea rdi,[rax+r8*1]
213f0b: 4c 01 c6 add rsi,r8
213f0e: 48 83 fa 07 cmp rdx,0x7
213f12: 73 0b jae 213f1f <memcpy+0x5f>
213f14: eb 7b jmp 213f91 <memcpy+0xd1>
213f16: 48 89 c7 mov rdi,rax
213f19: 48 83 fa 07 cmp rdx,0x7
213f1d: 72 72 jb 213f91 <memcpy+0xd1>
213f1f: 31 d2 xor edx,edx
213f21: 66 2e 0f 1f 84 00 00 cs nop WORD PTR [rax+rax*1+0x0]
213f28: 00 00 00
213f2b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
213f30: 44 0f b6 44 16 01 movzx r8d,BYTE PTR [rsi+rdx*1+0x1]
213f36: 44 88 44 17 01 mov BYTE PTR [rdi+rdx*1+0x1],r8b
213f3b: 44 0f b6 44 16 02 movzx r8d,BYTE PTR [rsi+rdx*1+0x2]
213f41: 44 88 44 17 02 mov BYTE PTR [rdi+rdx*1+0x2],r8b
213f46: 44 0f b6 44 16 03 movzx r8d,BYTE PTR [rsi+rdx*1+0x3]
213f4c: 44 88 44 17 03 mov BYTE PTR [rdi+rdx*1+0x3],r8b
213f51: 44 0f b6 44 16 04 movzx r8d,BYTE PTR [rsi+rdx*1+0x4]
213f57: 44 88 44 17 04 mov BYTE PTR [rdi+rdx*1+0x4],r8b
213f5c: 44 0f b6 44 16 05 movzx r8d,BYTE PTR [rsi+rdx*1+0x5]
213f62: 44 88 44 17 05 mov BYTE PTR [rdi+rdx*1+0x5],r8b
213f67: 44 0f b6 44 16 06 movzx r8d,BYTE PTR [rsi+rdx*1+0x6]
213f6d: 44 88 44 17 06 mov BYTE PTR [rdi+rdx*1+0x6],r8b
213f72: 44 0f b6 44 16 07 movzx r8d,BYTE PTR [rsi+rdx*1+0x7]
213f78: 44 88 44 17 07 mov BYTE PTR [rdi+rdx*1+0x7],r8b
213f7d: 44 0f b6 44 16 08 movzx r8d,BYTE PTR [rsi+rdx*1+0x8]
213f83: 44 88 44 17 08 mov BYTE PTR [rdi+rdx*1+0x8],r8b
213f88: 48 83 c2 08 add rdx,0x8
213f8c: 48 39 d1 cmp rcx,rdx
213f8f: 75 9f jne 213f30 <memcpy+0x70>
213f91: c3 ret
213f92: cc int3
213f93: cc int3
213f94: cc int3
213f95: cc int3
213f96: cc int3
213f97: cc int3
213f98: cc int3
213f99: cc int3
213f9a: cc int3
213f9b: cc int3
213f9c: cc int3
213f9d: cc int3
213f9e: cc int3
213f9f: cc int3
Expected Behavior
Using rep movsb or similar would be great. I wonder whether functions like this, which are required to be fast, could be hand-coded using inline assembly.