aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorAndrew Kelley <andrew@ziglang.org>2025-01-21 13:05:14 -0500
committerGitHub <noreply@github.com>2025-01-21 13:05:14 -0500
commit18fcb3b5e82d84e50b951373203f48325b2feada (patch)
treecc9aa163cfdbeb3b03d7d4266b492e7880e5b1fc /lib
parentd652dd065858c754f6d744beec3e7e5bc4ec058b (diff)
parentb7a887f0fb7d166ad93eae62ecd93d1ed173297e (diff)
downloadzig-18fcb3b5e82d84e50b951373203f48325b2feada.tar.gz
zig-18fcb3b5e82d84e50b951373203f48325b2feada.zip
Merge pull request #18912 from dweiller/memcpy-opt
optimized memcpy
Diffstat (limited to 'lib')
-rw-r--r--lib/compiler_rt.zig1
-rw-r--r--lib/compiler_rt/memcpy.zig288
-rw-r--r--lib/compiler_rt/memmove.zig170
3 files changed, 347 insertions, 112 deletions
diff --git a/lib/compiler_rt.zig b/lib/compiler_rt.zig
index 1369e0a7e5..82aeb7f88e 100644
--- a/lib/compiler_rt.zig
+++ b/lib/compiler_rt.zig
@@ -233,6 +233,7 @@ comptime {
_ = @import("compiler_rt/memcpy.zig");
_ = @import("compiler_rt/memset.zig");
+ _ = @import("compiler_rt/memmove.zig");
_ = @import("compiler_rt/memcmp.zig");
_ = @import("compiler_rt/bcmp.zig");
_ = @import("compiler_rt/ssp.zig");
diff --git a/lib/compiler_rt/memcpy.zig b/lib/compiler_rt/memcpy.zig
index 614bcc7182..d67a6e766a 100644
--- a/lib/compiler_rt/memcpy.zig
+++ b/lib/compiler_rt/memcpy.zig
@@ -1,149 +1,213 @@
const std = @import("std");
+const assert = std.debug.assert;
const common = @import("./common.zig");
const builtin = @import("builtin");
comptime {
if (builtin.object_format != .c) {
- @export(&memcpy, .{ .name = "memcpy", .linkage = common.linkage, .visibility = common.visibility });
- @export(&memmove, .{ .name = "memmove", .linkage = common.linkage, .visibility = common.visibility });
+ const export_options: std.builtin.ExportOptions = .{
+ .name = "memcpy",
+ .linkage = common.linkage,
+ .visibility = common.visibility,
+ };
+
+ if (builtin.mode == .ReleaseSmall)
+ @export(&memcpySmall, export_options)
+ else
+ @export(&memcpyFast, export_options);
}
}
-fn memcpy(noalias opt_dest: ?[*]u8, noalias opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
- return memmove(opt_dest, opt_src, len);
+const Element = if (std.simd.suggestVectorLength(u8)) |vec_size|
+ @Type(.{ .vector = .{
+ .child = u8,
+ .len = vec_size,
+ } })
+else
+ usize;
+
+comptime {
+ assert(@sizeOf(Element) >= @alignOf(Element));
+ assert(std.math.isPowerOfTwo(@sizeOf(Element)));
}
-fn memmove(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
- // a port of https://github.com/facebook/folly/blob/1c8bc50e88804e2a7361a57cd9b551dd10f6c5fd/folly/memcpy.S
- if (len == 0) {
- @branchHint(.unlikely);
- return opt_dest;
- }
+fn memcpySmall(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
+ @setRuntimeSafety(builtin.is_test);
- const dest = opt_dest.?;
- const src = opt_src.?;
-
- if (len < 8) {
- @branchHint(.unlikely);
- if (len == 1) {
- @branchHint(.unlikely);
- dest[0] = src[0];
- } else if (len >= 4) {
- @branchHint(.unlikely);
- blockCopy(dest, src, 4, len);
- } else {
- blockCopy(dest, src, 2, len);
- }
- return dest;
+ for (0..len) |i| {
+ dest.?[i] = src.?[i];
}
- if (len > 32) {
- @branchHint(.unlikely);
- if (len > 256) {
- @branchHint(.unlikely);
- copyMove(dest, src, len);
- return dest;
- }
- copyLong(dest, src, len);
- return dest;
- }
+ return dest;
+}
- if (len > 16) {
- @branchHint(.unlikely);
- blockCopy(dest, src, 16, len);
- return dest;
- }
+fn memcpyFast(noalias dest: ?[*]u8, noalias src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
+ @setRuntimeSafety(builtin.is_test);
+
+ const small_limit = 2 * @sizeOf(Element);
- blockCopy(dest, src, 8, len);
+ if (copySmallLength(small_limit, dest.?, src.?, len)) return dest;
+
+ copyForwards(dest.?, src.?, len);
return dest;
}
-inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void {
- const first = @as(*align(1) const @Vector(block_size, u8), src[0..block_size]).*;
- const second = @as(*align(1) const @Vector(block_size, u8), src[len - block_size ..][0..block_size]).*;
- dest[0..block_size].* = first;
- dest[len - block_size ..][0..block_size].* = second;
-}
+inline fn copySmallLength(
+ comptime small_limit: comptime_int,
+ dest: [*]u8,
+ src: [*]const u8,
+ len: usize,
+) bool {
+ if (len < 16) {
+ copyLessThan16(dest, src, len);
+ return true;
+ }
-inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void {
- var array: [8]@Vector(32, u8) = undefined;
+ if (comptime 2 < (std.math.log2(small_limit) + 1) / 2) {
+ if (copy16ToSmallLimit(small_limit, dest, src, len)) return true;
+ }
- inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| {
- array[i * 2] = src[(N / 2) - 32 ..][0..32].*;
- array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*;
+ return false;
+}
- if (len <= N) {
- @branchHint(.unlikely);
- for (0..i + 1) |j| {
- dest[j * 32 ..][0..32].* = array[j * 2];
- dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1];
- }
- return;
- }
+inline fn copyLessThan16(
+ dest: [*]u8,
+ src: [*]const u8,
+ len: usize,
+) void {
+ @setRuntimeSafety(builtin.is_test);
+ if (len < 4) {
+ if (len == 0) return;
+ dest[0] = src[0];
+ dest[len / 2] = src[len / 2];
+ dest[len - 1] = src[len - 1];
+ return;
}
+ copyRange4(4, dest, src, len);
}
-inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void {
- if (@intFromPtr(src) >= @intFromPtr(dest)) {
- @branchHint(.unlikely);
- copyForward(dest, src, len);
- } else if (@intFromPtr(src) + len > @intFromPtr(dest)) {
- @branchHint(.unlikely);
- overlapBwd(dest, src, len);
- } else {
- copyForward(dest, src, len);
+inline fn copy16ToSmallLimit(
+ comptime small_limit: comptime_int,
+ dest: [*]u8,
+ src: [*]const u8,
+ len: usize,
+) bool {
+ @setRuntimeSafety(builtin.is_test);
+ inline for (2..(std.math.log2(small_limit) + 1) / 2 + 1) |p| {
+ const limit = 1 << (2 * p);
+ if (len < limit) {
+ copyRange4(limit / 4, dest, src, len);
+ return true;
+ }
}
+ return false;
}
-inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void {
- const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*;
+inline fn copyForwards(
+ noalias dest: [*]u8,
+ noalias src: [*]const u8,
+ len: usize,
+) void {
+ @setRuntimeSafety(builtin.is_test);
+ assert(len >= 2 * @sizeOf(Element));
+
+ dest[0..@sizeOf(Element)].* = src[0..@sizeOf(Element)].*;
+ const alignment_offset = @alignOf(Element) - @intFromPtr(src) % @alignOf(Element);
+ const n = len - alignment_offset;
+ const d = dest + alignment_offset;
+ const s = src + alignment_offset;
+
+ copyBlocksAlignedSource(@ptrCast(d), @alignCast(@ptrCast(s)), n);
+
+ // copy last `@sizeOf(Element)` bytes unconditionally, since block copy
+ // methods only copy a multiple of `@sizeOf(Element)` bytes.
+ const offset = len - @sizeOf(Element);
+ dest[offset..][0..@sizeOf(Element)].* = src[offset..][0..@sizeOf(Element)].*;
+}
- const N: usize = len & ~@as(usize, 127);
- var i: usize = 0;
+inline fn copyBlocksAlignedSource(
+ noalias dest: [*]align(1) Element,
+ noalias src: [*]const Element,
+ max_bytes: usize,
+) void {
+ copyBlocks(dest, src, max_bytes);
+}
- while (i < N) : (i += 128) {
- dest[i..][0..32].* = src[i..][0..32].*;
- dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*;
- dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*;
- dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*;
- }
+/// Copies the largest multiple of `@sizeOf(T)` bytes from `src` to `dest`,
+/// that is less than `max_bytes` where `T` is the child type of `src` and
+/// `dest`.
+inline fn copyBlocks(
+ noalias dest: anytype,
+ noalias src: anytype,
+ max_bytes: usize,
+) void {
+ @setRuntimeSafety(builtin.is_test);
+
+ const T = @typeInfo(@TypeOf(dest)).pointer.child;
+ comptime assert(T == @typeInfo(@TypeOf(src)).pointer.child);
- if (len - i <= 32) {
- @branchHint(.unlikely);
- dest[len - 32 ..][0..32].* = tail;
- } else {
- copyLong(dest[i..], src[i..], len - i);
+ const loop_count = max_bytes / @sizeOf(T);
+
+ for (dest[0..loop_count], src[0..loop_count]) |*d, s| {
+ d.* = s;
}
}
-inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void {
- var array: [5]@Vector(32, u8) = undefined;
- array[0] = src[len - 32 ..][0..32].*;
- inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*;
-
- const end: usize = (@intFromPtr(dest) + len - 32) & 31;
- const range = len - end;
- var s = src + range;
- var d = dest + range;
-
- while (@intFromPtr(s) > @intFromPtr(src + 128)) {
- // zig fmt: off
- const first = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*;
- const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*;
- const third = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*;
- const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*;
-
- @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).* = first;
- @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).* = second;
- @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).* = third;
- @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth;
- // zig fmt: on
-
- s -= 128;
- d -= 128;
- }
+/// copy `len` bytes from `src` to `dest`; `len` must be in the range
+/// `[copy_len, 4 * copy_len)`.
+inline fn copyRange4(
+ comptime copy_len: comptime_int,
+ noalias dest: [*]u8,
+ noalias src: [*]const u8,
+ len: usize,
+) void {
+ @setRuntimeSafety(builtin.is_test);
+ comptime assert(std.math.isPowerOfTwo(copy_len));
+ assert(len >= copy_len);
+ assert(len < 4 * copy_len);
+
+ const a = len & (copy_len * 2);
+ const b = a / 2;
+
+ const last = len - copy_len;
+ const pen = last - b;
+
+ dest[0..copy_len].* = src[0..copy_len].*;
+ dest[b..][0..copy_len].* = src[b..][0..copy_len].*;
+ dest[pen..][0..copy_len].* = src[pen..][0..copy_len].*;
+ dest[last..][0..copy_len].* = src[last..][0..copy_len].*;
+}
+
+test {
+ const S = struct {
+ fn testFunc(comptime copy_func: anytype) !void {
+ const max_len = 1024;
+ var buffer: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined;
+ for (&buffer, 0..) |*b, i| {
+ b.* = @intCast(i % 97);
+ }
+ var dest: [max_len + @alignOf(Element) - 1]u8 align(@alignOf(Element)) = undefined;
+
+ for (0..max_len) |copy_len| {
+ for (0..@alignOf(Element)) |s_offset| {
+ for (0..@alignOf(Element)) |d_offset| {
+ @memset(&dest, 0xff);
+ const s = buffer[s_offset..][0..copy_len];
+ const d = dest[d_offset..][0..copy_len];
+ _ = copy_func(@ptrCast(d.ptr), @ptrCast(s.ptr), s.len);
+ std.testing.expectEqualSlices(u8, s, d) catch |e| {
+ std.debug.print("error encountered for length={d}, s_offset={d}, d_offset={d}\n", .{
+ copy_len, s_offset, d_offset,
+ });
+ return e;
+ };
+ }
+ }
+ }
+ }
+ };
- inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec;
- dest[len - 32 ..][0..32].* = array[0];
+ try S.testFunc(memcpySmall);
+ try S.testFunc(memcpyFast);
}
diff --git a/lib/compiler_rt/memmove.zig b/lib/compiler_rt/memmove.zig
new file mode 100644
index 0000000000..408151e2a3
--- /dev/null
+++ b/lib/compiler_rt/memmove.zig
@@ -0,0 +1,170 @@
+const std = @import("std");
+const common = @import("./common.zig");
+const builtin = @import("builtin");
+
+comptime {
+ if (builtin.object_format != .c) {
+ const export_options: std.builtin.ExportOptions = .{
+ .name = "memmove",
+ .linkage = common.linkage,
+ .visibility = common.visibility,
+ };
+
+ if (builtin.mode == .ReleaseSmall)
+ @export(&memmoveSmall, export_options)
+ else
+ @export(&memmoveFast, export_options);
+ }
+}
+
+fn memmoveSmall(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
+ const dest = opt_dest.?;
+ const src = opt_src.?;
+
+ if (@intFromPtr(dest) < @intFromPtr(src)) {
+ for (0..len) |i| {
+ dest[i] = src[i];
+ }
+ } else {
+ for (0..len) |i| {
+ dest[len - 1 - i] = src[len - 1 - i];
+ }
+ }
+
+ return dest;
+}
+
+pub fn memmoveFast(opt_dest: ?[*]u8, opt_src: ?[*]const u8, len: usize) callconv(.C) ?[*]u8 {
+ // a port of https://github.com/facebook/folly/blob/1c8bc50e88804e2a7361a57cd9b551dd10f6c5fd/folly/memcpy.S
+ if (len == 0) {
+ @branchHint(.unlikely);
+ return opt_dest;
+ }
+
+ const dest = opt_dest.?;
+ const src = opt_src.?;
+
+ if (len < 8) {
+ @branchHint(.unlikely);
+ if (len == 1) {
+ @branchHint(.unlikely);
+ dest[0] = src[0];
+ } else if (len >= 4) {
+ @branchHint(.unlikely);
+ blockCopy(dest, src, 4, len);
+ } else {
+ blockCopy(dest, src, 2, len);
+ }
+ return dest;
+ }
+
+ if (len > 32) {
+ @branchHint(.unlikely);
+ if (len > 256) {
+ @branchHint(.unlikely);
+ copyMove(dest, src, len);
+ return dest;
+ }
+ copyLong(dest, src, len);
+ return dest;
+ }
+
+ if (len > 16) {
+ @branchHint(.unlikely);
+ blockCopy(dest, src, 16, len);
+ return dest;
+ }
+
+ blockCopy(dest, src, 8, len);
+
+ return dest;
+}
+
+inline fn blockCopy(dest: [*]u8, src: [*]const u8, block_size: comptime_int, len: usize) void {
+ const first = @as(*align(1) const @Vector(block_size, u8), src[0..block_size]).*;
+ const second = @as(*align(1) const @Vector(block_size, u8), src[len - block_size ..][0..block_size]).*;
+ dest[0..block_size].* = first;
+ dest[len - block_size ..][0..block_size].* = second;
+}
+
+inline fn copyLong(dest: [*]u8, src: [*]const u8, len: usize) void {
+ var array: [8]@Vector(32, u8) = undefined;
+
+ inline for (.{ 64, 128, 192, 256 }, 0..) |N, i| {
+ array[i * 2] = src[(N / 2) - 32 ..][0..32].*;
+ array[(i * 2) + 1] = src[len - N / 2 ..][0..32].*;
+
+ if (len <= N) {
+ @branchHint(.unlikely);
+ for (0..i + 1) |j| {
+ dest[j * 32 ..][0..32].* = array[j * 2];
+ dest[len - ((j * 32) + 32) ..][0..32].* = array[(j * 2) + 1];
+ }
+ return;
+ }
+ }
+}
+
+inline fn copyMove(dest: [*]u8, src: [*]const u8, len: usize) void {
+ if (@intFromPtr(src) >= @intFromPtr(dest)) {
+ @branchHint(.unlikely);
+ copyForward(dest, src, len);
+ } else if (@intFromPtr(src) + len > @intFromPtr(dest)) {
+ @branchHint(.unlikely);
+ overlapBwd(dest, src, len);
+ } else {
+ copyForward(dest, src, len);
+ }
+}
+
+inline fn copyForward(dest: [*]u8, src: [*]const u8, len: usize) void {
+ const tail: @Vector(32, u8) = src[len - 32 ..][0..32].*;
+
+ const N: usize = len & ~@as(usize, 127);
+ var i: usize = 0;
+
+ while (i < N) : (i += 128) {
+ dest[i..][0..32].* = src[i..][0..32].*;
+ dest[i + 32 ..][0..32].* = src[i + 32 ..][0..32].*;
+ dest[i + 64 ..][0..32].* = src[i + 64 ..][0..32].*;
+ dest[i + 96 ..][0..32].* = src[i + 96 ..][0..32].*;
+ }
+
+ if (len - i <= 32) {
+ @branchHint(.unlikely);
+ dest[len - 32 ..][0..32].* = tail;
+ } else {
+ copyLong(dest[i..], src[i..], len - i);
+ }
+}
+
+inline fn overlapBwd(dest: [*]u8, src: [*]const u8, len: usize) void {
+ var array: [5]@Vector(32, u8) = undefined;
+ array[0] = src[len - 32 ..][0..32].*;
+ inline for (1..5) |i| array[i] = src[(i - 1) << 5 ..][0..32].*;
+
+ const end: usize = (@intFromPtr(dest) + len - 32) & 31;
+ const range = len - end;
+ var s = src + range;
+ var d = dest + range;
+
+ while (@intFromPtr(s) > @intFromPtr(src + 128)) {
+ // zig fmt: off
+ const first = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 32)).*;
+ const second = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 64)).*;
+ const third = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 96)).*;
+ const fourth = @as(*align(1) const @Vector(32, u8), @ptrCast(s - 128)).*;
+
+ @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 32))).* = first;
+ @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 64))).* = second;
+ @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 96))).* = third;
+ @as(*align(32) @Vector(32, u8), @alignCast(@ptrCast(d - 128))).* = fourth;
+ // zig fmt: on
+
+ s -= 128;
+ d -= 128;
+ }
+
+ inline for (array[1..], 0..) |vec, i| dest[i * 32 ..][0..32].* = vec;
+ dest[len - 32 ..][0..32].* = array[0];
+}