about | summary | refs | log | tree | commit | diff
path: root/lib/std/os.zig
diff options
context:
space:
mode:
author: Andrew Kelley <andrew@ziglang.org> 2020-03-03 10:15:37 -0500
committer: GitHub <noreply@github.com> 2020-03-03 10:15:37 -0500
commit: 226b801830857a9075fdd8180739105bb73eed0e (patch)
tree: 48c684f434c6a44d2e0684adc69b31862dcb90ab /lib/std/os.zig
parent: 387418277a4964714ddaec3336a602ec87dde0f9 (diff)
parent: 9d6cc75ce3be9ab291614fc0d4361877e9200126 (diff)
download: zig-226b801830857a9075fdd8180739105bb73eed0e.tar.gz
download: zig-226b801830857a9075fdd8180739105bb73eed0e.zip
Merge pull request #4612 from ziglang/os-read-write-sendfile
std.os read/write functions + sendfile
Diffstat (limited to 'lib/std/os.zig')
-rw-r--r-- lib/std/os.zig 646
1 files changed, 469 insertions, 177 deletions
diff --git a/lib/std/os.zig b/lib/std/os.zig
index fbfef4ac4c..969e6407a6 100644
--- a/lib/std/os.zig
+++ b/lib/std/os.zig
@@ -298,6 +298,11 @@ pub const ReadError = error{
/// buf.len. If 0 bytes were read, that means EOF.
/// If the application has a global event loop enabled, EAGAIN is handled
/// via the event loop. Otherwise EAGAIN results in error.WouldBlock.
+///
+/// Linux has a limit on how many bytes may be transferred in one `read` call, which is `0x7ffff000`
+/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as
+/// well as stuffing the errno codes into the last `4096` values. This is noted on the `read` man page.
+/// For POSIX the limit is `math.maxInt(isize)`.
pub fn read(fd: fd_t, buf: []u8) ReadError!usize {
if (builtin.os.tag == .windows) {
return windows.ReadFile(fd, buf, null);
@@ -316,8 +321,15 @@ pub fn read(fd: fd_t, buf: []u8) ReadError!usize {
}
}
+ // Prevents EINVAL.
+ const max_count = switch (std.Target.current.os.tag) {
+ .linux => 0x7ffff000,
+ else => math.maxInt(isize),
+ };
+ const adjusted_len = math.min(max_count, buf.len);
+
while (true) {
- const rc = system.read(fd, buf.ptr, buf.len);
+ const rc = system.read(fd, buf.ptr, adjusted_len);
switch (errno(rc)) {
0 => return @intCast(usize, rc),
EINTR => continue,
@@ -352,32 +364,18 @@ pub fn read(fd: fd_t, buf: []u8) ReadError!usize {
/// * Windows
/// On these systems, the read races with concurrent writes to the same file descriptor.
pub fn readv(fd: fd_t, iov: []const iovec) ReadError!usize {
- if (builtin.os.tag == .windows) {
- // TODO batch these into parallel requests
- var off: usize = 0;
- var iov_i: usize = 0;
- var inner_off: usize = 0;
- while (true) {
- const v = iov[iov_i];
- const amt_read = try read(fd, v.iov_base[inner_off .. v.iov_len - inner_off]);
- off += amt_read;
- inner_off += amt_read;
- if (inner_off == v.len) {
- iov_i += 1;
- inner_off = 0;
- if (iov_i == iov.len) {
- return off;
- }
- }
- if (amt_read == 0) return off; // EOF
- } else unreachable; // TODO https://github.com/ziglang/zig/issues/707
+ if (std.Target.current.os.tag == .windows) {
+ // TODO does Windows have a way to read an io vector?
+ if (iov.len == 0) return @as(usize, 0);
+ const first = iov[0];
+ return read(fd, first.iov_base[0..first.iov_len]);
}
while (true) {
// TODO handle the case when iov_len is too large and get rid of this @intCast
- const rc = system.readv(fd, iov.ptr, @intCast(u32, iov.len));
+ const rc = system.readv(fd, iov.ptr, iov_count);
switch (errno(rc)) {
- 0 => return @bitCast(usize, rc),
+ 0 => return @intCast(usize, rc),
EINTR => continue,
EINVAL => unreachable,
EFAULT => unreachable,
@@ -397,6 +395,8 @@ pub fn readv(fd: fd_t, iov: []const iovec) ReadError!usize {
}
}
+pub const PReadError = ReadError || error{Unseekable};
+
/// Number of bytes read is returned. Upon reading end-of-file, zero is returned.
///
/// Retries when interrupted by a signal.
@@ -405,7 +405,7 @@ pub fn readv(fd: fd_t, iov: []const iovec) ReadError!usize {
/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`.
/// On Windows, if the application has a global event loop enabled, I/O Completion Ports are
/// used to perform the I/O. `error.WouldBlock` is not possible on Windows.
-pub fn pread(fd: fd_t, buf: []u8, offset: u64) ReadError!usize {
+pub fn pread(fd: fd_t, buf: []u8, offset: u64) PReadError!usize {
if (builtin.os.tag == .windows) {
return windows.ReadFile(fd, buf, offset);
}
@@ -429,6 +429,9 @@ pub fn pread(fd: fd_t, buf: []u8, offset: u64) ReadError!usize {
ENOBUFS => return error.SystemResources,
ENOMEM => return error.SystemResources,
ECONNRESET => return error.ConnectionResetByPeer,
+ ENXIO => return error.Unseekable,
+ ESPIPE => return error.Unseekable,
+ EOVERFLOW => return error.Unseekable,
else => |err| return unexpectedErrno(err),
}
}
@@ -448,75 +451,23 @@ pub fn pread(fd: fd_t, buf: []u8, offset: u64) ReadError!usize {
/// * Darwin
/// * Windows
/// On these systems, the read races with concurrent writes to the same file descriptor.
-pub fn preadv(fd: fd_t, iov: []const iovec, offset: u64) ReadError!usize {
- if (comptime std.Target.current.isDarwin()) {
- // Darwin does not have preadv but it does have pread.
- var off: usize = 0;
- var iov_i: usize = 0;
- var inner_off: usize = 0;
- while (true) {
- const v = iov[iov_i];
- const rc = darwin.pread(fd, v.iov_base + inner_off, v.iov_len - inner_off, offset + off);
- const err = darwin.getErrno(rc);
- switch (err) {
- 0 => {
- const amt_read = @bitCast(usize, rc);
- off += amt_read;
- inner_off += amt_read;
- if (inner_off == v.iov_len) {
- iov_i += 1;
- inner_off = 0;
- if (iov_i == iov.len) {
- return off;
- }
- }
- if (rc == 0) return off; // EOF
- continue;
- },
- EINTR => continue,
- EINVAL => unreachable,
- EFAULT => unreachable,
- ESPIPE => unreachable, // fd is not seekable
- EAGAIN => if (std.event.Loop.instance) |loop| {
- loop.waitUntilFdReadable(fd);
- continue;
- } else {
- return error.WouldBlock;
- },
- EBADF => unreachable, // always a race condition
- EIO => return error.InputOutput,
- EISDIR => return error.IsDir,
- ENOBUFS => return error.SystemResources,
- ENOMEM => return error.SystemResources,
- else => return unexpectedErrno(err),
- }
- }
+pub fn preadv(fd: fd_t, iov: []const iovec, offset: u64) PReadError!usize {
+ const have_pread_but_not_preadv = switch (std.Target.current.os.tag) {
+ .windows, .macosx, .ios, .watchos, .tvos => true,
+ else => false,
+ };
+ if (have_pread_but_not_preadv) {
+ // We could loop here; but proper usage of `preadv` must handle partial reads anyway.
+ // So we simply read into the first vector only.
+ if (iov.len == 0) return @as(usize, 0);
+ const first = iov[0];
+ return pread(fd, first.iov_base[0..first.iov_len], offset);
}
- if (builtin.os.tag == .windows) {
- // TODO batch these into parallel requests
- var off: usize = 0;
- var iov_i: usize = 0;
- var inner_off: usize = 0;
- while (true) {
- const v = iov[iov_i];
- const amt_read = try pread(fd, v.iov_base[inner_off .. v.iov_len - inner_off], offset + off);
- off += amt_read;
- inner_off += amt_read;
- if (inner_off == v.len) {
- iov_i += 1;
- inner_off = 0;
- if (iov_i == iov.len) {
- return off;
- }
- }
- if (amt_read == 0) return off; // EOF
- } else unreachable; // TODO https://github.com/ziglang/zig/issues/707
- }
+ const iov_count = math.cast(u31, iov.len) catch math.maxInt(u31);
while (true) {
- // TODO handle the case when iov_len is too large and get rid of this @intCast
- const rc = system.preadv(fd, iov.ptr, @intCast(u32, iov.len), offset);
+ const rc = system.preadv(fd, iov.ptr, iov_count, offset);
switch (errno(rc)) {
0 => return @bitCast(usize, rc),
EINTR => continue,
@@ -533,6 +484,9 @@ pub fn preadv(fd: fd_t, iov: []const iovec, offset: u64) ReadError!usize {
EISDIR => return error.IsDir,
ENOBUFS => return error.SystemResources,
ENOMEM => return error.SystemResources,
+ ENXIO => return error.Unseekable,
+ ESPIPE => return error.Unseekable,
+ EOVERFLOW => return error.Unseekable,
else => |err| return unexpectedErrno(err),
}
}
@@ -553,10 +507,28 @@ pub const WriteError = error{
WouldBlock,
} || UnexpectedError;
-/// Write to a file descriptor. Keeps trying if it gets interrupted.
-/// If the application has a global event loop enabled, EAGAIN is handled
-/// via the event loop. Otherwise EAGAIN results in error.WouldBlock.
-pub fn write(fd: fd_t, bytes: []const u8) WriteError!void {
+/// Write to a file descriptor.
+/// Retries when interrupted by a signal.
+/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero.
+///
+/// Note that a successful write() may transfer fewer bytes than supplied. Such partial writes can
+/// occur for various reasons; for example, because there was insufficient space on the disk
+/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or
+/// similar was interrupted by a signal handler after it had transferred some, but before it had
+/// transferred all of the requested bytes. In the event of a partial write, the caller can make
+/// another write() call to transfer the remaining bytes. The subsequent call will either
+/// transfer further bytes or may result in an error (e.g., if the disk is now full).
+///
+/// For POSIX systems, if the application has a global event loop enabled, EAGAIN is handled
+/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`.
+/// On Windows, if the application has a global event loop enabled, I/O Completion Ports are
+/// used to perform the I/O. `error.WouldBlock` is not possible on Windows.
+///
+/// Linux has a limit on how many bytes may be transferred in one `write` call, which is `0x7ffff000`
+/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as
+/// well as stuffing the errno codes into the last `4096` values. This is noted on the `write` man page.
+/// The corresponding POSIX limit is `math.maxInt(isize)`.
+pub fn write(fd: fd_t, bytes: []const u8) WriteError!usize {
if (builtin.os.tag == .windows) {
return windows.WriteFile(fd, bytes, null);
}
@@ -568,26 +540,21 @@ pub fn write(fd: fd_t, bytes: []const u8) WriteError!void {
}};
var nwritten: usize = undefined;
switch (wasi.fd_write(fd, &ciovs, ciovs.len, &nwritten)) {
- 0 => return,
+ 0 => return nwritten,
else => |err| return unexpectedErrno(err),
}
}
- // Linux can return EINVAL when write amount is > 0x7ffff000
- // See https://github.com/ziglang/zig/pull/743#issuecomment-363165856
- // TODO audit this. Shawn Landden says that this is not actually true.
- // if this logic should stay, move it to std.os.linux
- const max_bytes_len = 0x7ffff000;
+ const max_count = switch (std.Target.current.os.tag) {
+ .linux => 0x7ffff000,
+ else => math.maxInt(isize),
+ };
+ const adjusted_len = math.min(max_count, bytes.len);
- var index: usize = 0;
- while (index < bytes.len) {
- const amt_to_write = math.min(bytes.len - index, @as(usize, max_bytes_len));
- const rc = system.write(fd, bytes.ptr + index, amt_to_write);
+ while (true) {
+ const rc = system.write(fd, bytes.ptr, adjusted_len);
switch (errno(rc)) {
- 0 => {
- index += @intCast(usize, rc);
- continue;
- },
+ 0 => return @intCast(usize, rc),
EINTR => continue,
EINVAL => unreachable,
EFAULT => unreachable,
@@ -611,14 +578,36 @@ pub fn write(fd: fd_t, bytes: []const u8) WriteError!void {
}
/// Write multiple buffers to a file descriptor.
-/// If the application has a global event loop enabled, EAGAIN is handled
-/// via the event loop. Otherwise EAGAIN results in error.WouldBlock.
-pub fn writev(fd: fd_t, iov: []const iovec_const) WriteError!void {
+/// Retries when interrupted by a signal.
+/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero.
+///
+/// Note that a successful write() may transfer fewer bytes than supplied. Such partial writes can
+/// occur for various reasons; for example, because there was insufficient space on the disk
+/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or
+/// similar was interrupted by a signal handler after it had transferred some, but before it had
+/// transferred all of the requested bytes. In the event of a partial write, the caller can make
+/// another write() call to transfer the remaining bytes. The subsequent call will either
+/// transfer further bytes or may result in an error (e.g., if the disk is now full).
+///
+/// For POSIX systems, if the application has a global event loop enabled, EAGAIN is handled
+/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`.
+/// On Windows, if the application has a global event loop enabled, I/O Completion Ports are
+/// used to perform the I/O. `error.WouldBlock` is not possible on Windows.
+///
+/// If `iov.len` is larger than will fit in a `u31`, a partial write will occur.
+pub fn writev(fd: fd_t, iov: []const iovec_const) WriteError!usize {
+ if (std.Target.current.os.tag == .windows) {
+ // TODO does Windows have a way to write an io vector?
+ if (iov.len == 0) return @as(usize, 0);
+ const first = iov[0];
+ return write(fd, first.iov_base[0..first.iov_len]);
+ }
+
+ const iov_count = math.cast(u31, iov.len) catch math.maxInt(u31);
while (true) {
- // TODO handle the case when iov_len is too large and get rid of this @intCast
- const rc = system.writev(fd, iov.ptr, @intCast(u32, iov.len));
+ const rc = system.writev(fd, iov.ptr, iov_count);
switch (errno(rc)) {
- 0 => return,
+ 0 => return @intCast(usize, rc),
EINTR => continue,
EINVAL => unreachable,
EFAULT => unreachable,
@@ -641,23 +630,45 @@ pub fn writev(fd: fd_t, iov: []const iovec_const) WriteError!void {
}
}
+pub const PWriteError = WriteError || error{Unseekable};
+
/// Write to a file descriptor, with a position offset.
-///
/// Retries when interrupted by a signal.
+/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero.
+///
+/// Note that a successful write() may transfer fewer bytes than supplied. Such partial writes can
+/// occur for various reasons; for example, because there was insufficient space on the disk
+/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or
+/// similar was interrupted by a signal handler after it had transferred some, but before it had
+/// transferred all of the requested bytes. In the event of a partial write, the caller can make
+/// another write() call to transfer the remaining bytes. The subsequent call will either
+/// transfer further bytes or may result in an error (e.g., if the disk is now full).
///
/// For POSIX systems, if the application has a global event loop enabled, EAGAIN is handled
/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`.
/// On Windows, if the application has a global event loop enabled, I/O Completion Ports are
/// used to perform the I/O. `error.WouldBlock` is not possible on Windows.
-pub fn pwrite(fd: fd_t, bytes: []const u8, offset: u64) WriteError!void {
+///
+/// Linux has a limit on how many bytes may be transferred in one `pwrite` call, which is `0x7ffff000`
+/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as
+/// well as stuffing the errno codes into the last `4096` values. This is noted on the `write` man page.
+/// The corresponding POSIX limit is `math.maxInt(isize)`.
+pub fn pwrite(fd: fd_t, bytes: []const u8, offset: u64) PWriteError!usize {
if (std.Target.current.os.tag == .windows) {
return windows.WriteFile(fd, bytes, offset);
}
+ // Prevent EINVAL.
+ const max_count = switch (std.Target.current.os.tag) {
+ .linux => 0x7ffff000,
+ else => math.maxInt(isize),
+ };
+ const adjusted_len = math.min(max_count, bytes.len);
+
while (true) {
- const rc = system.pwrite(fd, bytes.ptr, bytes.len, offset);
+ const rc = system.pwrite(fd, bytes.ptr, adjusted_len, offset);
switch (errno(rc)) {
- 0 => return,
+ 0 => return @intCast(usize, rc),
EINTR => continue,
EINVAL => unreachable,
EFAULT => unreachable,
@@ -675,84 +686,54 @@ pub fn pwrite(fd: fd_t, bytes: []const u8, offset: u64) WriteError!void {
ENOSPC => return error.NoSpaceLeft,
EPERM => return error.AccessDenied,
EPIPE => return error.BrokenPipe,
+ ENXIO => return error.Unseekable,
+ ESPIPE => return error.Unseekable,
+ EOVERFLOW => return error.Unseekable,
else => |err| return unexpectedErrno(err),
}
}
}
/// Write multiple buffers to a file descriptor, with a position offset.
-///
/// Retries when interrupted by a signal.
+/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero.
+///
+/// Note that a successful write() may transfer fewer bytes than supplied. Such partial writes can
+/// occur for various reasons; for example, because there was insufficient space on the disk
+/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or
+/// similar was interrupted by a signal handler after it had transferred some, but before it had
+/// transferred all of the requested bytes. In the event of a partial write, the caller can make
+/// another write() call to transfer the remaining bytes. The subsequent call will either
+/// transfer further bytes or may result in an error (e.g., if the disk is now full).
///
/// If the application has a global event loop enabled, EAGAIN is handled
/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`.
///
-/// This operation is non-atomic on the following systems:
+/// The following systems do not have this syscall, and will return partial writes if more than one
+/// vector is provided:
/// * Darwin
/// * Windows
-/// On these systems, the write races with concurrent writes to the same file descriptor, and
-/// the file can be in a partially written state when an error occurs.
-pub fn pwritev(fd: fd_t, iov: []const iovec_const, offset: u64) WriteError!void {
- if (comptime std.Target.current.isDarwin()) {
- // Darwin does not have pwritev but it does have pwrite.
- var off: usize = 0;
- var iov_i: usize = 0;
- var inner_off: usize = 0;
- while (true) {
- const v = iov[iov_i];
- const rc = darwin.pwrite(fd, v.iov_base + inner_off, v.iov_len - inner_off, offset + off);
- const err = darwin.getErrno(rc);
- switch (err) {
- 0 => {
- const amt_written = @bitCast(usize, rc);
- off += amt_written;
- inner_off += amt_written;
- if (inner_off == v.iov_len) {
- iov_i += 1;
- inner_off = 0;
- if (iov_i == iov.len) {
- return;
- }
- }
- continue;
- },
- EINTR => continue,
- ESPIPE => unreachable, // `fd` is not seekable.
- EINVAL => unreachable,
- EFAULT => unreachable,
- EAGAIN => if (std.event.Loop.instance) |loop| {
- loop.waitUntilFdWritable(fd);
- continue;
- } else {
- return error.WouldBlock;
- },
- EBADF => unreachable, // Always a race condition.
- EDESTADDRREQ => unreachable, // `connect` was never called.
- EDQUOT => return error.DiskQuota,
- EFBIG => return error.FileTooBig,
- EIO => return error.InputOutput,
- ENOSPC => return error.NoSpaceLeft,
- EPERM => return error.AccessDenied,
- EPIPE => return error.BrokenPipe,
- else => return unexpectedErrno(err),
- }
- }
- }
+///
+/// If `iov.len` is larger than will fit in a `u31`, a partial write will occur.
+pub fn pwritev(fd: fd_t, iov: []const iovec_const, offset: u64) PWriteError!usize {
+ const have_pwrite_but_not_pwritev = switch (std.Target.current.os.tag) {
+ .windows, .macosx, .ios, .watchos, .tvos => true,
+ else => false,
+ };
- if (std.Target.current.os.tag == .windows) {
- var off = offset;
- for (iov) |item| {
- try pwrite(fd, item.iov_base[0..item.iov_len], off);
- off += buf.len;
- }
- return;
+ if (have_pwrite_but_not_pwritev) {
+ // We could loop here; but proper usage of `pwritev` must handle partial writes anyway.
+ // So we simply write the first vector only.
+ if (iov.len == 0) return @as(usize, 0);
+ const first = iov[0];
+ return pwrite(fd, first.iov_base[0..first.iov_len], offset);
}
+ const iov_count = math.cast(u31, iov.len) catch math.maxInt(u31);
while (true) {
- // TODO handle the case when iov_len is too large and get rid of this @intCast
- const rc = system.pwritev(fd, iov.ptr, @intCast(u32, iov.len), offset);
+ const rc = system.pwritev(fd, iov.ptr, iov_count, offset);
switch (errno(rc)) {
- 0 => return,
+ 0 => return @intCast(usize, rc),
EINTR => continue,
EINVAL => unreachable,
EFAULT => unreachable,
@@ -770,6 +751,9 @@ pub fn pwritev(fd: fd_t, iov: []const iovec_const, offset: u64) WriteError!void
ENOSPC => return error.NoSpaceLeft,
EPERM => return error.AccessDenied,
EPIPE => return error.BrokenPipe,
+ ENXIO => return error.Unseekable,
+ ESPIPE => return error.Unseekable,
+ EOVERFLOW => return error.Unseekable,
else => |err| return unexpectedErrno(err),
}
}
@@ -3389,7 +3373,6 @@ pub const SendError = error{
/// The socket type requires that message be sent atomically, and the size of the message
/// to be sent made this impossible. The message is not transmitted.
- ///
MessageTooBig,
/// The output queue for a network interface was full. This generally indicates that the
@@ -3498,6 +3481,315 @@ pub fn send(
return sendto(sockfd, buf, flags, null, 0);
}
+pub const SendFileError = PReadError || WriteError || SendError;
+
+fn count_iovec_bytes(iovs: []const iovec_const) usize {
+ var count: usize = 0;
+ for (iovs) |iov| {
+ count += iov.iov_len;
+ }
+ return count;
+}
+
+/// Transfer data between file descriptors, with optional headers and trailers.
+/// Returns the number of bytes written. This will be zero if `in_offset` falls beyond the end of the file.
+///
+/// The `sendfile` call copies `count` bytes from one file descriptor to another. When possible,
+/// this is done within the operating system kernel, which can provide better performance
+/// characteristics than transferring data from kernel to user space and back, such as with
+/// `read` and `write` calls. When `count` is `0`, it means to copy until the end of the input file has been
+/// reached. Note, however, that partial writes are still possible in this case.
+///
+/// `in_fd` must be a file descriptor opened for reading, and `out_fd` must be a file descriptor
+/// opened for writing. They may be any kind of file descriptor; however, if `in_fd` is not a regular
+/// file system file, it may cause this function to fall back to calling `read` and `write`, in which case
+/// atomicity guarantees no longer apply.
+///
+/// Copying begins reading at `in_offset`. The input file descriptor seek position is ignored and not updated.
+/// If the output file descriptor has a seek position, it is updated as bytes are written.
+///
+/// `flags` has different meanings per operating system; refer to the respective man pages.
+///
+/// These systems support atomically sending everything, including headers and trailers:
+/// * macOS
+/// * FreeBSD
+///
+/// These systems support in-kernel data copying, but headers and trailers are not sent atomically:
+/// * Linux
+///
+/// Other systems fall back to calling `read` / `write`.
+///
+/// Linux has a limit on how many bytes may be transferred in one `sendfile` call, which is `0x7ffff000`
+/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as
+/// well as stuffing the errno codes into the last `4096` values. This is cited on the `sendfile` man page.
+/// The corresponding POSIX limit on this is `math.maxInt(isize)`.
+pub fn sendfile(
+ out_fd: fd_t,
+ in_fd: fd_t,
+ in_offset: u64,
+ count: usize,
+ headers: []const iovec_const,
+ trailers: []const iovec_const,
+ flags: u32,
+) SendFileError!usize {
+ var header_done = false;
+ var total_written: usize = 0;
+
+ // Prevents EOVERFLOW.
+ const max_count = switch (std.Target.current.os.tag) {
+ .linux => 0x7ffff000,
+ else => math.maxInt(isize),
+ };
+
+ switch (std.Target.current.os.tag) {
+ .linux => sf: {
+ // sendfile() first appeared in Linux 2.2, glibc 2.1.
+ const call_sf = comptime if (builtin.link_libc)
+ std.c.versionCheck(.{ .major = 2, .minor = 1 }).ok
+ else
+ std.Target.current.os.version_range.linux.range.max.order(.{ .major = 2, .minor = 2 }) != .lt;
+ if (!call_sf) break :sf;
+
+ if (headers.len != 0) {
+ const amt = try writev(out_fd, headers);
+ total_written += amt;
+ if (amt < count_iovec_bytes(headers)) return total_written;
+ header_done = true;
+ }
+
+ // Here we match BSD behavior, making a zero count value send as many bytes as possible.
+ const adjusted_count = if (count == 0) max_count else math.min(count, max_count);
+
+ while (true) {
+ var offset: off_t = @bitCast(off_t, in_offset);
+ const rc = system.sendfile(out_fd, in_fd, &offset, adjusted_count);
+ switch (errno(rc)) {
+ 0 => {
+ const amt = @bitCast(usize, rc);
+ total_written += amt;
+ if (count == 0 and amt == 0) {
+ // We have detected EOF from `in_fd`.
+ break;
+ } else if (amt < count) {
+ return total_written;
+ } else {
+ break;
+ }
+ },
+
+ EBADF => unreachable, // Always a race condition.
+ EFAULT => unreachable, // Segmentation fault.
+ EOVERFLOW => unreachable, // We avoid passing too large of a `count`.
+ ENOTCONN => unreachable, // `out_fd` is an unconnected socket.
+
+ EINVAL, ENOSYS => {
+ // EINVAL could be any of the following situations:
+ // * Descriptor is not valid or locked
+ // * an mmap(2)-like operation is not available for in_fd
+ // * count is negative
+ // * out_fd has the O_APPEND flag set
+ // Because of the "mmap(2)-like operation" possibility, we fall back to doing read/write
+ // manually, the same as ENOSYS.
+ break :sf;
+ },
+ EAGAIN => if (std.event.Loop.instance) |loop| {
+ loop.waitUntilFdWritable(out_fd);
+ continue;
+ } else {
+ return error.WouldBlock;
+ },
+ EIO => return error.InputOutput,
+ EPIPE => return error.BrokenPipe,
+ ENOMEM => return error.SystemResources,
+ ENXIO => return error.Unseekable,
+ ESPIPE => return error.Unseekable,
+ else => |err| {
+ const discard = unexpectedErrno(err);
+ break :sf;
+ },
+ }
+ }
+
+ if (trailers.len != 0) {
+ total_written += try writev(out_fd, trailers);
+ }
+
+ return total_written;
+ },
+ .freebsd => sf: {
+ var hdtr_data: std.c.sf_hdtr = undefined;
+ var hdtr: ?*std.c.sf_hdtr = null;
+ if (headers.len != 0 or trailers.len != 0) {
+ // Here we carefully avoid `@intCast` by returning partial writes when
+ // too many io vectors are provided.
+ const hdr_cnt = math.cast(u31, headers.len) catch math.maxInt(u31);
+ if (headers.len > hdr_cnt) return writev(out_fd, headers);
+
+ const trl_cnt = math.cast(u31, trailers.len) catch math.maxInt(u31);
+
+ hdtr_data = std.c.sf_hdtr{
+ .headers = headers.ptr,
+ .hdr_cnt = hdr_cnt,
+ .trailers = trailers.ptr,
+ .trl_cnt = trl_cnt,
+ };
+ hdtr = &hdtr_data;
+ }
+
+ const adjusted_count = math.min(count, max_count);
+
+ while (true) {
+ var sbytes: off_t = undefined;
+ const err = errno(system.sendfile(out_fd, in_fd, in_offset, adjusted_count, hdtr, &sbytes, flags));
+ const amt = @bitCast(usize, sbytes);
+ switch (err) {
+ 0 => return amt,
+
+ EBADF => unreachable, // Always a race condition.
+ EFAULT => unreachable, // Segmentation fault.
+ ENOTCONN => unreachable, // `out_fd` is an unconnected socket.
+
+ EINVAL, EOPNOTSUPP, ENOTSOCK, ENOSYS => {
+ // EINVAL could be any of the following situations:
+ // * The fd argument is not a regular file.
+ // * The s argument is not a SOCK_STREAM type socket.
+ // * The offset argument is negative.
+ // Because of some of these possibilities, we fall back to doing read/write
+ // manually, the same as ENOSYS.
+ break :sf;
+ },
+
+ EINTR => if (amt != 0) return amt else continue,
+
+ EAGAIN => if (amt != 0) {
+ return amt;
+ } else if (std.event.Loop.instance) |loop| {
+ loop.waitUntilFdWritable(out_fd);
+ continue;
+ } else {
+ return error.WouldBlock;
+ },
+
+ EBUSY => if (amt != 0) {
+ return amt;
+ } else if (std.event.Loop.instance) |loop| {
+ loop.waitUntilFdReadable(in_fd);
+ continue;
+ } else {
+ return error.WouldBlock;
+ },
+
+ EIO => return error.InputOutput,
+ ENOBUFS => return error.SystemResources,
+ EPIPE => return error.BrokenPipe,
+
+ else => {
+ const discard = unexpectedErrno(err);
+ if (amt != 0) {
+ return amt;
+ } else {
+ break :sf;
+ }
+ },
+ }
+ }
+ },
+ .macosx, .ios, .tvos, .watchos => sf: {
+ var hdtr_data: std.c.sf_hdtr = undefined;
+ var hdtr: ?*std.c.sf_hdtr = null;
+ if (headers.len != 0 or trailers.len != 0) {
+ // Here we carefully avoid `@intCast` by returning partial writes when
+ // too many io vectors are provided.
+ const hdr_cnt = math.cast(u31, headers.len) catch math.maxInt(u31);
+ if (headers.len > hdr_cnt) return writev(out_fd, headers);
+
+ const trl_cnt = math.cast(u31, trailers.len) catch math.maxInt(u31);
+
+ hdtr_data = std.c.sf_hdtr{
+ .headers = headers.ptr,
+ .hdr_cnt = hdr_cnt,
+ .trailers = trailers.ptr,
+ .trl_cnt = trl_cnt,
+ };
+ hdtr = &hdtr_data;
+ }
+
+ const adjusted_count = math.min(count, @as(u63, max_count));
+
+ while (true) {
+ var sbytes: off_t = adjusted_count;
+ const signed_offset = @bitCast(i64, in_offset);
+ const err = errno(system.sendfile(out_fd, in_fd, signed_offset, &sbytes, hdtr, flags));
+ const amt = @bitCast(usize, sbytes);
+ switch (err) {
+ 0 => return amt,
+
+ EBADF => unreachable, // Always a race condition.
+ EFAULT => unreachable, // Segmentation fault.
+ EINVAL => unreachable,
+ ENOTCONN => unreachable, // `out_fd` is an unconnected socket.
+
+ ENOTSUP, ENOTSOCK, ENOSYS => break :sf,
+
+ EINTR => if (amt != 0) return amt else continue,
+
+ EAGAIN => if (amt != 0) {
+ return amt;
+ } else if (std.event.Loop.instance) |loop| {
+ loop.waitUntilFdWritable(out_fd);
+ continue;
+ } else {
+ return error.WouldBlock;
+ },
+
+ EIO => return error.InputOutput,
+ EPIPE => return error.BrokenPipe,
+
+ else => {
+ const discard = unexpectedErrno(err);
+ if (amt != 0) {
+ return amt;
+ } else {
+ break :sf;
+ }
+ },
+ }
+ }
+ },
+ else => {}, // fall back to read/write
+ }
+
+ if (headers.len != 0 and !header_done) {
+ const amt = try writev(out_fd, headers);
+ total_written += amt;
+ if (amt < count_iovec_bytes(headers)) return total_written;
+ }
+
+ rw: {
+ var buf: [8 * 4096]u8 = undefined;
+ // Here we match BSD behavior, making a zero count value send as many bytes as possible.
+ const adjusted_count = if (count == 0) buf.len else math.min(buf.len, count);
+ const amt_read = try pread(in_fd, buf[0..adjusted_count], in_offset);
+ if (amt_read == 0) {
+ if (count == 0) {
+ // We have detected EOF from `in_fd`.
+ break :rw;
+ } else {
+ return total_written;
+ }
+ }
+ const amt_written = try write(out_fd, buf[0..amt_read]);
+ total_written += amt_written;
+ if (amt_written < count or count == 0) return total_written;
+ }
+
+ if (trailers.len != 0) {
+ total_written += try writev(out_fd, trailers);
+ }
+
+ return total_written;
+}
+
pub const PollError = error{
/// The kernel had no space to allocate file descriptor tables.
SystemResources,