diff options
| author | Andrew Kelley <andrew@ziglang.org> | 2020-03-03 02:03:22 -0500 |
|---|---|---|
| committer | Andrew Kelley <andrew@ziglang.org> | 2020-03-03 02:25:26 -0500 |
| commit | c81345c8aec56a108f6f98001666a1552d65ce85 (patch) | |
| tree | 6a18842256a157896d175d4e254811b9bf478090 /lib/std/os.zig | |
| parent | bd287dd1942f0a72e6bd9dc8475bd4e7d34fa5f8 (diff) | |
| download | zig-c81345c8aec56a108f6f98001666a1552d65ce85.tar.gz zig-c81345c8aec56a108f6f98001666a1552d65ce85.zip | |
breaking: std.os read/write functions + sendfile
* rework os.sendfile and add macosx support, and a fallback
implementation for any OS.
* fix sendto compile error
* std.os write functions support partial writes. closes #3443.
* std.os pread / pwrite functions can now return `error.Unseekable`.
* std.fs.File read/write functions now have readAll/writeAll variants
which loop to complete operations even when partial reads/writes
happen.
* Audit std.os read/write functions with respect to Linux returning
EINVAL for lengths greater than 0x7ffff000.
* std.os read/write shim functions do not unnecessarily loop. Since
partial reads/writes are part of the API, the caller will be forced
to loop anyway, and so that would just be code bloat.
* Improve doc comments
* Add a non-trivial test for std.os.sendfile
* Fix std.os.pread on 32 bit Linux
* Add missing SYS_sendfile bit on aarch64
Diffstat (limited to 'lib/std/os.zig')
| -rw-r--r-- | lib/std/os.zig | 722 |
1 files changed, 449 insertions, 273 deletions
diff --git a/lib/std/os.zig b/lib/std/os.zig index 2015b52d2f..8913a1599f 100644 --- a/lib/std/os.zig +++ b/lib/std/os.zig @@ -298,6 +298,11 @@ pub const ReadError = error{ /// buf.len. If 0 bytes were read, that means EOF. /// If the application has a global event loop enabled, EAGAIN is handled /// via the event loop. Otherwise EAGAIN results in error.WouldBlock. +/// +/// Linux has a limit on how many bytes may be transferred in one `read` call, which is `0x7ffff000` +/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as +/// well as stuffing the errno codes into the last `4096` values. This is noted on the `read` man page. +/// For POSIX the limit is `math.maxInt(isize)`. pub fn read(fd: fd_t, buf: []u8) ReadError!usize { if (builtin.os.tag == .windows) { return windows.ReadFile(fd, buf, null); @@ -316,8 +321,15 @@ pub fn read(fd: fd_t, buf: []u8) ReadError!usize { } } + // Prevents EINVAL. + const max_count = switch (std.Target.current.os.tag) { + .linux => 0x7ffff000, + else => math.maxInt(isize), + }; + const adjusted_len = math.min(max_count, buf.len); + while (true) { - const rc = system.read(fd, buf.ptr, buf.len); + const rc = system.read(fd, buf.ptr, adjusted_len); switch (errno(rc)) { 0 => return @intCast(usize, rc), EINTR => continue, @@ -352,32 +364,18 @@ pub fn read(fd: fd_t, buf: []u8) ReadError!usize { /// * Windows /// On these systems, the read races with concurrent writes to the same file descriptor. pub fn readv(fd: fd_t, iov: []const iovec) ReadError!usize { - if (builtin.os.tag == .windows) { - // TODO batch these into parallel requests - var off: usize = 0; - var iov_i: usize = 0; - var inner_off: usize = 0; - while (true) { - const v = iov[iov_i]; - const amt_read = try read(fd, v.iov_base[inner_off .. 
v.iov_len - inner_off]); - off += amt_read; - inner_off += amt_read; - if (inner_off == v.len) { - iov_i += 1; - inner_off = 0; - if (iov_i == iov.len) { - return off; - } - } - if (amt_read == 0) return off; // EOF - } else unreachable; // TODO https://github.com/ziglang/zig/issues/707 + if (std.Target.current.os.tag == .windows) { + // TODO does Windows have a way to read an io vector? + if (iov.len == 0) return @as(usize, 0); + const first = iov[0]; + return read(fd, first.iov_base[0..first.iov_len]); } while (true) { // TODO handle the case when iov_len is too large and get rid of this @intCast - const rc = system.readv(fd, iov.ptr, @intCast(u32, iov.len)); + const rc = system.readv(fd, iov.ptr, iov_count); switch (errno(rc)) { - 0 => return @bitCast(usize, rc), + 0 => return @intCast(usize, rc), EINTR => continue, EINVAL => unreachable, EFAULT => unreachable, @@ -397,6 +395,8 @@ pub fn readv(fd: fd_t, iov: []const iovec) ReadError!usize { } } +pub const PReadError = ReadError || error{Unseekable}; + /// Number of bytes read is returned. Upon reading end-of-file, zero is returned. /// /// Retries when interrupted by a signal. @@ -405,7 +405,7 @@ pub fn readv(fd: fd_t, iov: []const iovec) ReadError!usize { /// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`. /// On Windows, if the application has a global event loop enabled, I/O Completion Ports are /// used to perform the I/O. `error.WouldBlock` is not possible on Windows. 
-pub fn pread(fd: fd_t, buf: []u8, offset: u64) ReadError!usize { +pub fn pread(fd: fd_t, buf: []u8, offset: u64) PReadError!usize { if (builtin.os.tag == .windows) { return windows.ReadFile(fd, buf, offset); } @@ -429,6 +429,9 @@ pub fn pread(fd: fd_t, buf: []u8, offset: u64) ReadError!usize { ENOBUFS => return error.SystemResources, ENOMEM => return error.SystemResources, ECONNRESET => return error.ConnectionResetByPeer, + ENXIO => return error.Unseekable, + ESPIPE => return error.Unseekable, + EOVERFLOW => return error.Unseekable, else => |err| return unexpectedErrno(err), } } @@ -448,75 +451,23 @@ pub fn pread(fd: fd_t, buf: []u8, offset: u64) ReadError!usize { /// * Darwin /// * Windows /// On these systems, the read races with concurrent writes to the same file descriptor. -pub fn preadv(fd: fd_t, iov: []const iovec, offset: u64) ReadError!usize { - if (comptime std.Target.current.isDarwin()) { - // Darwin does not have preadv but it does have pread. - var off: usize = 0; - var iov_i: usize = 0; - var inner_off: usize = 0; - while (true) { - const v = iov[iov_i]; - const rc = darwin.pread(fd, v.iov_base + inner_off, v.iov_len - inner_off, offset + off); - const err = darwin.getErrno(rc); - switch (err) { - 0 => { - const amt_read = @bitCast(usize, rc); - off += amt_read; - inner_off += amt_read; - if (inner_off == v.iov_len) { - iov_i += 1; - inner_off = 0; - if (iov_i == iov.len) { - return off; - } - } - if (rc == 0) return off; // EOF - continue; - }, - EINTR => continue, - EINVAL => unreachable, - EFAULT => unreachable, - ESPIPE => unreachable, // fd is not seekable - EAGAIN => if (std.event.Loop.instance) |loop| { - loop.waitUntilFdReadable(fd); - continue; - } else { - return error.WouldBlock; - }, - EBADF => unreachable, // always a race condition - EIO => return error.InputOutput, - EISDIR => return error.IsDir, - ENOBUFS => return error.SystemResources, - ENOMEM => return error.SystemResources, - else => return unexpectedErrno(err), - } - } +pub fn 
preadv(fd: fd_t, iov: []const iovec, offset: u64) PReadError!usize { + const have_pread_but_not_preadv = switch (std.Target.current.os.tag) { + .windows, .macosx, .ios, .watchos, .tvos => true, + else => false, + }; + if (have_pread_but_not_preadv) { + // We could loop here; but proper usage of `preadv` must handle partial reads anyway. + // So we simply read into the first vector only. + if (iov.len == 0) return @as(usize, 0); + const first = iov[0]; + return pread(fd, first.iov_base[0..first.iov_len], offset); } - if (builtin.os.tag == .windows) { - // TODO batch these into parallel requests - var off: usize = 0; - var iov_i: usize = 0; - var inner_off: usize = 0; - while (true) { - const v = iov[iov_i]; - const amt_read = try pread(fd, v.iov_base[inner_off .. v.iov_len - inner_off], offset + off); - off += amt_read; - inner_off += amt_read; - if (inner_off == v.len) { - iov_i += 1; - inner_off = 0; - if (iov_i == iov.len) { - return off; - } - } - if (amt_read == 0) return off; // EOF - } else unreachable; // TODO https://github.com/ziglang/zig/issues/707 - } + const iov_count = math.cast(u31, iov.len) catch math.maxInt(u31); while (true) { - // TODO handle the case when iov_len is too large and get rid of this @intCast - const rc = system.preadv(fd, iov.ptr, @intCast(u32, iov.len), offset); + const rc = system.preadv(fd, iov.ptr, iov_count, offset); switch (errno(rc)) { 0 => return @bitCast(usize, rc), EINTR => continue, @@ -533,6 +484,9 @@ pub fn preadv(fd: fd_t, iov: []const iovec, offset: u64) ReadError!usize { EISDIR => return error.IsDir, ENOBUFS => return error.SystemResources, ENOMEM => return error.SystemResources, + ENXIO => return error.Unseekable, + ESPIPE => return error.Unseekable, + EOVERFLOW => return error.Unseekable, else => |err| return unexpectedErrno(err), } } @@ -553,10 +507,28 @@ pub const WriteError = error{ WouldBlock, } || UnexpectedError; -/// Write to a file descriptor. Keeps trying if it gets interrupted. 
-/// If the application has a global event loop enabled, EAGAIN is handled -/// via the event loop. Otherwise EAGAIN results in error.WouldBlock. -pub fn write(fd: fd_t, bytes: []const u8) WriteError!void { +/// Write to a file descriptor. +/// Retries when interrupted by a signal. +/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero. +/// +/// Note that a successful write() may transfer fewer than count bytes. Such partial writes can +/// occur for various reasons; for example, because there was insufficient space on the disk +/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or +/// similar was interrupted by a signal handler after it had transferred some, but before it had +/// transferred all of the requested bytes. In the event of a partial write, the caller can make +/// another write() call to transfer the remaining bytes. The subsequent call will either +/// transfer further bytes or may result in an error (e.g., if the disk is now full). +/// +/// For POSIX systems, if the application has a global event loop enabled, EAGAIN is handled +/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`. +/// On Windows, if the application has a global event loop enabled, I/O Completion Ports are +/// used to perform the I/O. `error.WouldBlock` is not possible on Windows. +/// +/// Linux has a limit on how many bytes may be transferred in one `write` call, which is `0x7ffff000` +/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as +/// well as stuffing the errno codes into the last `4096` values. This is noted on the `write` man page. +/// The corresponding POSIX limit is `math.maxInt(isize)`. 
+pub fn write(fd: fd_t, bytes: []const u8) WriteError!usize { if (builtin.os.tag == .windows) { return windows.WriteFile(fd, bytes, null); } @@ -568,26 +540,21 @@ pub fn write(fd: fd_t, bytes: []const u8) WriteError!void { }}; var nwritten: usize = undefined; switch (wasi.fd_write(fd, &ciovs, ciovs.len, &nwritten)) { - 0 => return, + 0 => return nwritten, else => |err| return unexpectedErrno(err), } } - // Linux can return EINVAL when write amount is > 0x7ffff000 - // See https://github.com/ziglang/zig/pull/743#issuecomment-363165856 - // TODO audit this. Shawn Landden says that this is not actually true. - // if this logic should stay, move it to std.os.linux - const max_bytes_len = 0x7ffff000; + const max_count = switch (std.Target.current.os.tag) { + .linux => 0x7ffff000, + else => math.maxInt(isize), + }; + const adjusted_len = math.min(max_count, bytes.len); - var index: usize = 0; - while (index < bytes.len) { - const amt_to_write = math.min(bytes.len - index, @as(usize, max_bytes_len)); - const rc = system.write(fd, bytes.ptr + index, amt_to_write); + while (true) { + const rc = system.write(fd, bytes.ptr, adjusted_len); switch (errno(rc)) { - 0 => { - index += @intCast(usize, rc); - continue; - }, + 0 => return @intCast(usize, rc), EINTR => continue, EINVAL => unreachable, EFAULT => unreachable, @@ -611,14 +578,36 @@ pub fn write(fd: fd_t, bytes: []const u8) WriteError!void { } /// Write multiple buffers to a file descriptor. -/// If the application has a global event loop enabled, EAGAIN is handled -/// via the event loop. Otherwise EAGAIN results in error.WouldBlock. -pub fn writev(fd: fd_t, iov: []const iovec_const) WriteError!void { +/// Retries when interrupted by a signal. +/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero. +/// +/// Note that a successful write() may transfer fewer bytes than supplied. 
Such partial writes can +/// occur for various reasons; for example, because there was insufficient space on the disk +/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or +/// similar was interrupted by a signal handler after it had transferred some, but before it had +/// transferred all of the requested bytes. In the event of a partial write, the caller can make +/// another write() call to transfer the remaining bytes. The subsequent call will either +/// transfer further bytes or may result in an error (e.g., if the disk is now full). +/// +/// For POSIX systems, if the application has a global event loop enabled, EAGAIN is handled +/// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`. +/// On Windows, if the application has a global event loop enabled, I/O Completion Ports are +/// used to perform the I/O. `error.WouldBlock` is not possible on Windows. +/// +/// If `iov.len` is larger than will fit in a `u31`, a partial write will occur. +pub fn writev(fd: fd_t, iov: []const iovec_const) WriteError!usize { + if (std.Target.current.os.tag == .windows) { + // TODO does Windows have a way to write an io vector? + if (iov.len == 0) return @as(usize, 0); + const first = iov[0]; + return write(fd, first.iov_base[0..first.iov_len]); + } + + const iov_count = math.cast(u31, iov.len) catch math.maxInt(u31); while (true) { - // TODO handle the case when iov_len is too large and get rid of this @intCast - const rc = system.writev(fd, iov.ptr, @intCast(u32, iov.len)); + const rc = system.writev(fd, iov.ptr, iov_count); switch (errno(rc)) { - 0 => return, + 0 => return @intCast(usize, rc), EINTR => continue, EINVAL => unreachable, EFAULT => unreachable, @@ -641,23 +630,45 @@ pub fn writev(fd: fd_t, iov: []const iovec_const) WriteError!void { } } +pub const PWriteError = WriteError || error{Unseekable}; + /// Write to a file descriptor, with a position offset. -/// /// Retries when interrupted by a signal. 
+/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero. +/// +/// Note that a successful write() may transfer fewer bytes than supplied. Such partial writes can +/// occur for various reasons; for example, because there was insufficient space on the disk +/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or +/// similar was interrupted by a signal handler after it had transferred some, but before it had +/// transferred all of the requested bytes. In the event of a partial write, the caller can make +/// another write() call to transfer the remaining bytes. The subsequent call will either +/// transfer further bytes or may result in an error (e.g., if the disk is now full). /// /// For POSIX systems, if the application has a global event loop enabled, EAGAIN is handled /// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`. /// On Windows, if the application has a global event loop enabled, I/O Completion Ports are /// used to perform the I/O. `error.WouldBlock` is not possible on Windows. -pub fn pwrite(fd: fd_t, bytes: []const u8, offset: u64) WriteError!void { +/// +/// Linux has a limit on how many bytes may be transferred in one `pwrite` call, which is `0x7ffff000` +/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as +/// well as stuffing the errno codes into the last `4096` values. This is noted on the `write` man page. +/// The corresponding POSIX limit is `math.maxInt(isize)`. +pub fn pwrite(fd: fd_t, bytes: []const u8, offset: u64) PWriteError!usize { if (std.Target.current.os.tag == .windows) { return windows.WriteFile(fd, bytes, offset); } + // Prevent EINVAL. 
+ const max_count = switch (std.Target.current.os.tag) { + .linux => 0x7ffff000, + else => math.maxInt(isize), + }; + const adjusted_len = math.min(max_count, bytes.len); + while (true) { - const rc = system.pwrite(fd, bytes.ptr, bytes.len, offset); + const rc = system.pwrite(fd, bytes.ptr, adjusted_len, offset); switch (errno(rc)) { - 0 => return, + 0 => return @intCast(usize, rc), EINTR => continue, EINVAL => unreachable, EFAULT => unreachable, @@ -675,84 +686,54 @@ pub fn pwrite(fd: fd_t, bytes: []const u8, offset: u64) WriteError!void { ENOSPC => return error.NoSpaceLeft, EPERM => return error.AccessDenied, EPIPE => return error.BrokenPipe, + ENXIO => return error.Unseekable, + ESPIPE => return error.Unseekable, + EOVERFLOW => return error.Unseekable, else => |err| return unexpectedErrno(err), } } } /// Write multiple buffers to a file descriptor, with a position offset. -/// /// Retries when interrupted by a signal. +/// Returns the number of bytes written. If nonzero bytes were supplied, this will be nonzero. +/// +/// Note that a successful write() may transfer fewer than count bytes. Such partial writes can +/// occur for various reasons; for example, because there was insufficient space on the disk +/// device to write all of the requested bytes, or because a blocked write() to a socket, pipe, or +/// similar was interrupted by a signal handler after it had transferred some, but before it had +/// transferred all of the requested bytes. In the event of a partial write, the caller can make +/// another write() call to transfer the remaining bytes. The subsequent call will either +/// transfer further bytes or may result in an error (e.g., if the disk is now full). /// /// If the application has a global event loop enabled, EAGAIN is handled /// via the event loop. Otherwise EAGAIN results in `error.WouldBlock`. 
/// -/// This operation is non-atomic on the following systems: +/// The following systems do not have this syscall, and will return partial writes if more than one +/// vector is provided: /// * Darwin /// * Windows -/// On these systems, the write races with concurrent writes to the same file descriptor, and -/// the file can be in a partially written state when an error occurs. -pub fn pwritev(fd: fd_t, iov: []const iovec_const, offset: u64) WriteError!void { - if (comptime std.Target.current.isDarwin()) { - // Darwin does not have pwritev but it does have pwrite. - var off: usize = 0; - var iov_i: usize = 0; - var inner_off: usize = 0; - while (true) { - const v = iov[iov_i]; - const rc = darwin.pwrite(fd, v.iov_base + inner_off, v.iov_len - inner_off, offset + off); - const err = darwin.getErrno(rc); - switch (err) { - 0 => { - const amt_written = @bitCast(usize, rc); - off += amt_written; - inner_off += amt_written; - if (inner_off == v.iov_len) { - iov_i += 1; - inner_off = 0; - if (iov_i == iov.len) { - return; - } - } - continue; - }, - EINTR => continue, - ESPIPE => unreachable, // `fd` is not seekable. - EINVAL => unreachable, - EFAULT => unreachable, - EAGAIN => if (std.event.Loop.instance) |loop| { - loop.waitUntilFdWritable(fd); - continue; - } else { - return error.WouldBlock; - }, - EBADF => unreachable, // Always a race condition. - EDESTADDRREQ => unreachable, // `connect` was never called. - EDQUOT => return error.DiskQuota, - EFBIG => return error.FileTooBig, - EIO => return error.InputOutput, - ENOSPC => return error.NoSpaceLeft, - EPERM => return error.AccessDenied, - EPIPE => return error.BrokenPipe, - else => return unexpectedErrno(err), - } - } - } +/// +/// If `iov.len` is larger than will fit in a `u31`, a partial write will occur. 
+pub fn pwritev(fd: fd_t, iov: []const iovec_const, offset: u64) PWriteError!usize { + const have_pwrite_but_not_pwritev = switch (std.Target.current.os.tag) { + .windows, .macosx, .ios, .watchos, .tvos => true, + else => false, + }; - if (std.Target.current.os.tag == .windows) { - var off = offset; - for (iov) |item| { - try pwrite(fd, item.iov_base[0..item.iov_len], off); - off += buf.len; - } - return; + if (have_pwrite_but_not_pwritev) { + // We could loop here; but proper usage of `pwritev` must handle partial writes anyway. + // So we simply write the first vector only. + if (iov.len == 0) return @as(usize, 0); + const first = iov[0]; + return pwrite(fd, first.iov_base[0..first.iov_len], offset); } + const iov_count = math.cast(u31, iov.len) catch math.maxInt(u31); while (true) { - // TODO handle the case when iov_len is too large and get rid of this @intCast - const rc = system.pwritev(fd, iov.ptr, @intCast(u32, iov.len), offset); + const rc = system.pwritev(fd, iov.ptr, iov_count, offset); switch (errno(rc)) { - 0 => return, + 0 => return @intCast(usize, rc), EINTR => continue, EINVAL => unreachable, EFAULT => unreachable, @@ -770,6 +751,9 @@ pub fn pwritev(fd: fd_t, iov: []const iovec_const, offset: u64) WriteError!void ENOSPC => return error.NoSpaceLeft, EPERM => return error.AccessDenied, EPIPE => return error.BrokenPipe, + ENXIO => return error.Unseekable, + ESPIPE => return error.Unseekable, + EOVERFLOW => return error.Unseekable, else => |err| return unexpectedErrno(err), } } @@ -3389,7 +3373,6 @@ pub const SendError = error{ /// The socket type requires that message be sent atomically, and the size of the message /// to be sent made this impossible. The message is not transmitted. - /// MessageTooBig, /// The output queue for a network interface was full. 
This generally indicates that the @@ -3498,119 +3481,312 @@ pub fn send( return sendto(sockfd, buf, flags, null, 0); } -pub const SendFileError = error{ - /// There was an unspecified error while reading from infd. - InputOutput, - - /// There was insufficient resources for processing. - SystemResources, - - /// The value provided for count overflows the maximum size of either - /// infd or outfd. - Overflow, - - /// Offset was provided, but infd is not seekable. - Unseekable, +pub const SendFileError = PReadError || WriteError || SendError; - /// The outfd is marked nonblocking and the requested operation would block, and - /// there is no global event loop configured. - WouldBlock, -} || WriteError || UnexpectedError; - -pub const sf_hdtr = struct { - headers: []iovec_const, - trailers: []iovec_const, -}; +fn count_iovec_bytes(iovs: []const iovec_const) usize { + var count: usize = 0; + for (iovs) |iov| { + count += iov.iov_len; + } + return count; +} -/// Transfer data between file descriptors. +/// Transfer data between file descriptors, with optional headers and trailers. +/// Returns the number of bytes written. This will be zero if `in_offset` falls beyond the end of the file. +/// +/// The `sendfile` call copies `count` bytes from one file descriptor to another. When possible, +/// this is done within the operating system kernel, which can provide better performance +/// characteristics than transferring data from kernel to user space and back, such as with +/// `read` and `write` calls. When `count` is `0`, it means to copy until the end of the input file has been +/// reached. Note, however, that partial writes are still possible in this case. +/// +/// `in_fd` must be a file descriptor opened for reading, and `out_fd` must be a file descriptor +/// opened for writing. 
They may be any kind of file descriptor; however, if `in_fd` is not a regular +/// file system file, it may cause this function to fall back to calling `read` and `write`, in which case +/// atomicity guarantees no longer apply. +/// +/// Copying begins reading at `in_offset`. The input file descriptor seek position is ignored and not updated. +/// If the output file descriptor has a seek position, it is updated as bytes are written. /// -/// The `sendfile` call copies `count` bytes from one file descriptor to another within the kernel. This can -/// be more performant than transferring data from the kernel to user space and back, such as with -/// `read` and `write` calls. +/// `flags` has different meanings per operating system; refer to the respective man pages. /// -/// The `infd` should be a file descriptor opened for reading, and `outfd` should be a file descriptor -/// opened for writing. Copying will begin at `offset`, if not null, which will be updated to reflect -/// the number of bytes read. If `offset` is null, the copying will begin at the current seek position, -/// and the file position will be updated. -pub fn sendfile(infd: fd_t, outfd: fd_t, offset: u64, count: usize, optional_hdtr: ?*const sf_hdtr, flags: u32) SendFileError!usize { - // XXX: check if offset is > length of file, return 0 bytes written - // XXX: document systems where headers are sent atomically. - // XXX: compute new offset on EINTR/EAGAIN - var rc: usize = undefined; - var err: usize = undefined; - if (builtin.os == .linux) { - while (true) { - try lseek_SET(infd, offset); - - if (optional_hdtr) |hdtr| { - try writev(outfd, hdtr.headers); +/// These systems support atomically sending everything, including headers and trailers: +/// * macOS +/// * FreeBSD +/// +/// These systems support in-kernel data copying, but headers and trailers are not sent atomically: +/// * Linux +/// +/// Other systems fall back to calling `read` / `write`. 
+/// +/// Linux has a limit on how many bytes may be transferred in one `sendfile` call, which is `0x7ffff000` +/// on both 64-bit and 32-bit systems. This is due to using a signed C int as the return value, as +/// well as stuffing the errno codes into the last `4096` values. This is cited on the `sendfile` man page. +/// The corresponding POSIX limit on this is `math.maxInt(isize)`. +pub fn sendfile( + out_fd: fd_t, + in_fd: fd_t, + in_offset: u64, + count: usize, + headers: []const iovec_const, + trailers: []const iovec_const, + flags: u32, +) SendFileError!usize { + var header_done = false; + var total_written: usize = 0; + + // Prevents EOVERFLOW. + const max_count = switch (std.Target.current.os.tag) { + .linux => 0x7ffff000, + else => math.maxInt(isize), + }; + + switch (std.Target.current.os.tag) { + .linux => sf: { + // sendfile() first appeared in Linux 2.2, glibc 2.1. + const call_sf = comptime if (builtin.link_libc) + std.c.versionCheck(.{ .major = 2, .minor = 1 }).ok + else + std.Target.current.os.version_range.linux.range.max.order(.{ .major = 2, .minor = 2 }) != .lt; + if (!call_sf) break :sf; + + if (headers.len != 0) { + const amt = try writev(out_fd, headers); + total_written += amt; + if (amt < count_iovec_bytes(headers)) return total_written; + header_done = true; } - rc = system.sendfile(outfd, infd, null, count); - err = errno(rc); + // Here we match BSD behavior, making a zero count value send as many bytes as possible. + const adjusted_count = if (count == 0) max_count else math.min(count, max_count); + + while (true) { + var offset: off_t = @bitCast(off_t, in_offset); + const rc = system.sendfile(out_fd, in_fd, &offset, adjusted_count); + switch (errno(rc)) { + 0 => { + const amt = @bitCast(usize, rc); + total_written += amt; + if (count == 0 and amt == 0) { + // We have detected EOF from `in_fd`. + break; + } else if (amt < count) { + return total_written; + } else { + break; + } + }, + + EBADF => unreachable, // Always a race condition. 
+ EFAULT => unreachable, // Segmentation fault. + EOVERFLOW => unreachable, // We avoid passing too large of a `count`. + ENOTCONN => unreachable, // `out_fd` is an unconnected socket. + + EINVAL, ENOSYS => { + // EINVAL could be any of the following situations: + // * Descriptor is not valid or locked + // * an mmap(2)-like operation is not available for in_fd + // * count is negative + // * out_fd has the O_APPEND flag set + // Because of the "mmap(2)-like operation" possibility, we fall back to doing read/write + // manually, the same as ENOSYS. + break :sf; + }, + EAGAIN => if (std.event.Loop.instance) |loop| { + loop.waitUntilFdWritable(out_fd); + continue; + } else { + return error.WouldBlock; + }, + EIO => return error.InputOutput, + EPIPE => return error.BrokenPipe, + ENOMEM => return error.SystemResources, + ENXIO => return error.Unseekable, + ESPIPE => return error.Unseekable, + else => |err| { + const discard = unexpectedErrno(err); + break :sf; + }, + } + } - if (optional_hdtr) |hdtr| { - try writev(outfd, hdtr.trailers); + if (trailers.len != 0) { + total_written += try writev(out_fd, trailers); } - switch (err) { - 0 => return @intCast(usize, rc), - else => return unexpectedErrno(err), + return total_written; + }, + .freebsd => sf: { + var hdtr_data: std.c.sf_hdtr = undefined; + var hdtr: ?*std.c.sf_hdtr = null; + if (headers.len != 0 or trailers.len != 0) { + // Here we carefully avoid `@intCast` by returning partial writes when + // too many io vectors are provided. 
+ const hdr_cnt = math.cast(u31, headers.len) catch math.maxInt(u31); + if (headers.len > hdr_cnt) return writev(out_fd, headers); + + const trl_cnt = math.cast(u31, trailers.len) catch math.maxInt(u31); + + hdtr_data = std.c.sf_hdtr{ + .headers = headers.ptr, + .hdr_cnt = hdr_cnt, + .trailers = trailers.ptr, + .trl_cnt = trl_cnt, + }; + hdtr = &hdtr_data; + } - EBADF => unreachable, - EINVAL => unreachable, - EFAULT => unreachable, - EAGAIN => if (std.event.Loop.instance) |loop| { - loop.waitUntilFdWritable(outfd); - continue; - } else { - return error.WouldBlock; - }, - EIO => return error.InputOutput, - ENOMEM => return error.SystemResources, - EOVERFLOW => return error.Overflow, - ESPIPE => return error.Unseekable, + const adjusted_count = math.min(count, max_count); + + while (true) { + var sbytes: off_t = undefined; + const err = errno(system.sendfile(out_fd, in_fd, in_offset, adjusted_count, hdtr, &sbytes, flags)); + const amt = @bitCast(usize, sbytes); + switch (err) { + 0 => return amt, + + EBADF => unreachable, // Always a race condition. + EFAULT => unreachable, // Segmentation fault. + ENOTCONN => unreachable, // `out_fd` is an unconnected socket. + + EINVAL, EOPNOTSUPP, ENOTSOCK, ENOSYS => { + // EINVAL could be any of the following situations: + // * The fd argument is not a regular file. + // * The s argument is not a SOCK_STREAM type socket. + // * The offset argument is negative. + // Because of some of these possibilities, we fall back to doing read/write + // manually, the same as ENOSYS. 
+ break :sf; + }, + + EINTR => if (amt != 0) return amt else continue, + + EAGAIN => if (amt != 0) { + return amt; + } else if (std.event.Loop.instance) |loop| { + loop.waitUntilFdWritable(out_fd); + continue; + } else { + return error.WouldBlock; + }, + + EBUSY => if (amt != 0) { + return amt; + } else if (std.event.Loop.instance) |loop| { + loop.waitUntilFdReadable(in_fd); + continue; + } else { + return error.WouldBlock; + }, + + EIO => return error.InputOutput, + ENOBUFS => return error.SystemResources, + EPIPE => return error.BrokenPipe, + + else => { + const discard = unexpectedErrno(err); + if (amt != 0) { + return amt; + } else { + break :sf; + } + }, + } } - } - } else if (builtin.os == .freebsd) { - while (true) { - var rcount: u64 = 0; - var hdtr: std.c.sf_hdtr = undefined; - if (optional_hdtr) |h| { - hdtr = std.c.sf_hdtr{ - .headers = h.headers.ptr, - .hdr_cnt = @intCast(c_int, h.headers.len), - .trailers = h.trailers.ptr, - .trl_cnt = @intCast(c_int, h.trailers.len), + }, + .macosx, .ios, .tvos, .watchos => sf: { + var hdtr_data: std.c.sf_hdtr = undefined; + var hdtr: ?*std.c.sf_hdtr = null; + if (headers.len != 0 or trailers.len != 0) { + // Here we carefully avoid `@intCast` by returning partial writes when + // too many io vectors are provided. 
+ const hdr_cnt = math.cast(u31, headers.len) catch math.maxInt(u31); + if (headers.len > hdr_cnt) return writev(out_fd, headers); + + const trl_cnt = math.cast(u31, trailers.len) catch math.maxInt(u31); + + hdtr_data = std.c.sf_hdtr{ + .headers = headers.ptr, + .hdr_cnt = hdr_cnt, + .trailers = trailers.ptr, + .trl_cnt = trl_cnt, }; + hdtr = &hdtr_data; } - err = errno(system.sendfile(infd, outfd, offset, count, &hdtr, &rcount, @intCast(c_int, flags))); - switch (err) { - 0 => return @intCast(usize, rcount), - else => return unexpectedErrno(err), - EBADF => unreachable, - EFAULT => unreachable, - EINVAL => unreachable, - ENOTCAPABLE => unreachable, - ENOTCONN => unreachable, - ENOTSOCK => unreachable, - EAGAIN => if (std.event.Loop.instance) |loop| { - loop.waitUntilFdWritable(outfd); - continue; - } else { - return error.WouldBlock; - }, - EBUSY => return error.DeviceBusy, - EINTR => continue, - EIO => return error.InputOutput, - ENOBUFS => return error.SystemResources, - EPIPE => return error.BrokenPipe, + const adjusted_count = math.min(count, max_count); + + while (true) { + var sbytes: off_t = adjusted_count; + const err = errno(system.sendfile(out_fd, in_fd, in_offset, &sbytes, hdtr, flags)); + const amt = @bitCast(usize, sbytes); + switch (err) { + 0 => return amt, + + EBADF => unreachable, // Always a race condition. + EFAULT => unreachable, // Segmentation fault. + EINVAL => unreachable, + ENOTCONN => unreachable, // `out_fd` is an unconnected socket. 
+ + ENOTSUP, ENOTSOCK, ENOSYS => break :sf, + + EINTR => if (amt != 0) return amt else continue, + + EAGAIN => if (amt != 0) { + return amt; + } else if (std.event.Loop.instance) |loop| { + loop.waitUntilFdWritable(out_fd); + continue; + } else { + return error.WouldBlock; + }, + + EIO => return error.InputOutput, + EPIPE => return error.BrokenPipe, + + else => { + _ = unexpectedErrno(err); + if (amt != 0) { + return amt; + } else { + break :sf; + } + }, + } + } + }, + else => {}, // fall back to read/write + } + + if (headers.len != 0 and !header_done) { + const amt = try writev(out_fd, headers); + total_written += amt; + if (amt < count_iovec_bytes(headers)) return total_written; + } + + rw: { + var buf: [8 * 4096]u8 = undefined; + // Here we match BSD behavior, making a zero count value send as many bytes as possible. + const adjusted_count = if (count == 0) buf.len else math.min(buf.len, count); + const amt_read = try pread(in_fd, buf[0..adjusted_count], in_offset); + if (amt_read == 0) { + if (count == 0) { + // We have detected EOF from `in_fd`. + break :rw; + } else { + return total_written; } } - } else { - @compileError("sendfile unimplemented for this target"); + const amt_written = try write(out_fd, buf[0..amt_read]); + total_written += amt_written; + if (amt_written < count or count == 0) return total_written; + } + + if (trailers.len != 0) { + total_written += try writev(out_fd, trailers); } + + return total_written; } pub const PollError = error{ |
