| author | Andrew Kelley <andrew@ziglang.org> | 2020-09-22 11:41:21 -0700 |
|---|---|---|
| committer | Andrew Kelley <andrew@ziglang.org> | 2020-09-22 11:41:21 -0700 |
| commit | e2d1f9874df2a9221aaa9ec55bd2974b70601f64 | |
| tree | eff7919b0717e193aa53b70fcee862d6f33deddb /lib/std/os/linux/bpf.zig | |
| parent | 52b8239a22aa37fe3914427cd4e2905231769e59 | |
| parent | 58ee5f4e61cd9b7a9ba65798e2214efa3753a733 | |
Merge remote-tracking branch 'origin/master' into llvm11
Diffstat (limited to 'lib/std/os/linux/bpf.zig')
| -rw-r--r-- | lib/std/os/linux/bpf.zig | 829 |
1 file changed, 761 insertions, 68 deletions
diff --git a/lib/std/os/linux/bpf.zig b/lib/std/os/linux/bpf.zig index 928c157c42..44c938feb8 100644 --- a/lib/std/os/linux/bpf.zig +++ b/lib/std/os/linux/bpf.zig @@ -3,9 +3,16 @@ // This file is part of [zig](https://ziglang.org/), which is MIT licensed. // The MIT license requires this copyright notice to be included in all copies // and substantial portions of the software. -usingnamespace std.os; +usingnamespace std.os.linux; const std = @import("../../std.zig"); +const errno = getErrno; +const unexpectedErrno = std.os.unexpectedErrno; const expectEqual = std.testing.expectEqual; +const expectError = std.testing.expectError; +const expect = std.testing.expect; + +pub const btf = @import("bpf/btf.zig"); +pub const kern = @import("bpf/kern.zig"); // instruction classes pub const LD = 0x00; @@ -62,6 +69,7 @@ pub const MAXINSNS = 4096; // instruction classes /// jmp mode in word width pub const JMP32 = 0x06; + /// alu mode in double word width pub const ALU64 = 0x07; @@ -72,14 +80,17 @@ pub const XADD = 0xc0; // alu/jmp fields /// mov reg to reg pub const MOV = 0xb0; + /// sign extending arithmetic shift right */ pub const ARSH = 0xc0; // change endianness of a register /// flags for endianness conversion: pub const END = 0xd0; + /// convert to little-endian */ pub const TO_LE = 0x00; + /// convert to big-endian pub const TO_BE = 0x08; pub const FROM_LE = TO_LE; @@ -88,29 +99,39 @@ pub const FROM_BE = TO_BE; // jmp encodings /// jump != * pub const JNE = 0x50; + /// LT is unsigned, '<' pub const JLT = 0xa0; + /// LE is unsigned, '<=' * pub const JLE = 0xb0; + /// SGT is signed '>', GT in x86 pub const JSGT = 0x60; + /// SGE is signed '>=', GE in x86 pub const JSGE = 0x70; + /// SLT is signed, '<' pub const JSLT = 0xc0; + /// SLE is signed, '<=' pub const JSLE = 0xd0; + /// function call pub const CALL = 0x80; + /// function return pub const EXIT = 0x90; /// Flag for prog_attach command. If a sub-cgroup installs some bpf program, the /// program in this cgroup yields to sub-cgroup program. pub const F_ALLOW_OVERRIDE = 0x1; + /// Flag for prog_attach command. If a sub-cgroup installs some bpf program, /// that cgroup program gets run in addition to the program in this cgroup. pub const F_ALLOW_MULTI = 0x2; + /// Flag for prog_attach command. pub const F_REPLACE = 0x4; @@ -164,47 +185,61 @@ pub const PSEUDO_CALL = 1; /// flag for BPF_MAP_UPDATE_ELEM command. create new element or update existing pub const ANY = 0; + /// flag for BPF_MAP_UPDATE_ELEM command. create new element if it didn't exist pub const NOEXIST = 1; + /// flag for BPF_MAP_UPDATE_ELEM command. update existing element pub const EXIST = 2; + /// flag for BPF_MAP_UPDATE_ELEM command. spin_lock-ed map_lookup/map_update pub const F_LOCK = 4; /// flag for BPF_MAP_CREATE command */ pub const BPF_F_NO_PREALLOC = 0x1; + /// flag for BPF_MAP_CREATE command. Instead of having one common LRU list in /// the BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list which can /// scale and perform better. Note, the LRU nodes (including free nodes) cannot /// be moved across different LRU lists. pub const BPF_F_NO_COMMON_LRU = 0x2; + /// flag for BPF_MAP_CREATE command. Specify numa node during map creation pub const BPF_F_NUMA_NODE = 0x4; + /// flag for BPF_MAP_CREATE command. Flags for BPF object read access from /// syscall side pub const BPF_F_RDONLY = 0x8; + /// flag for BPF_MAP_CREATE command. Flags for BPF object write access from /// syscall side pub const BPF_F_WRONLY = 0x10; + /// flag for BPF_MAP_CREATE command. 
Flag for stack_map, store build_id+offset /// instead of pointer pub const BPF_F_STACK_BUILD_ID = 0x20; + /// flag for BPF_MAP_CREATE command. Zero-initialize hash function seed. This /// should only be used for testing. pub const BPF_F_ZERO_SEED = 0x40; + /// flag for BPF_MAP_CREATE command Flags for accessing BPF object from program /// side. pub const BPF_F_RDONLY_PROG = 0x80; + /// flag for BPF_MAP_CREATE command. Flags for accessing BPF object from program /// side. pub const BPF_F_WRONLY_PROG = 0x100; + /// flag for BPF_MAP_CREATE command. Clone map from listener for newly accepted /// socket pub const BPF_F_CLONE = 0x200; + /// flag for BPF_MAP_CREATE command. Enable memory-mapping BPF map pub const BPF_F_MMAPABLE = 0x400; -/// These values correspond to "syscalls" within the BPF program's environment +/// These values correspond to "syscalls" within the BPF program's environment, +/// each one is documented in std.os.linux.BPF.kern pub const Helper = enum(i32) { unspec, map_lookup_elem, @@ -325,9 +360,34 @@ pub const Helper = enum(i32) { tcp_send_ack, send_signal_thread, jiffies64, + read_branch_records, + get_ns_current_pid_tgid, + xdp_output, + get_netns_cookie, + get_current_ancestor_cgroup_id, + sk_assign, + ktime_get_boot_ns, + seq_printf, + seq_write, + sk_cgroup_id, + sk_ancestor_cgroup_id, + ringbuf_output, + ringbuf_reserve, + ringbuf_submit, + ringbuf_discard, + ringbuf_query, + csum_level, + skc_to_tcp6_sock, + skc_to_tcp_sock, + skc_to_tcp_timewait_sock, + skc_to_tcp_request_sock, + skc_to_udp6_sock, + get_task_stack, _, }; +// TODO: determine that this is the expected bit layout for both little and big +// endian systems /// a single BPF instruction pub const Insn = packed struct { code: u8, @@ -340,19 +400,30 @@ pub const Insn = packed struct { /// frame pub const Reg = packed enum(u4) { r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 }; const Source = packed enum(u1) { reg, imm }; + + const Mode = packed enum(u8) { + imm = IMM, + abs = ABS, + ind = IND, + mem = MEM, + len = LEN, + msh = MSH, + }; + const AluOp = packed enum(u8) { add = ADD, sub = SUB, mul = MUL, div = DIV, - op_or = OR, - op_and = AND, + alu_or = OR, + alu_and = AND, lsh = LSH, rsh = RSH, neg = NEG, mod = MOD, xor = XOR, mov = MOV, + arsh = ARSH, }; pub const Size = packed enum(u8) { @@ -368,6 +439,13 @@ pub const Insn = packed struct { jgt = JGT, jge = JGE, jset = JSET, + jlt = JLT, + jle = JLE, + jne = JNE, + jsgt = JSGT, + jsge = JSGE, + jslt = JSLT, + jsle = JSLE, }; const ImmOrReg = union(Source) { @@ -419,22 +497,100 @@ pub const Insn = packed struct { return alu(64, .add, dst, src); } + pub fn sub(dst: Reg, src: anytype) Insn { + return alu(64, .sub, dst, src); + } + + pub fn mul(dst: Reg, src: anytype) Insn { + return alu(64, .mul, dst, src); + } + + pub fn div(dst: Reg, src: anytype) Insn { + return alu(64, .div, dst, src); + } + + pub fn alu_or(dst: Reg, src: anytype) Insn { + return alu(64, .alu_or, dst, src); + } + + pub fn alu_and(dst: Reg, src: anytype) Insn { + return alu(64, .alu_and, dst, src); + } + + pub fn lsh(dst: Reg, src: anytype) Insn { + return alu(64, .lsh, dst, src); + } + + pub fn rsh(dst: Reg, src: anytype) Insn { + return alu(64, .rsh, dst, src); + } + + pub fn neg(dst: Reg) Insn { + return alu(64, .neg, dst, 0); + } + + pub fn mod(dst: Reg, src: anytype) Insn { + return alu(64, .mod, dst, src); + } + + pub fn xor(dst: Reg, src: anytype) Insn { + return alu(64, .xor, dst, src); + } + + pub fn arsh(dst: Reg, src: anytype) Insn { + return alu(64, .arsh, dst, src); + } + fn 
jmp(op: JmpOp, dst: Reg, src: anytype, off: i16) Insn { return imm_reg(JMP | @enumToInt(op), dst, src, off); } + pub fn ja(off: i16) Insn { + return jmp(.ja, .r0, 0, off); + } + pub fn jeq(dst: Reg, src: anytype, off: i16) Insn { return jmp(.jeq, dst, src, off); } - pub fn stx_mem(size: Size, dst: Reg, src: Reg, off: i16) Insn { - return Insn{ - .code = STX | @enumToInt(size) | MEM, - .dst = @enumToInt(dst), - .src = @enumToInt(src), - .off = off, - .imm = 0, - }; + pub fn jgt(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jgt, dst, src, off); + } + + pub fn jge(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jge, dst, src, off); + } + + pub fn jlt(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jlt, dst, src, off); + } + + pub fn jle(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jle, dst, src, off); + } + + pub fn jset(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jset, dst, src, off); + } + + pub fn jne(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jne, dst, src, off); + } + + pub fn jsgt(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jsgt, dst, src, off); + } + + pub fn jsge(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jsge, dst, src, off); + } + + pub fn jslt(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jslt, dst, src, off); + } + + pub fn jsle(dst: Reg, src: anytype, off: i16) Insn { + return jmp(.jsle, dst, src, off); } pub fn xadd(dst: Reg, src: Reg) Insn { @@ -447,17 +603,34 @@ pub const Insn = packed struct { }; } - /// direct packet access, R0 = *(uint *)(skb->data + imm32) - pub fn ld_abs(size: Size, imm: i32) Insn { + fn ld(mode: Mode, size: Size, dst: Reg, src: Reg, imm: i32) Insn { return Insn{ - .code = LD | @enumToInt(size) | ABS, - .dst = 0, - .src = 0, + .code = @enumToInt(mode) | @enumToInt(size) | LD, + .dst = @enumToInt(dst), + .src = @enumToInt(src), .off = 0, .imm = imm, }; } + pub fn ld_abs(size: Size, dst: Reg, src: Reg, imm: i32) Insn { + return ld(.abs, size, dst, src, imm); + } + + pub fn ld_ind(size: Size, dst: Reg, src: Reg, imm: i32) Insn { + return ld(.ind, size, dst, src, imm); + } + + pub fn ldx(size: Size, dst: Reg, src: Reg, off: i16) Insn { + return Insn{ + .code = MEM | @enumToInt(size) | LDX, + .dst = @enumToInt(dst), + .src = @enumToInt(src), + .off = off, + .imm = 0, + }; + } + fn ld_imm_impl1(dst: Reg, src: Reg, imm: u64) Insn { return Insn{ .code = LD | DW | IMM, @@ -478,6 +651,14 @@ pub const Insn = packed struct { }; } + pub fn ld_dw1(dst: Reg, imm: u64) Insn { + return ld_imm_impl1(dst, .r0, imm); + } + + pub fn ld_dw2(imm: u64) Insn { + return ld_imm_impl2(imm); + } + pub fn ld_map_fd1(dst: Reg, map_fd: fd_t) Insn { return ld_imm_impl1(dst, @intToEnum(Reg, PSEUDO_MAP_FD), @intCast(u64, map_fd)); } @@ -486,6 +667,53 @@ pub const Insn = packed struct { return ld_imm_impl2(@intCast(u64, map_fd)); } + pub fn st(comptime size: Size, dst: Reg, off: i16, imm: i32) Insn { + if (size == .double_word) @compileError("TODO: need to determine how to correctly handle double words"); + return Insn{ + .code = MEM | @enumToInt(size) | ST, + .dst = @enumToInt(dst), + .src = 0, + .off = off, + .imm = imm, + }; + } + + pub fn stx(size: Size, dst: Reg, off: i16, src: Reg) Insn { + return Insn{ + .code = MEM | @enumToInt(size) | STX, + .dst = @enumToInt(dst), + .src = @enumToInt(src), + .off = off, + .imm = 0, + }; + } + + fn endian_swap(endian: std.builtin.Endian, comptime size: Size, dst: Reg) Insn { + return Insn{ + .code = switch (endian) { + .Big => 0xdc, + .Little => 0xd4, + }, + 
.dst = @enumToInt(dst), + .src = 0, + .off = 0, + .imm = switch (size) { + .byte => @compileError("can't swap a single byte"), + .half_word => 16, + .word => 32, + .double_word => 64, + }, + }; + } + + pub fn le(comptime size: Size, dst: Reg) Insn { + return endian_swap(.Little, size, dst); + } + + pub fn be(comptime size: Size, dst: Reg) Insn { + return endian_swap(.Big, size, dst); + } + pub fn call(helper: Helper) Insn { return Insn{ .code = JMP | CALL, @@ -508,95 +736,242 @@ pub const Insn = packed struct { } }; -fn expect_insn(insn: Insn, val: u64) void { - expectEqual(@bitCast(u64, insn), val); -} - test "insn bitsize" { expectEqual(@bitSizeOf(Insn), 64); } -// mov instructions -test "mov imm" { - expect_insn(Insn.mov(.r1, 1), 0x00000001000001b7); -} - -test "mov reg" { - expect_insn(Insn.mov(.r6, .r1), 0x00000000000016bf); -} - -// alu instructions -test "add imm" { - expect_insn(Insn.add(.r2, -4), 0xfffffffc00000207); +fn expect_opcode(code: u8, insn: Insn) void { + expectEqual(code, insn.code); } -// ld instructions -test "ld_abs" { - expect_insn(Insn.ld_abs(.byte, 42), 0x0000002a00000030); -} - -test "ld_map_fd" { - expect_insn(Insn.ld_map_fd1(.r1, 42), 0x0000002a00001118); - expect_insn(Insn.ld_map_fd2(42), 0x0000000000000000); -} - -// st instructions -test "stx_mem" { - expect_insn(Insn.stx_mem(.word, .r10, .r0, -4), 0x00000000fffc0a63); -} - -test "xadd" { - expect_insn(Insn.xadd(.r0, .r1), 0x00000000000010db); -} - -// jmp instructions -test "jeq imm" { - expect_insn(Insn.jeq(.r0, 0, 2), 0x0000000000020015); -} - -// other instructions -test "call" { - expect_insn(Insn.call(.map_lookup_elem), 0x0000000100000085); -} - -test "exit" { - expect_insn(Insn.exit(), 0x0000000000000095); +// The opcodes were grabbed from https://github.com/iovisor/bpf-docs/blob/master/eBPF.md +test "opcodes" { + // instructions that have a name that end with 1 or 2 are consecutive for + // loading 64-bit immediates (imm is only 32 bits wide) + + // alu instructions + expect_opcode(0x07, Insn.add(.r1, 0)); + expect_opcode(0x0f, Insn.add(.r1, .r2)); + expect_opcode(0x17, Insn.sub(.r1, 0)); + expect_opcode(0x1f, Insn.sub(.r1, .r2)); + expect_opcode(0x27, Insn.mul(.r1, 0)); + expect_opcode(0x2f, Insn.mul(.r1, .r2)); + expect_opcode(0x37, Insn.div(.r1, 0)); + expect_opcode(0x3f, Insn.div(.r1, .r2)); + expect_opcode(0x47, Insn.alu_or(.r1, 0)); + expect_opcode(0x4f, Insn.alu_or(.r1, .r2)); + expect_opcode(0x57, Insn.alu_and(.r1, 0)); + expect_opcode(0x5f, Insn.alu_and(.r1, .r2)); + expect_opcode(0x67, Insn.lsh(.r1, 0)); + expect_opcode(0x6f, Insn.lsh(.r1, .r2)); + expect_opcode(0x77, Insn.rsh(.r1, 0)); + expect_opcode(0x7f, Insn.rsh(.r1, .r2)); + expect_opcode(0x87, Insn.neg(.r1)); + expect_opcode(0x97, Insn.mod(.r1, 0)); + expect_opcode(0x9f, Insn.mod(.r1, .r2)); + expect_opcode(0xa7, Insn.xor(.r1, 0)); + expect_opcode(0xaf, Insn.xor(.r1, .r2)); + expect_opcode(0xb7, Insn.mov(.r1, 0)); + expect_opcode(0xbf, Insn.mov(.r1, .r2)); + expect_opcode(0xc7, Insn.arsh(.r1, 0)); + expect_opcode(0xcf, Insn.arsh(.r1, .r2)); + + // atomic instructions: might be more of these not documented in the wild + expect_opcode(0xdb, Insn.xadd(.r1, .r2)); + + // TODO: byteswap instructions + expect_opcode(0xd4, Insn.le(.half_word, .r1)); + expectEqual(@intCast(i32, 16), Insn.le(.half_word, .r1).imm); + expect_opcode(0xd4, Insn.le(.word, .r1)); + expectEqual(@intCast(i32, 32), Insn.le(.word, .r1).imm); + expect_opcode(0xd4, Insn.le(.double_word, .r1)); + expectEqual(@intCast(i32, 64), Insn.le(.double_word, .r1).imm); + 
expect_opcode(0xdc, Insn.be(.half_word, .r1)); + expectEqual(@intCast(i32, 16), Insn.be(.half_word, .r1).imm); + expect_opcode(0xdc, Insn.be(.word, .r1)); + expectEqual(@intCast(i32, 32), Insn.be(.word, .r1).imm); + expect_opcode(0xdc, Insn.be(.double_word, .r1)); + expectEqual(@intCast(i32, 64), Insn.be(.double_word, .r1).imm); + + // memory instructions + expect_opcode(0x18, Insn.ld_dw1(.r1, 0)); + expect_opcode(0x00, Insn.ld_dw2(0)); + + // loading a map fd + expect_opcode(0x18, Insn.ld_map_fd1(.r1, 0)); + expectEqual(@intCast(u4, PSEUDO_MAP_FD), Insn.ld_map_fd1(.r1, 0).src); + expect_opcode(0x00, Insn.ld_map_fd2(0)); + + expect_opcode(0x38, Insn.ld_abs(.double_word, .r1, .r2, 0)); + expect_opcode(0x20, Insn.ld_abs(.word, .r1, .r2, 0)); + expect_opcode(0x28, Insn.ld_abs(.half_word, .r1, .r2, 0)); + expect_opcode(0x30, Insn.ld_abs(.byte, .r1, .r2, 0)); + + expect_opcode(0x58, Insn.ld_ind(.double_word, .r1, .r2, 0)); + expect_opcode(0x40, Insn.ld_ind(.word, .r1, .r2, 0)); + expect_opcode(0x48, Insn.ld_ind(.half_word, .r1, .r2, 0)); + expect_opcode(0x50, Insn.ld_ind(.byte, .r1, .r2, 0)); + + expect_opcode(0x79, Insn.ldx(.double_word, .r1, .r2, 0)); + expect_opcode(0x61, Insn.ldx(.word, .r1, .r2, 0)); + expect_opcode(0x69, Insn.ldx(.half_word, .r1, .r2, 0)); + expect_opcode(0x71, Insn.ldx(.byte, .r1, .r2, 0)); + + expect_opcode(0x62, Insn.st(.word, .r1, 0, 0)); + expect_opcode(0x6a, Insn.st(.half_word, .r1, 0, 0)); + expect_opcode(0x72, Insn.st(.byte, .r1, 0, 0)); + + expect_opcode(0x63, Insn.stx(.word, .r1, 0, .r2)); + expect_opcode(0x6b, Insn.stx(.half_word, .r1, 0, .r2)); + expect_opcode(0x73, Insn.stx(.byte, .r1, 0, .r2)); + expect_opcode(0x7b, Insn.stx(.double_word, .r1, 0, .r2)); + + // branch instructions + expect_opcode(0x05, Insn.ja(0)); + expect_opcode(0x15, Insn.jeq(.r1, 0, 0)); + expect_opcode(0x1d, Insn.jeq(.r1, .r2, 0)); + expect_opcode(0x25, Insn.jgt(.r1, 0, 0)); + expect_opcode(0x2d, Insn.jgt(.r1, .r2, 0)); + expect_opcode(0x35, Insn.jge(.r1, 0, 0)); + expect_opcode(0x3d, Insn.jge(.r1, .r2, 0)); + expect_opcode(0xa5, Insn.jlt(.r1, 0, 0)); + expect_opcode(0xad, Insn.jlt(.r1, .r2, 0)); + expect_opcode(0xb5, Insn.jle(.r1, 0, 0)); + expect_opcode(0xbd, Insn.jle(.r1, .r2, 0)); + expect_opcode(0x45, Insn.jset(.r1, 0, 0)); + expect_opcode(0x4d, Insn.jset(.r1, .r2, 0)); + expect_opcode(0x55, Insn.jne(.r1, 0, 0)); + expect_opcode(0x5d, Insn.jne(.r1, .r2, 0)); + expect_opcode(0x65, Insn.jsgt(.r1, 0, 0)); + expect_opcode(0x6d, Insn.jsgt(.r1, .r2, 0)); + expect_opcode(0x75, Insn.jsge(.r1, 0, 0)); + expect_opcode(0x7d, Insn.jsge(.r1, .r2, 0)); + expect_opcode(0xc5, Insn.jslt(.r1, 0, 0)); + expect_opcode(0xcd, Insn.jslt(.r1, .r2, 0)); + expect_opcode(0xd5, Insn.jsle(.r1, 0, 0)); + expect_opcode(0xdd, Insn.jsle(.r1, .r2, 0)); + expect_opcode(0x85, Insn.call(.unspec)); + expect_opcode(0x95, Insn.exit()); } pub const Cmd = extern enum(usize) { + /// Create a map and return a file descriptor that refers to the map. The + /// close-on-exec file descriptor flag is automatically enabled for the new + /// file descriptor. + /// + /// uses MapCreateAttr map_create, + + /// Look up an element by key in a specified map and return its value. + /// + /// uses MapElemAttr map_lookup_elem, + + /// Create or update an element (key/value pair) in a specified map. + /// + /// uses MapElemAttr map_update_elem, + + /// Look up and delete an element by key in a specified map. 
+ /// + /// uses MapElemAttr map_delete_elem, + + /// Look up an element by key in a specified map and return the key of the + /// next element. map_get_next_key, + + /// Verify and load an eBPF program, returning a new file descriptor + /// associated with the program. The close-on-exec file descriptor flag + /// is automatically enabled for the new file descriptor. + /// + /// uses ProgLoadAttr prog_load, + + /// Pin a map or eBPF program to a path within the minimal BPF filesystem + /// + /// uses ObjAttr obj_pin, + + /// Get the file descriptor of a BPF object pinned to a certain path + /// + /// uses ObjAttr obj_get, + + /// uses ProgAttachAttr prog_attach, + + /// uses ProgAttachAttr prog_detach, + + /// uses TestRunAttr prog_test_run, + + /// uses GetIdAttr prog_get_next_id, + + /// uses GetIdAttr map_get_next_id, + + /// uses GetIdAttr prog_get_fd_by_id, + + /// uses GetIdAttr map_get_fd_by_id, + + /// uses InfoAttr obj_get_info_by_fd, + + /// uses QueryAttr prog_query, + + /// uses RawTracepointAttr raw_tracepoint_open, + + /// uses BtfLoadAttr btf_load, + + /// uses GetIdAttr btf_get_fd_by_id, + + /// uses TaskFdQueryAttr task_fd_query, + + /// uses MapElemAttr map_lookup_and_delete_elem, map_freeze, + + /// uses GetIdAttr btf_get_next_id, + + /// uses MapBatchAttr map_lookup_batch, + + /// uses MapBatchAttr map_lookup_and_delete_batch, + + /// uses MapBatchAttr map_update_batch, + + /// uses MapBatchAttr map_delete_batch, + + /// uses LinkCreateAttr link_create, + + /// uses LinkUpdateAttr link_update, + + /// uses GetIdAttr link_get_fd_by_id, + + /// uses GetIdAttr link_get_next_id, + + /// uses EnableStatsAttr enable_stats, + + /// uses IterCreateAttr iter_create, link_detach, _, @@ -630,42 +1005,138 @@ pub const MapType = extern enum(u32) { sk_storage, devmap_hash, struct_ops, + + /// An ordered and shared CPU version of perf_event_array. They have + /// similar semantics: + /// - variable length records + /// - no blocking: when full, reservation fails + /// - memory mappable for ease and speed + /// - epoll notifications for new data, but can busy poll + /// + /// Ringbufs give BPF programs two sets of APIs: + /// - ringbuf_output() allows copy data from one place to a ring + /// buffer, similar to bpf_perf_event_output() + /// - ringbuf_reserve()/ringbuf_commit()/ringbuf_discard() split the + /// process into two steps. First a fixed amount of space is reserved, + /// if that is successful then the program gets a pointer to a chunk of + /// memory and can be submitted with commit() or discarded with + /// discard() + /// + /// ringbuf_output() will incurr an extra memory copy, but allows to submit + /// records of the length that's not known beforehand, and is an easy + /// replacement for perf_event_outptu(). + /// + /// ringbuf_reserve() avoids the extra memory copy but requires a known size + /// of memory beforehand. 
+ /// + /// ringbuf_query() allows to query properties of the map, 4 are currently + /// supported: + /// - BPF_RB_AVAIL_DATA: amount of unconsumed data in ringbuf + /// - BPF_RB_RING_SIZE: returns size of ringbuf + /// - BPF_RB_CONS_POS/BPF_RB_PROD_POS returns current logical position + /// of consumer and producer respectively + /// + /// key size: 0 + /// value size: 0 + /// max entries: size of ringbuf, must be power of 2 ringbuf, + _, }; pub const ProgType = extern enum(u32) { unspec, + + /// context type: __sk_buff socket_filter, + + /// context type: bpf_user_pt_regs_t kprobe, + + /// context type: __sk_buff sched_cls, + + /// context type: __sk_buff sched_act, + + /// context type: u64 tracepoint, + + /// context type: xdp_md xdp, + + /// context type: bpf_perf_event_data perf_event, + + /// context type: __sk_buff cgroup_skb, + + /// context type: bpf_sock cgroup_sock, + + /// context type: __sk_buff lwt_in, + + /// context type: __sk_buff lwt_out, + + /// context type: __sk_buff lwt_xmit, + + /// context type: bpf_sock_ops sock_ops, + + /// context type: __sk_buff sk_skb, + + /// context type: bpf_cgroup_dev_ctx cgroup_device, + + /// context type: sk_msg_md sk_msg, + + /// context type: bpf_raw_tracepoint_args raw_tracepoint, + + /// context type: bpf_sock_addr cgroup_sock_addr, + + /// context type: __sk_buff lwt_seg6local, + + /// context type: u32 lirc_mode2, + + /// context type: sk_reuseport_md sk_reuseport, + + /// context type: __sk_buff flow_dissector, + + /// context type: bpf_sysctl cgroup_sysctl, + + /// context type: bpf_raw_tracepoint_args raw_tracepoint_writable, + + /// context type: bpf_sockopt cgroup_sockopt, + + /// context type: void * tracing, + + /// context type: void * struct_ops, + + /// context type: void * ext, + + /// context type: void * lsm, + + /// context type: bpf_sk_lookup sk_lookup, + _, }; pub const AttachType = extern enum(u32) { @@ -715,27 +1186,38 @@ const obj_name_len = 16; pub const MapCreateAttr = extern struct { /// one of MapType map_type: u32, + /// size of key in bytes key_size: u32, + /// size of value in bytes value_size: u32, + /// max number of entries in a map max_entries: u32, + /// .map_create related flags map_flags: u32, + /// fd pointing to the inner map inner_map_fd: fd_t, + /// numa node (effective only if MapCreateFlags.numa_node is set) numa_node: u32, map_name: [obj_name_len]u8, + /// ifindex of netdev to create on map_ifindex: u32, + /// fd pointing to a BTF type data btf_fd: fd_t, + /// BTF type_id of the key btf_key_type_id: u32, + /// BTF type_id of the value bpf_value_type_id: u32, + /// BTF type_id of a kernel struct stored as the map value btf_vmlinux_value_type_id: u32, }; @@ -755,10 +1237,12 @@ pub const MapElemAttr = extern struct { pub const MapBatchAttr = extern struct { /// start batch, NULL to start from beginning in_batch: u64, + /// output: next start batch out_batch: u64, keys: u64, values: u64, + /// input/output: /// input: # of key/value elements /// output: # of filled elements @@ -775,35 +1259,49 @@ pub const ProgLoadAttr = extern struct { insn_cnt: u32, insns: u64, license: u64, + /// verbosity level of verifier log_level: u32, + /// size of user buffer log_size: u32, + /// user supplied buffer log_buf: u64, + /// not used kern_version: u32, prog_flags: u32, prog_name: [obj_name_len]u8, - /// ifindex of netdev to prep for. For some prog types expected attach - /// type must be known at load time to verify attach type specific parts - /// of prog (context accesses, allowed helpers, etc). 
+ + /// ifindex of netdev to prep for. prog_ifindex: u32, + + /// For some prog types expected attach type must be known at load time to + /// verify attach type specific parts of prog (context accesses, allowed + /// helpers, etc). expected_attach_type: u32, + /// fd pointing to BTF type data prog_btf_fd: fd_t, + /// userspace bpf_func_info size func_info_rec_size: u32, func_info: u64, + /// number of bpf_func_info records func_info_cnt: u32, + /// userspace bpf_line_info size line_info_rec_size: u32, line_info: u64, + /// number of bpf_line_info records line_info_cnt: u32, + /// in-kernel BTF type id to attach to attact_btf_id: u32, + /// 0 to attach to vmlinux attach_prog_id: u32, }; @@ -819,29 +1317,36 @@ pub const ObjAttr = extern struct { pub const ProgAttachAttr = extern struct { /// container object to attach to target_fd: fd_t, + /// eBPF program to attach attach_bpf_fd: fd_t, + attach_type: u32, attach_flags: u32, + // TODO: BPF_F_REPLACE flags /// previously attached eBPF program to replace if .replace is used replace_bpf_fd: fd_t, }; /// struct used by Cmd.prog_test_run command -pub const TestAttr = extern struct { +pub const TestRunAttr = extern struct { prog_fd: fd_t, retval: u32, + /// input: len of data_in data_size_in: u32, + /// input/output: len of data_out. returns ENOSPC if data_out is too small. data_size_out: u32, data_in: u64, data_out: u64, repeat: u32, duration: u32, + /// input: len of ctx_in ctx_size_in: u32, + /// input/output: len of ctx_out. returns ENOSPC if ctx_out is too small. ctx_size_out: u32, ctx_in: u64, @@ -894,26 +1399,35 @@ pub const BtfLoadAttr = extern struct { btf_log_level: u32, }; +/// struct used by Cmd.task_fd_query pub const TaskFdQueryAttr = extern struct { /// input: pid pid: pid_t, + /// input: fd fd: fd_t, + /// input: flags flags: u32, + /// input/output: buf len buf_len: u32, + /// input/output: /// tp_name for tracepoint /// symbol for kprobe /// filename for uprobe buf: u64, + /// output: prod_id prog_id: u32, + /// output: BPF_FD_TYPE fd_type: u32, + /// output: probe_offset probe_offset: u64, + /// output: probe_addr probe_addr: u64, }; @@ -922,9 +1436,11 @@ pub const TaskFdQueryAttr = extern struct { pub const LinkCreateAttr = extern struct { /// eBPF program to attach prog_fd: fd_t, + /// object to attach to target_fd: fd_t, attach_type: u32, + /// extra flags flags: u32, }; @@ -932,10 +1448,13 @@ pub const LinkCreateAttr = extern struct { /// struct used by Cmd.link_update command pub const LinkUpdateAttr = extern struct { link_fd: fd_t, + /// new program to update link with new_prog_fd: fd_t, + /// extra flags flags: u32, + /// expected link's program fd, it is specified only if BPF_F_REPLACE is /// set in flags old_prog_fd: fd_t, @@ -952,6 +1471,7 @@ pub const IterCreateAttr = extern struct { flags: u32, }; +/// Mega struct that is passed to the bpf() syscall pub const Attr = extern union { map_create: MapCreateAttr, map_elem: MapElemAttr, @@ -971,3 +1491,176 @@ pub const Attr = extern union { enable_stats: EnableStatsAttr, iter_create: IterCreateAttr, }; + +pub const Log = struct { + level: u32, + buf: []u8, +}; + +pub fn map_create(map_type: MapType, key_size: u32, value_size: u32, max_entries: u32) !fd_t { + var attr = Attr{ + .map_create = std.mem.zeroes(MapCreateAttr), + }; + + attr.map_create.map_type = @enumToInt(map_type); + attr.map_create.key_size = key_size; + attr.map_create.value_size = value_size; + attr.map_create.max_entries = max_entries; + + const rc = bpf(.map_create, &attr, @sizeOf(MapCreateAttr)); + 
return switch (errno(rc)) { + 0 => @intCast(fd_t, rc), + EINVAL => error.MapTypeOrAttrInvalid, + ENOMEM => error.SystemResources, + EPERM => error.AccessDenied, + else => |err| unexpectedErrno(rc), + }; +} + +test "map_create" { + const map = try map_create(.hash, 4, 4, 32); + defer std.os.close(map); +} + +pub fn map_lookup_elem(fd: fd_t, key: []const u8, value: []u8) !void { + var attr = Attr{ + .map_elem = std.mem.zeroes(MapElemAttr), + }; + + attr.map_elem.map_fd = fd; + attr.map_elem.key = @ptrToInt(key.ptr); + attr.map_elem.result.value = @ptrToInt(value.ptr); + + const rc = bpf(.map_lookup_elem, &attr, @sizeOf(MapElemAttr)); + switch (errno(rc)) { + 0 => return, + EBADF => return error.BadFd, + EFAULT => unreachable, + EINVAL => return error.FieldInAttrNeedsZeroing, + ENOENT => return error.NotFound, + EPERM => return error.AccessDenied, + else => |err| return unexpectedErrno(rc), + } +} + +pub fn map_update_elem(fd: fd_t, key: []const u8, value: []const u8, flags: u64) !void { + var attr = Attr{ + .map_elem = std.mem.zeroes(MapElemAttr), + }; + + attr.map_elem.map_fd = fd; + attr.map_elem.key = @ptrToInt(key.ptr); + attr.map_elem.result = .{ .value = @ptrToInt(value.ptr) }; + attr.map_elem.flags = flags; + + const rc = bpf(.map_update_elem, &attr, @sizeOf(MapElemAttr)); + switch (errno(rc)) { + 0 => return, + E2BIG => return error.ReachedMaxEntries, + EBADF => return error.BadFd, + EFAULT => unreachable, + EINVAL => return error.FieldInAttrNeedsZeroing, + ENOMEM => return error.SystemResources, + EPERM => return error.AccessDenied, + else => |err| return unexpectedErrno(err), + } +} + +pub fn map_delete_elem(fd: fd_t, key: []const u8) !void { + var attr = Attr{ + .map_elem = std.mem.zeroes(MapElemAttr), + }; + + attr.map_elem.map_fd = fd; + attr.map_elem.key = @ptrToInt(key.ptr); + + const rc = bpf(.map_delete_elem, &attr, @sizeOf(MapElemAttr)); + switch (errno(rc)) { + 0 => return, + EBADF => return error.BadFd, + EFAULT => unreachable, + EINVAL => return error.FieldInAttrNeedsZeroing, + ENOENT => return error.NotFound, + EPERM => return error.AccessDenied, + else => |err| return unexpectedErrno(err), + } +} + +test "map lookup, update, and delete" { + const key_size = 4; + const value_size = 4; + const map = try map_create(.hash, key_size, value_size, 1); + defer std.os.close(map); + + const key = std.mem.zeroes([key_size]u8); + var value = std.mem.zeroes([value_size]u8); + + // fails looking up value that doesn't exist + expectError(error.NotFound, map_lookup_elem(map, &key, &value)); + + // succeed at updating and looking up element + try map_update_elem(map, &key, &value, 0); + try map_lookup_elem(map, &key, &value); + + // fails inserting more than max entries + const second_key = [key_size]u8{ 0, 0, 0, 1 }; + expectError(error.ReachedMaxEntries, map_update_elem(map, &second_key, &value, 0)); + + // succeed at deleting an existing elem + try map_delete_elem(map, &key); + expectError(error.NotFound, map_lookup_elem(map, &key, &value)); + + // fail at deleting a non-existing elem + expectError(error.NotFound, map_delete_elem(map, &key)); +} + +pub fn prog_load( + prog_type: ProgType, + insns: []const Insn, + log: ?*Log, + license: []const u8, + kern_version: u32, +) !fd_t { + var attr = Attr{ + .prog_load = std.mem.zeroes(ProgLoadAttr), + }; + + attr.prog_load.prog_type = @enumToInt(prog_type); + attr.prog_load.insns = @ptrToInt(insns.ptr); + attr.prog_load.insn_cnt = @intCast(u32, insns.len); + attr.prog_load.license = @ptrToInt(license.ptr); + attr.prog_load.kern_version = 
kern_version; + + if (log) |l| { + attr.prog_load.log_buf = @ptrToInt(l.buf.ptr); + attr.prog_load.log_size = @intCast(u32, l.buf.len); + attr.prog_load.log_level = l.level; + } + + const rc = bpf(.prog_load, &attr, @sizeOf(ProgLoadAttr)); + return switch (errno(rc)) { + 0 => @intCast(fd_t, rc), + EACCES => error.UnsafeProgram, + EFAULT => unreachable, + EINVAL => error.InvalidProgram, + EPERM => error.AccessDenied, + else => |err| unexpectedErrno(err), + }; +} + +test "prog_load" { + // this should fail because it does not set r0 before exiting + const bad_prog = [_]Insn{ + Insn.exit(), + }; + + const good_prog = [_]Insn{ + Insn.mov(.r0, 0), + Insn.exit(), + }; + + const prog = try prog_load(.socket_filter, &good_prog, null, "MIT", 0); + defer std.os.close(prog); + + expectError(error.UnsafeProgram, prog_load(.socket_filter, &bad_prog, null, "MIT", 0)); +} |
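Putting the new pieces together: the sketch below shows how the instruction builders and the `map_create`/`prog_load` wrappers added in this diff compose from user code. It is only a sketch; it assumes the file is reachable as `std.os.linux.BPF` (as the `Helper` doc comment above implies) and that the caller has sufficient privileges (CAP_SYS_ADMIN or, on newer kernels, CAP_BPF), otherwise the syscalls return `error.AccessDenied`.

```zig
const std = @import("std");
const BPF = std.os.linux.BPF; // assumed import path, per the Helper doc comment
const Insn = BPF.Insn;

test "assemble and load a trivial BPF program" {
    // A hash map with 4-byte keys and values and 32 entries, as in the
    // map_create test added by this diff.
    const map = try BPF.map_create(.hash, 4, 4, 32);
    defer std.os.close(map);

    // "return 0": r0 must be written before exit, or the verifier rejects
    // the program with error.UnsafeProgram (see the prog_load test above).
    const prog = [_]Insn{
        Insn.mov(.r0, 0),
        Insn.exit(),
    };

    // Load it as a socket filter; passing null skips the verifier log.
    const fd = try BPF.prog_load(.socket_filter, &prog, null, "MIT", 0);
    defer std.os.close(fd);
}
```

Instead of `null`, a pointer to the `Log` struct added in this diff can be passed to `prog_load` to have the verifier write its log into a caller-supplied buffer at the chosen verbosity level.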
