diff options
| author | Matthew Lugg <mlugg@mlugg.co.uk> | 2025-11-20 17:48:35 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-11-20 17:48:35 +0000 |
| commit | 8a73fc8d8ee0065e4d07e473ce6ab095cebfaece (patch) | |
| tree | 36bed2955666bdfce7b96e2f33c9409bd26f03c1 /lib/std/debug/MachOFile.zig | |
| parent | a9568ed2963298864e5c9a92a3eafe81771128ff (diff) | |
| parent | a87b5332319b20347916f77e57253e2df4b2a3af (diff) | |
| download | zig-8a73fc8d8ee0065e4d07e473ce6ab095cebfaece.tar.gz zig-8a73fc8d8ee0065e4d07e473ce6ab095cebfaece.zip | |
Merge pull request #25981 from mlugg/macos-fuzz-2
make the fuzzer vaguely work on macOS
Diffstat (limited to 'lib/std/debug/MachOFile.zig')
| -rw-r--r-- | lib/std/debug/MachOFile.zig | 548 |
1 files changed, 548 insertions, 0 deletions
diff --git a/lib/std/debug/MachOFile.zig b/lib/std/debug/MachOFile.zig new file mode 100644 index 0000000000..3be1b1daff --- /dev/null +++ b/lib/std/debug/MachOFile.zig @@ -0,0 +1,548 @@ +mapped_memory: []align(std.heap.page_size_min) const u8, +symbols: []const Symbol, +strings: []const u8, +text_vmaddr: u64, + +/// Key is index into `strings` of the file path. +ofiles: std.AutoArrayHashMapUnmanaged(u32, Error!OFile), + +pub const Error = error{ + InvalidMachO, + InvalidDwarf, + MissingDebugInfo, + UnsupportedDebugInfo, + ReadFailed, + OutOfMemory, +}; + +pub fn deinit(mf: *MachOFile, gpa: Allocator) void { + for (mf.ofiles.values()) |*maybe_of| { + const of = &(maybe_of.* catch continue); + posix.munmap(of.mapped_memory); + of.dwarf.deinit(gpa); + of.symbols_by_name.deinit(gpa); + } + mf.ofiles.deinit(gpa); + gpa.free(mf.symbols); + posix.munmap(mf.mapped_memory); +} + +pub fn load(gpa: Allocator, path: []const u8, arch: std.Target.Cpu.Arch) Error!MachOFile { + switch (arch) { + .x86_64, .aarch64 => {}, + else => unreachable, + } + + const all_mapped_memory = try mapDebugInfoFile(path); + errdefer posix.munmap(all_mapped_memory); + + // In most cases, the file we just mapped is a Mach-O binary. However, it could be a "universal + // binary": a simple file format which contains Mach-O binaries for multiple targets. For + // instance, `/usr/lib/dyld` is currently distributed as a universal binary containing images + // for both ARM64 macOS and x86_64 macOS. + if (all_mapped_memory.len < 4) return error.InvalidMachO; + const magic = std.mem.readInt(u32, all_mapped_memory.ptr[0..4], .little); + + // The contents of a Mach-O file, which may or may not be the whole of `all_mapped_memory`. + const mapped_macho = switch (magic) { + macho.MH_MAGIC_64 => all_mapped_memory, + + macho.FAT_CIGAM => mapped_macho: { + // This is the universal binary format (aka a "fat binary"). + var fat_r: Io.Reader = .fixed(all_mapped_memory); + const hdr = fat_r.takeStruct(macho.fat_header, .big) catch |err| switch (err) { + error.ReadFailed => unreachable, + error.EndOfStream => return error.InvalidMachO, + }; + const want_cpu_type = switch (arch) { + .x86_64 => macho.CPU_TYPE_X86_64, + .aarch64 => macho.CPU_TYPE_ARM64, + else => unreachable, + }; + for (0..hdr.nfat_arch) |_| { + const fat_arch = fat_r.takeStruct(macho.fat_arch, .big) catch |err| switch (err) { + error.ReadFailed => unreachable, + error.EndOfStream => return error.InvalidMachO, + }; + if (fat_arch.cputype != want_cpu_type) continue; + if (fat_arch.offset + fat_arch.size > all_mapped_memory.len) return error.InvalidMachO; + break :mapped_macho all_mapped_memory[fat_arch.offset..][0..fat_arch.size]; + } + // `arch` was not present in the fat binary. + return error.MissingDebugInfo; + }, + + // Even on modern 64-bit targets, this format doesn't seem to be too extensively used. It + // will be fairly easy to add support here if necessary; it's very similar to above. + macho.FAT_CIGAM_64 => return error.UnsupportedDebugInfo, + + else => return error.InvalidMachO, + }; + + var r: Io.Reader = .fixed(mapped_macho); + const hdr = r.takeStruct(macho.mach_header_64, .little) catch |err| switch (err) { + error.ReadFailed => unreachable, + error.EndOfStream => return error.InvalidMachO, + }; + + if (hdr.magic != macho.MH_MAGIC_64) + return error.InvalidMachO; + + const symtab: macho.symtab_command, const text_vmaddr: u64 = lcs: { + var it: macho.LoadCommandIterator = try .init(&hdr, mapped_macho[@sizeOf(macho.mach_header_64)..]); + var symtab: ?macho.symtab_command = null; + var text_vmaddr: ?u64 = null; + while (try it.next()) |cmd| switch (cmd.hdr.cmd) { + .SYMTAB => symtab = cmd.cast(macho.symtab_command) orelse return error.InvalidMachO, + .SEGMENT_64 => if (cmd.cast(macho.segment_command_64)) |seg_cmd| { + if (!mem.eql(u8, seg_cmd.segName(), "__TEXT")) continue; + text_vmaddr = seg_cmd.vmaddr; + }, + else => {}, + }; + break :lcs .{ + symtab orelse return error.MissingDebugInfo, + text_vmaddr orelse return error.MissingDebugInfo, + }; + }; + + const strings = mapped_macho[symtab.stroff..][0 .. symtab.strsize - 1]; + + var symbols: std.ArrayList(Symbol) = try .initCapacity(gpa, symtab.nsyms); + defer symbols.deinit(gpa); + + // This map is temporary; it is used only to detect duplicates here. This is + // necessary because we prefer to use STAB ("symbolic debugging table") symbols, + // but they might not be present, so we track normal symbols too. + // Indices match 1-1 with those of `symbols`. + var symbol_names: std.StringArrayHashMapUnmanaged(void) = .empty; + defer symbol_names.deinit(gpa); + try symbol_names.ensureUnusedCapacity(gpa, symtab.nsyms); + + var ofile: u32 = undefined; + var last_sym: Symbol = undefined; + var state: enum { + init, + oso_open, + oso_close, + bnsym, + fun_strx, + fun_size, + ensym, + } = .init; + + var sym_r: Io.Reader = .fixed(mapped_macho[symtab.symoff..]); + for (0..symtab.nsyms) |_| { + const sym = sym_r.takeStruct(macho.nlist_64, .little) catch |err| switch (err) { + error.ReadFailed => unreachable, + error.EndOfStream => return error.InvalidMachO, + }; + if (sym.n_type.bits.is_stab == 0) { + if (sym.n_strx == 0) continue; + switch (sym.n_type.bits.type) { + .undf, .pbud, .indr, .abs, _ => continue, + .sect => { + const name = std.mem.sliceTo(strings[sym.n_strx..], 0); + const gop = symbol_names.getOrPutAssumeCapacity(name); + if (!gop.found_existing) { + assert(gop.index == symbols.items.len); + symbols.appendAssumeCapacity(.{ + .strx = sym.n_strx, + .addr = sym.n_value, + .ofile = Symbol.unknown_ofile, + }); + } + }, + } + continue; + } + + // TODO handle globals N_GSYM, and statics N_STSYM + switch (sym.n_type.stab) { + .oso => switch (state) { + .init, .oso_close => { + state = .oso_open; + ofile = sym.n_strx; + }, + else => return error.InvalidMachO, + }, + .bnsym => switch (state) { + .oso_open, .ensym => { + state = .bnsym; + last_sym = .{ + .strx = 0, + .addr = sym.n_value, + .ofile = ofile, + }; + }, + else => return error.InvalidMachO, + }, + .fun => switch (state) { + .bnsym => { + state = .fun_strx; + last_sym.strx = sym.n_strx; + }, + .fun_strx => { + state = .fun_size; + }, + else => return error.InvalidMachO, + }, + .ensym => switch (state) { + .fun_size => { + state = .ensym; + if (last_sym.strx != 0) { + const name = std.mem.sliceTo(strings[last_sym.strx..], 0); + const gop = symbol_names.getOrPutAssumeCapacity(name); + if (!gop.found_existing) { + assert(gop.index == symbols.items.len); + symbols.appendAssumeCapacity(last_sym); + } else { + symbols.items[gop.index] = last_sym; + } + } + }, + else => return error.InvalidMachO, + }, + .so => switch (state) { + .init, .oso_close => {}, + .oso_open, .ensym => { + state = .oso_close; + }, + else => return error.InvalidMachO, + }, + else => {}, + } + } + + switch (state) { + .init => { + // Missing STAB symtab entries is still okay, unless there were also no normal symbols. + if (symbols.items.len == 0) return error.MissingDebugInfo; + }, + .oso_close => {}, + else => return error.InvalidMachO, // corrupted STAB entries in symtab + } + + const symbols_slice = try symbols.toOwnedSlice(gpa); + errdefer gpa.free(symbols_slice); + + // Even though lld emits symbols in ascending order, this debug code + // should work for programs linked in any valid way. + // This sort is so that we can binary search later. + mem.sort(Symbol, symbols_slice, {}, Symbol.addressLessThan); + + return .{ + .mapped_memory = all_mapped_memory, + .symbols = symbols_slice, + .strings = strings, + .ofiles = .empty, + .text_vmaddr = text_vmaddr, + }; +} +pub fn getDwarfForAddress(mf: *MachOFile, gpa: Allocator, vaddr: u64) !struct { *Dwarf, u64 } { + const symbol = Symbol.find(mf.symbols, vaddr) orelse return error.MissingDebugInfo; + + if (symbol.ofile == Symbol.unknown_ofile) return error.MissingDebugInfo; + + // offset of `address` from start of `symbol` + const address_symbol_offset = vaddr - symbol.addr; + + // Take the symbol name from the N_FUN STAB entry, we're going to + // use it if we fail to find the DWARF infos + const stab_symbol = mem.sliceTo(mf.strings[symbol.strx..], 0); + + const gop = try mf.ofiles.getOrPut(gpa, symbol.ofile); + if (!gop.found_existing) { + const name = mem.sliceTo(mf.strings[symbol.ofile..], 0); + gop.value_ptr.* = loadOFile(gpa, name); + } + const of = &(gop.value_ptr.* catch |err| return err); + + const symbol_index = of.symbols_by_name.getKeyAdapted( + @as([]const u8, stab_symbol), + @as(OFile.SymbolAdapter, .{ .strtab = of.strtab, .symtab_raw = of.symtab_raw }), + ) orelse return error.MissingDebugInfo; + + const symbol_ofile_vaddr = vaddr: { + var sym = of.symtab_raw[symbol_index]; + if (builtin.cpu.arch.endian() != .little) std.mem.byteSwapAllFields(macho.nlist_64, &sym); + break :vaddr sym.n_value; + }; + + return .{ &of.dwarf, symbol_ofile_vaddr + address_symbol_offset }; +} +pub fn lookupSymbolName(mf: *MachOFile, vaddr: u64) error{MissingDebugInfo}![]const u8 { + const symbol = Symbol.find(mf.symbols, vaddr) orelse return error.MissingDebugInfo; + return mem.sliceTo(mf.strings[symbol.strx..], 0); +} + +const OFile = struct { + mapped_memory: []align(std.heap.page_size_min) const u8, + dwarf: Dwarf, + strtab: []const u8, + symtab_raw: []align(1) const macho.nlist_64, + /// All named symbols in `symtab_raw`. Stored `u32` key is the index into `symtab_raw`. Accessed + /// through `SymbolAdapter`, so that the symbol name is used as the logical key. + symbols_by_name: std.ArrayHashMapUnmanaged(u32, void, void, true), + + const SymbolAdapter = struct { + strtab: []const u8, + symtab_raw: []align(1) const macho.nlist_64, + pub fn hash(ctx: SymbolAdapter, sym_name: []const u8) u32 { + _ = ctx; + return @truncate(std.hash.Wyhash.hash(0, sym_name)); + } + pub fn eql(ctx: SymbolAdapter, a_sym_name: []const u8, b_sym_index: u32, b_index: usize) bool { + _ = b_index; + var b_sym = ctx.symtab_raw[b_sym_index]; + if (builtin.cpu.arch.endian() != .little) std.mem.byteSwapAllFields(macho.nlist_64, &b_sym); + const b_sym_name = std.mem.sliceTo(ctx.strtab[b_sym.n_strx..], 0); + return mem.eql(u8, a_sym_name, b_sym_name); + } + }; +}; + +const Symbol = struct { + strx: u32, + addr: u64, + /// Value may be `unknown_ofile`. + ofile: u32, + const unknown_ofile = std.math.maxInt(u32); + fn addressLessThan(context: void, lhs: Symbol, rhs: Symbol) bool { + _ = context; + return lhs.addr < rhs.addr; + } + /// Assumes that `symbols` is sorted in order of ascending `addr`. + fn find(symbols: []const Symbol, address: usize) ?*const Symbol { + if (symbols.len == 0) return null; // no potential match + if (address < symbols[0].addr) return null; // address is before the lowest-address symbol + var left: usize = 0; + var len: usize = symbols.len; + while (len > 1) { + const mid = left + len / 2; + if (address < symbols[mid].addr) { + len /= 2; + } else { + left = mid; + len -= len / 2; + } + } + return &symbols[left]; + } + + test find { + const symbols: []const Symbol = &.{ + .{ .addr = 100, .strx = undefined, .ofile = undefined }, + .{ .addr = 200, .strx = undefined, .ofile = undefined }, + .{ .addr = 300, .strx = undefined, .ofile = undefined }, + }; + + try testing.expectEqual(null, find(symbols, 0)); + try testing.expectEqual(null, find(symbols, 99)); + try testing.expectEqual(&symbols[0], find(symbols, 100).?); + try testing.expectEqual(&symbols[0], find(symbols, 150).?); + try testing.expectEqual(&symbols[0], find(symbols, 199).?); + + try testing.expectEqual(&symbols[1], find(symbols, 200).?); + try testing.expectEqual(&symbols[1], find(symbols, 250).?); + try testing.expectEqual(&symbols[1], find(symbols, 299).?); + + try testing.expectEqual(&symbols[2], find(symbols, 300).?); + try testing.expectEqual(&symbols[2], find(symbols, 301).?); + try testing.expectEqual(&symbols[2], find(symbols, 5000).?); + } +}; +test { + _ = Symbol; +} + +fn loadOFile(gpa: Allocator, o_file_name: []const u8) !OFile { + const all_mapped_memory, const mapped_ofile = map: { + const open_paren = paren: { + if (std.mem.endsWith(u8, o_file_name, ")")) { + if (std.mem.findScalarLast(u8, o_file_name, '(')) |i| { + break :paren i; + } + } + // Not an archive, just a normal path to a .o file + const m = try mapDebugInfoFile(o_file_name); + break :map .{ m, m }; + }; + + // We have the form 'path/to/archive.a(entry.o)'. Map the archive and find the object file in question. + + const archive_path = o_file_name[0..open_paren]; + const target_name_in_archive = o_file_name[open_paren + 1 .. o_file_name.len - 1]; + const mapped_archive = try mapDebugInfoFile(archive_path); + errdefer posix.munmap(mapped_archive); + + var ar_reader: Io.Reader = .fixed(mapped_archive); + const ar_magic = ar_reader.take(8) catch return error.InvalidMachO; + if (!std.mem.eql(u8, ar_magic, "!<arch>\n")) return error.InvalidMachO; + while (true) { + if (ar_reader.seek == ar_reader.buffer.len) return error.MissingDebugInfo; + + const raw_name = ar_reader.takeArray(16) catch return error.InvalidMachO; + ar_reader.discardAll(12 + 6 + 6 + 8) catch return error.InvalidMachO; + const raw_size = ar_reader.takeArray(10) catch return error.InvalidMachO; + const file_magic = ar_reader.takeArray(2) catch return error.InvalidMachO; + if (!std.mem.eql(u8, file_magic, "`\n")) return error.InvalidMachO; + + const size = std.fmt.parseInt(u32, mem.sliceTo(raw_size, ' '), 10) catch return error.InvalidMachO; + const raw_data = ar_reader.take(size) catch return error.InvalidMachO; + + const entry_name: []const u8, const entry_contents: []const u8 = entry: { + if (!std.mem.startsWith(u8, raw_name, "#1/")) { + break :entry .{ mem.sliceTo(raw_name, '/'), raw_data }; + } + const len = std.fmt.parseInt(u32, mem.sliceTo(raw_name[3..], ' '), 10) catch return error.InvalidMachO; + if (len > size) return error.InvalidMachO; + break :entry .{ mem.sliceTo(raw_data[0..len], 0), raw_data[len..] }; + }; + + if (std.mem.eql(u8, entry_name, target_name_in_archive)) { + break :map .{ mapped_archive, entry_contents }; + } + } + }; + errdefer posix.munmap(all_mapped_memory); + + var r: Io.Reader = .fixed(mapped_ofile); + const hdr = r.takeStruct(macho.mach_header_64, .little) catch |err| switch (err) { + error.ReadFailed => unreachable, + error.EndOfStream => return error.InvalidMachO, + }; + if (hdr.magic != std.macho.MH_MAGIC_64) return error.InvalidMachO; + + const seg_cmd: macho.LoadCommandIterator.LoadCommand, const symtab_cmd: macho.symtab_command = cmds: { + var seg_cmd: ?macho.LoadCommandIterator.LoadCommand = null; + var symtab_cmd: ?macho.symtab_command = null; + var it: macho.LoadCommandIterator = try .init(&hdr, mapped_ofile[@sizeOf(macho.mach_header_64)..]); + while (try it.next()) |lc| switch (lc.hdr.cmd) { + .SEGMENT_64 => seg_cmd = lc, + .SYMTAB => symtab_cmd = lc.cast(macho.symtab_command) orelse return error.InvalidMachO, + else => {}, + }; + break :cmds .{ + seg_cmd orelse return error.MissingDebugInfo, + symtab_cmd orelse return error.MissingDebugInfo, + }; + }; + + if (mapped_ofile.len < symtab_cmd.stroff + symtab_cmd.strsize) return error.InvalidMachO; + if (mapped_ofile[symtab_cmd.stroff + symtab_cmd.strsize - 1] != 0) return error.InvalidMachO; + const strtab = mapped_ofile[symtab_cmd.stroff..][0 .. symtab_cmd.strsize - 1]; + + const n_sym_bytes = symtab_cmd.nsyms * @sizeOf(macho.nlist_64); + if (mapped_ofile.len < symtab_cmd.symoff + n_sym_bytes) return error.InvalidMachO; + const symtab_raw: []align(1) const macho.nlist_64 = @ptrCast(mapped_ofile[symtab_cmd.symoff..][0..n_sym_bytes]); + + // TODO handle tentative (common) symbols + var symbols_by_name: std.ArrayHashMapUnmanaged(u32, void, void, true) = .empty; + defer symbols_by_name.deinit(gpa); + try symbols_by_name.ensureUnusedCapacity(gpa, @intCast(symtab_raw.len)); + for (symtab_raw, 0..) |sym_raw, sym_index| { + var sym = sym_raw; + if (builtin.cpu.arch.endian() != .little) std.mem.byteSwapAllFields(macho.nlist_64, &sym); + if (sym.n_strx == 0) continue; + switch (sym.n_type.bits.type) { + .undf => continue, // includes tentative symbols + .abs => continue, + else => {}, + } + const sym_name = mem.sliceTo(strtab[sym.n_strx..], 0); + const gop = symbols_by_name.getOrPutAssumeCapacityAdapted( + @as([]const u8, sym_name), + @as(OFile.SymbolAdapter, .{ .strtab = strtab, .symtab_raw = symtab_raw }), + ); + if (gop.found_existing) return error.InvalidMachO; + gop.key_ptr.* = @intCast(sym_index); + } + + var sections: Dwarf.SectionArray = @splat(null); + for (seg_cmd.getSections()) |sect_raw| { + var sect = sect_raw; + if (builtin.cpu.arch.endian() != .little) std.mem.byteSwapAllFields(macho.section_64, §); + + if (!std.mem.eql(u8, "__DWARF", sect.segName())) continue; + + const section_index: usize = inline for (@typeInfo(Dwarf.Section.Id).@"enum".fields, 0..) |section, i| { + if (mem.eql(u8, "__" ++ section.name, sect.sectName())) break i; + } else continue; + + if (mapped_ofile.len < sect.offset + sect.size) return error.InvalidMachO; + const section_bytes = mapped_ofile[sect.offset..][0..sect.size]; + sections[section_index] = .{ + .data = section_bytes, + .owned = false, + }; + } + + if (sections[@intFromEnum(Dwarf.Section.Id.debug_info)] == null or + sections[@intFromEnum(Dwarf.Section.Id.debug_abbrev)] == null or + sections[@intFromEnum(Dwarf.Section.Id.debug_str)] == null or + sections[@intFromEnum(Dwarf.Section.Id.debug_line)] == null) + { + return error.MissingDebugInfo; + } + + var dwarf: Dwarf = .{ .sections = sections }; + errdefer dwarf.deinit(gpa); + dwarf.open(gpa, .little) catch |err| switch (err) { + error.InvalidDebugInfo, + error.EndOfStream, + error.Overflow, + error.StreamTooLong, + => return error.InvalidDwarf, + + error.MissingDebugInfo, + error.ReadFailed, + error.OutOfMemory, + => |e| return e, + }; + + return .{ + .mapped_memory = all_mapped_memory, + .dwarf = dwarf, + .strtab = strtab, + .symtab_raw = symtab_raw, + .symbols_by_name = symbols_by_name.move(), + }; +} + +/// Uses `mmap` to map the file at `path` into memory. +fn mapDebugInfoFile(path: []const u8) ![]align(std.heap.page_size_min) const u8 { + const file = std.fs.cwd().openFile(path, .{}) catch |err| switch (err) { + error.FileNotFound => return error.MissingDebugInfo, + else => return error.ReadFailed, + }; + defer file.close(); + + const file_len = std.math.cast( + usize, + file.getEndPos() catch return error.ReadFailed, + ) orelse return error.ReadFailed; + + return posix.mmap( + null, + file_len, + posix.PROT.READ, + .{ .TYPE = .SHARED }, + file.handle, + 0, + ) catch return error.ReadFailed; +} + +const std = @import("std"); +const Allocator = std.mem.Allocator; +const Dwarf = std.debug.Dwarf; +const Io = std.Io; +const assert = std.debug.assert; +const posix = std.posix; +const macho = std.macho; +const mem = std.mem; +const testing = std.testing; + +const builtin = @import("builtin"); + +const MachOFile = @This(); |
