From cb28fc2e63dea2902fda21b7738aa93eaf4a2ea0 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Wed, 25 May 2022 22:35:11 +0200
Subject: wasm-linker: Resolve symbols from archives

Lazily load object files by default, and only load the object file
when an unresolved symbol has been found within an archive.
---
 src/link/Wasm.zig         | 117 +++++++++++++++++++++++++++++++++++++++++-----
 src/link/Wasm/Archive.zig |  33 +++++++++++--
 2 files changed, 135 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/link/Wasm.zig b/src/link/Wasm.zig
index 71f5c6f784..f734b228ae 100644
--- a/src/link/Wasm.zig
+++ b/src/link/Wasm.zig
@@ -29,6 +29,7 @@ const Air = @import("../Air.zig");
 const Liveness = @import("../Liveness.zig");
 const Symbol = @import("Wasm/Symbol.zig");
 const Object = @import("Wasm/Object.zig");
+const Archive = @import("Wasm/Archive.zig");
 const types = @import("Wasm/types.zig");
 
 pub const base_tag = link.File.Tag.wasm;
@@ -125,6 +126,10 @@ function_table: std.AutoHashMapUnmanaged(SymbolLoc, u32) = .{},
 
 /// All object files and their data which are linked into the final binary
 objects: std.ArrayListUnmanaged(Object) = .{},
+/// All archive files that are lazy loaded.
+/// e.g. when an undefined symbol references a symbol from the archive.
+archives: std.ArrayListUnmanaged(Archive) = .{},
+
 /// A map of global names (read: offset into string table) to their symbol location
 globals: std.AutoHashMapUnmanaged(u32, SymbolLoc) = .{},
 /// Maps discarded symbols and their positions to the location of the symbol
@@ -133,6 +138,8 @@ discarded: std.AutoHashMapUnmanaged(SymbolLoc, SymbolLoc) = .{},
 /// List of all symbol locations which have been resolved by the linker and will be emit
 /// into the final binary.
 resolved_symbols: std.AutoArrayHashMapUnmanaged(SymbolLoc, void) = .{},
+/// Symbols that remain undefined after symbol resolution.
+undefs: std.StringArrayHashMapUnmanaged(SymbolLoc) = .{},
 /// Maps a symbol's location to an atom. This can be used to find meta
 /// data of a symbol, such as its size, or its offset to perform a relocation.
 /// Undefined (and synthetic) symbols do not have an Atom and therefore cannot be mapped.
@@ -359,6 +366,7 @@ pub fn createEmpty(gpa: Allocator, options: link.Options) !*Wasm {
 fn parseInputFiles(self: *Wasm, files: []const []const u8) !void {
     for (files) |path| {
         if (try self.parseObjectFile(path)) continue;
+        if (try self.parseArchive(path, false)) continue; // not an object file: try it as an archive; force_load=false only registers it for lazy loading
         log.warn("Unexpected file format at path: '{s}'", .{path});
     }
 }
@@ -371,10 +379,7 @@ fn parseObjectFile(self: *Wasm, path: []const u8) !bool {
     errdefer file.close();
 
     var object = Object.create(self.base.allocator, file, path) catch |err| switch (err) {
-        error.InvalidMagicByte, error.NotObjectFile => {
-            log.warn("Self hosted linker does not support non-object file parsing: {s}", .{@errorName(err)});
-            return false;
-        },
+        error.InvalidMagicByte, error.NotObjectFile => return false,
         else => |e| return e,
     };
     errdefer object.deinit(self.base.allocator);
@@ -382,6 +387,56 @@ fn parseObjectFile(self: *Wasm, path: []const u8) !bool {
     return true;
 }
 
+/// Tries to parse the file at `path` as an archive.
+/// Returns false when the file is not an archive (`error.EndOfStream` or
+/// `error.NotArchive` while parsing); any other parse error is returned as-is.
+///
+/// When `force_load` is `true`, every object file referenced by the archive's
+/// table of contents is parsed and added to `objects` right away.
+/// When `false`, the archive is only stored in `archives`, to be consulted
+/// later for symbols that are still undefined.
+fn parseArchive(self: *Wasm, path: []const u8, force_load: bool) !bool {
+    const file = try fs.cwd().openFile(path, .{});
+    errdefer file.close();
+
+    var archive: Archive = .{
+        .file = file,
+        .name = path,
+    };
+    archive.parse(self.base.allocator) catch |err| switch (err) {
+        error.EndOfStream, error.NotArchive => {
+            archive.deinit(self.base.allocator);
+            return false;
+        },
+        else => |e| return e,
+    };
+
+    if (!force_load) {
+        errdefer archive.deinit(self.base.allocator);
+        try self.archives.append(self.base.allocator, archive);
+        return true;
+    }
+    defer archive.deinit(self.base.allocator);
+
+    // In this case we must force link all embedded object files within the archive.
+    // We loop over all symbols and deduplicate their offsets, as each distinct
+    // offset notates where an object file starts.
+    var offsets = std.AutoArrayHashMap(u32, void).init(self.base.allocator);
+    defer offsets.deinit();
+    for (archive.toc.values()) |symbol_offsets| {
+        for (symbol_offsets.items) |sym_offset| {
+            try offsets.put(sym_offset, {});
+        }
+    }
+
+    for (offsets.keys()) |file_offset| {
+        // Reserve capacity first: a failed parse must not leave an undefined slot in `objects`.
+        try self.objects.ensureUnusedCapacity(self.base.allocator, 1);
+        self.objects.appendAssumeCapacity(try archive.parseObject(self.base.allocator, file_offset));
+    }
+    return true;
+}
+
 fn resolveSymbolsInObject(self: *Wasm, object_index: u16) !void {
     const object: Object = self.objects.items[object_index];
     log.debug("Resolving symbols in object: '{s}'", .{object.name});
@@ -414,6 +469,10 @@ fn resolveSymbolsInObject(self: *Wasm, object_index: u16) !void {
         if (!maybe_existing.found_existing) {
             maybe_existing.value_ptr.* = location;
             try self.resolved_symbols.putNoClobber(self.base.allocator, location, {});
+
+            if (symbol.isUndefined()) {
+                try self.undefs.putNoClobber(self.base.allocator, sym_name, location);
+            }
             continue;
         }
 
@@ -456,6 +515,42 @@ fn resolveSymbolsInObject(self: *Wasm, object_index: u16) !void {
         try self.globals.put(self.base.allocator, sym_name_index, location);
         try self.resolved_symbols.put(self.base.allocator, location, {});
         assert(self.resolved_symbols.swapRemove(existing_loc));
+        if (existing_sym.isUndefined()) {
+            // ensure order remains intact in case we later
+            // resolve symbols again in a loop
+            assert(self.undefs.orderedRemove(sym_name));
+        }
+    }
+}
+
+/// For each symbol that is still undefined, searches the lazily loaded archives and,
+/// on a match, parses the object file at the symbol's first offset and resolves it.
+fn resolveSymbolsInArchives(self: *Wasm) !void {
+    if (self.archives.items.len == 0) return;
+
+    log.debug("Resolving symbols in archives", .{});
+    var index: u32 = 0;
+    undef_loop: while (index < self.undefs.count()) {
+        const undef_sym_loc = self.undefs.values()[index];
+        const sym_name = undef_sym_loc.getName(self);
+
+        for (self.archives.items) |archive| {
+            const offset = archive.toc.get(sym_name) orelse continue; // symbol does not exist in this archive
+
+            // Symbol is found in unparsed object file within current archive.
+            // Parse object and resolve symbols again before we check remaining
+            // undefined symbols. Capacity is reserved before parsing so that a
+            // failing `parseObject` never leaves an undefined `Object` in `objects`.
+            const object_file_index = @intCast(u16, self.objects.items.len);
+            try self.objects.ensureUnusedCapacity(self.base.allocator, 1);
+            self.objects.appendAssumeCapacity(try archive.parseObject(self.base.allocator, offset.items[0]));
+            try self.resolveSymbolsInObject(object_file_index);
+
+            // Retry at the same index: resolving may have removed entries from `undefs`.
+            // NOTE(review): assumes this symbol was removed from `undefs`, else this repeats — confirm.
+            continue :undef_loop;
+        }
+        index += 1;
     }
 }
 
@@ -789,6 +884,7 @@ pub fn getGlobalSymbol(self: *Wasm, name: []const u8) !u32 {
     self.symbols.items[sym_index] = symbol;
     gop.value_ptr.* = .{ .index = sym_index, .file = null };
     try self.resolved_symbols.put(self.base.allocator, gop.value_ptr.*, {});
+    try self.undefs.putNoClobber(self.base.allocator, name, gop.value_ptr.*);
     return sym_index;
 }
 
@@ -1017,6 +1113,7 @@ pub fn addOrUpdateImport(
         const loc: SymbolLoc = .{ .file = null, .index = symbol_index };
         global_gop.value_ptr.* = loc;
         try self.resolved_symbols.put(self.base.allocator, loc, {});
+        try self.undefs.putNoClobber(self.base.allocator, name, loc);
     }
 
     if (type_index) |ty_index| {
@@ -1298,7 +1395,7 @@ fn mergeTypes(self: *Wasm) !void {
     // type inserted. If we do this for the same function multiple times,
     // it will be overwritten with the incorrect type.
     var dirty = std.AutoHashMap(u32, void).init(self.base.allocator);
-    try dirty.ensureUnusedCapacity(@intCast(u32, self.functions.count()) + self.imported_functions_count);
+    try dirty.ensureUnusedCapacity(@intCast(u32, self.functions.count()));
     defer dirty.deinit();
 
     for (self.resolved_symbols.keys()) |sym_loc| {
@@ -1313,22 +1410,17 @@ fn mergeTypes(self: *Wasm) !void {
             continue;
         }
 
-        if (dirty.contains(symbol.index)) {
-            continue; // We already added the type of this symbol
-        }
-
         if (symbol.isUndefined()) {
             log.debug("Adding type from extern function '{s}'", .{sym_loc.getName(self)});
             const import: *types.Import = self.imports.getPtr(sym_loc).?;
             const original_type = object.func_types[import.kind.function];
             import.kind.function = try self.putOrGetFuncType(original_type);
-        } else {
+        } else if (!dirty.contains(symbol.index)) {
             log.debug("Adding type from function '{s}'", .{sym_loc.getName(self)});
             const func = &self.functions.values()[symbol.index - self.imported_functions_count];
             func.type_index = try self.putOrGetFuncType(object.func_types[func.type_index]);
+            dirty.putAssumeCapacityNoClobber(symbol.index, {});
         }
-
-        dirty.putAssumeCapacityNoClobber(symbol.index, {});
     }
     log.debug("Completed merging and deduplicating types. Total count: ({d})", .{self.func_types.items.len});
 }
@@ -1747,6 +1839,7 @@ pub fn flushModule(self: *Wasm, comp: *Compilation, prog_node: *std.Progress.Nod
     while (object_index < self.objects.items.len) : (object_index += 1) {
         try self.resolveSymbolsInObject(object_index);
     }
+    try self.resolveSymbolsInArchives();
 
     // When we finish/error we reset the state of the linker
     // So we can rebuild the binary file on each incremental update
diff --git a/src/link/Wasm/Archive.zig b/src/link/Wasm/Archive.zig
index ca69795537..816a8cd0d0 100644
--- a/src/link/Wasm/Archive.zig
+++ b/src/link/Wasm/Archive.zig
@@ -113,8 +113,6 @@ pub fn parse(self: *Archive, allocator: Allocator) !void {
         return error.NotArchive;
     }
 
-    log.debug("parsing archive '{s}' at '{s}'", .{ std.mem.sliceTo(&self.header.ar_name, 0), self.name });
-
     try self.parseTableOfContents(allocator, reader);
 }
 
@@ -175,6 +173,35 @@ fn parseTableOfContents(self: *Archive, allocator: Allocator, reader: anytype) !
             gop.value_ptr.* = .{};
         }
         try gop.value_ptr.append(allocator, symbol_positions[gop.index]);
-        log.debug("  parsed symbol '{s}' for position {d}", .{ string, symbol_positions[gop.index] });
     }
 }
+
+/// Reads the archive member header at `file_offset`, derives an object file path
+/// from the member name ("<archive name>.o" when the name is empty), then opens
+/// that path and parses it into an `Object`.
+/// Returns `error.MalformedArchive` when the header delimiter is invalid.
+pub fn parseObject(self: Archive, allocator: Allocator, file_offset: u32) !Object {
+    try self.file.seekTo(file_offset);
+    const reader = self.file.reader();
+    const header = try reader.readStruct(ar_hdr);
+
+    if (!mem.eql(u8, &header.ar_fmag, ARFMAG)) {
+        log.err("invalid header delimiter: expected '{s}', found '{s}'", .{ ARFMAG, header.ar_fmag });
+        return error.MalformedArchive;
+    }
+
+    const object_name = try parseName(allocator, header, reader);
+    defer allocator.free(object_name);
+
+    const name = name: {
+        if (object_name.len == 0) break :name try std.fmt.allocPrint(allocator, "{s}.o", .{self.name});
+        // `dirname` is null when the archive path has no directory component; use the cwd then.
+        const base_path = std.fs.path.dirname(self.name) orelse ".";
+        break :name try std.fmt.allocPrint(allocator, "{s}/{s}.o", .{ base_path, object_name });
+    };
+
+    log.debug(" parsing object file '{s}' from archive\n", .{name});
+    const object_file = try std.fs.cwd().openFile(name, .{});
+    errdefer object_file.close();
+    return Object.create(allocator, object_file, name);
+}
-- 
cgit v1.2.3