| author | mlugg <mlugg@mlugg.co.uk> | 2025-08-05 22:48:50 +0100 |
|---|---|---|
| committer | Matthew Lugg <mlugg@mlugg.co.uk> | 2025-08-06 21:50:16 +0100 |
| commit | e17a050bc695f7d117b89adb1d258813593ca111 (patch) | |
| tree | f281876485848c783fb9e2a10616b98cda0b2e09 /src/link/Queue.zig | |
| parent | 3de8bbd3d4e262df11a582fb52401b8077b5f352 (diff) | |
link: prevent deadlock when prelink tasks fail
If an error occurred which prevented a prelink task from being queued,
then `pending_prelink_tasks` would never be decremented, which could
cause deadlocks in some cases. So, instead of calculating ahead of time
the number of prelink tasks to expect, we use a simpler strategy, much
like a wait group: we add 1 to a counter when we spawn a worker, and
`defer` decrementing it in the worker function. The counter's initial
value is 1, and there is a final decrement after all of the workers have
been spawned, so once it hits 0, prelink is done (be it with a failure
or a success).
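
As a rough sketch of that counting scheme (all names here are invented for illustration; this is not the actual `src/link/Queue.zig` API, and the sketch is single-threaded so the mutex is omitted):

```zig
const std = @import("std");

/// Illustrative counter; it starts at 1, meaning "spawning is still in progress".
const PrelinkCounter = struct {
    count: u32 = 1,

    fn spawnWorker(c: *PrelinkCounter, fail_early: bool) void {
        c.count += 1; // +1 before the worker runs
        c.runWorker(fail_early);
    }

    fn runWorker(c: *PrelinkCounter, fail_early: bool) void {
        defer c.count -= 1; // runs on every path, even when no task is ever queued
        if (fail_early) return; // simulate the error case that used to deadlock
        // ... queue the prelink task ...
    }
};

test "counter reaches zero even when a worker fails" {
    var c: PrelinkCounter = .{};
    c.spawnWorker(true); // fails before queueing anything
    c.spawnWorker(false);
    c.count -= 1; // the final decrement: all workers have been spawned
    try std.testing.expectEqual(@as(u32, 0), c.count);
}
```

The invariant is that the counter can only reach 0 after the final decrement and after every spawned worker has run its `defer`; nothing depends on predicting the task count up front.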
Diffstat (limited to 'src/link/Queue.zig')
| -rw-r--r-- | src/link/Queue.zig | 58 |
1 file changed, 45 insertions(+), 13 deletions(-)
```diff
diff --git a/src/link/Queue.zig b/src/link/Queue.zig
index d1595636ac..9f4535e1fe 100644
--- a/src/link/Queue.zig
+++ b/src/link/Queue.zig
@@ -16,9 +16,9 @@
 mutex: std.Thread.Mutex,
 /// Validates that only one `flushTaskQueue` thread is running at a time.
 flush_safety: std.debug.SafetyLock,
-/// This is the number of prelink tasks which are expected but have not yet been enqueued.
-/// Guarded by `mutex`.
-pending_prelink_tasks: u32,
+/// This value is positive while there are still prelink tasks yet to be queued. Once they are
+/// all queued, this value becomes 0, and ZCU tasks can be run. Guarded by `mutex`.
+prelink_wait_count: u32,
 
 /// Prelink tasks which have been enqueued and are not yet owned by the worker thread.
 /// Allocated into `gpa`, guarded by `mutex`.
@@ -59,7 +59,7 @@ state: union(enum) {
     /// The link thread is currently running or queued to run.
     running,
     /// The link thread is not running or queued, because it has exhausted all immediately available
-    /// tasks. It should be spawned when more tasks are enqueued. If `pending_prelink_tasks` is not
+    /// tasks. It should be spawned when more tasks are enqueued. If `prelink_wait_count` is not
     /// zero, we are specifically waiting for prelink tasks.
     finished,
     /// The link thread is not running or queued, because it is waiting for this MIR to be populated.
@@ -73,11 +73,11 @@ state: union(enum) {
 const max_air_bytes_in_flight = 10 * 1024 * 1024;
 
 /// The initial `Queue` state, containing no tasks, expecting no prelink tasks, and with no running worker thread.
-/// The `pending_prelink_tasks` and `queued_prelink` fields may be modified as needed before calling `start`.
+/// The `queued_prelink` field may be appended to before calling `start`.
 pub const empty: Queue = .{
     .mutex = .{},
     .flush_safety = .{},
-    .pending_prelink_tasks = 0,
+    .prelink_wait_count = undefined, // set in `start`
     .queued_prelink = .empty,
     .wip_prelink = .empty,
     .queued_zcu = .empty,
@@ -100,17 +100,49 @@ pub fn deinit(q: *Queue, comp: *Compilation) void {
 }
 
 /// This is expected to be called exactly once, after which the caller must not directly access
-/// `queued_prelink` or `pending_prelink_tasks` any longer. This will spawn the link thread if
-/// necessary.
+/// `queued_prelink` any longer. This will spawn the link thread if necessary.
 pub fn start(q: *Queue, comp: *Compilation) void {
     assert(q.state == .finished);
     assert(q.queued_zcu.items.len == 0);
+    // Reset this to 1. We can't init it to 1 in `empty`, because it would fall to 0 on successive
+    // incremental updates, but we still need the initial 1.
+    q.prelink_wait_count = 1;
     if (q.queued_prelink.items.len != 0) {
         q.state = .running;
         comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
     }
 }
 
+/// Every call to this must be paired with a call to `finishPrelinkItem`.
+pub fn startPrelinkItem(q: *Queue) void {
+    q.mutex.lock();
+    defer q.mutex.unlock();
+    assert(q.prelink_wait_count > 0); // must not have finished everything already
+    q.prelink_wait_count += 1;
+}
+
+/// This function must be called exactly one more time than `startPrelinkItem` is. The final call
+/// indicates that we have finished calling `startPrelinkItem`, so once all pending items finish,
+/// we are ready to move on to ZCU tasks.
+pub fn finishPrelinkItem(q: *Queue, comp: *Compilation) void {
+    {
+        q.mutex.lock();
+        defer q.mutex.unlock();
+        q.prelink_wait_count -= 1;
+        if (q.prelink_wait_count != 0) return;
+        // The prelink task count dropped to 0; restart the linker thread if necessary.
+        switch (q.state) {
+            .wait_for_mir => unreachable, // we've not started zcu tasks yet
+            .running => return,
+            .finished => {},
+        }
+        assert(q.queued_prelink.items.len == 0);
+        // Even if there are no ZCU tasks, we must restart the linker thread to make sure
+        // that `link.File.prelink()` is called.
+        q.state = .running;
+    }
+    comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
+}
+
 /// Called by codegen workers after they have populated a `ZcuTask.LinkFunc.SharedMir`. If the link
 /// thread was waiting for this MIR, it can resume.
 pub fn mirReady(q: *Queue, comp: *Compilation, func_index: InternPool.Index, mir: *ZcuTask.LinkFunc.SharedMir) void {
@@ -130,14 +162,14 @@ pub fn mirReady(q: *Queue, comp: *Compilation, func_index: InternPool.Index, mir
     comp.thread_pool.spawnWgId(&comp.link_task_wait_group, flushTaskQueue, .{ q, comp });
 }
 
-/// Enqueues all prelink tasks in `tasks`. Asserts that they were expected, i.e. that `tasks.len` is
-/// less than or equal to `q.pending_prelink_tasks`. Also asserts that `tasks.len` is not 0.
+/// Enqueues all prelink tasks in `tasks`. Asserts that they were expected, i.e. that
+/// `prelink_wait_count` is not yet 0. Also asserts that `tasks.len` is not 0.
 pub fn enqueuePrelink(q: *Queue, comp: *Compilation, tasks: []const PrelinkTask) Allocator.Error!void {
     {
         q.mutex.lock();
         defer q.mutex.unlock();
+        assert(q.prelink_wait_count > 0);
         try q.queued_prelink.appendSlice(comp.gpa, tasks);
-        q.pending_prelink_tasks -= @intCast(tasks.len);
         switch (q.state) {
             .wait_for_mir => unreachable, // we've not started zcu tasks yet
             .running => return,
@@ -167,7 +199,7 @@ pub fn enqueueZcu(q: *Queue, comp: *Compilation, task: ZcuTask) Allocator.Error!
     try q.queued_zcu.append(comp.gpa, task);
     switch (q.state) {
         .running, .wait_for_mir => return,
-        .finished => if (q.pending_prelink_tasks != 0) return,
+        .finished => if (q.prelink_wait_count > 0) return,
     }
     // Restart the linker thread, unless it would immediately be blocked
     if (task == .link_func and task.link_func.mir.status.load(.acquire) == .pending) {
@@ -194,7 +226,7 @@ fn flushTaskQueue(tid: usize, q: *Queue, comp: *Compilation) void {
         defer q.mutex.unlock();
         std.mem.swap(std.ArrayListUnmanaged(PrelinkTask), &q.queued_prelink, &q.wip_prelink);
         if (q.wip_prelink.items.len == 0) {
-            if (q.pending_prelink_tasks == 0) {
+            if (q.prelink_wait_count == 0) {
                 break :prelink; // prelink is done
             } else {
                 // We're expecting more prelink tasks so can't move on to ZCU tasks.
```
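
The new API implies a pairing discipline at the call sites, which live in `Compilation` and are outside this diff. A minimal threaded sketch of that discipline, written against a hypothetical `StubQueue` and `prelinkWorker` rather than the real `Queue` (the restart-the-linker logic is elided):

```zig
const std = @import("std");

// Hypothetical stub standing in for `link.Queue`; only the counter is modeled.
const StubQueue = struct {
    mutex: std.Thread.Mutex = .{},
    prelink_wait_count: u32 = 1, // the initial 1 that `start` resets

    fn startPrelinkItem(q: *StubQueue) void {
        q.mutex.lock();
        defer q.mutex.unlock();
        std.debug.assert(q.prelink_wait_count > 0);
        q.prelink_wait_count += 1;
    }

    fn finishPrelinkItem(q: *StubQueue) void {
        q.mutex.lock();
        defer q.mutex.unlock();
        q.prelink_wait_count -= 1;
        // The real code restarts the linker thread when this reaches 0.
    }
};

fn prelinkWorker(q: *StubQueue, should_fail: bool) void {
    defer q.finishPrelinkItem(); // decrements even on the error path
    if (should_fail) return; // a failure here no longer strands the counter
    // ... produce and enqueue prelink tasks ...
}

pub fn main() !void {
    var q: StubQueue = .{};
    var threads: [2]std.Thread = undefined;
    for (&threads, 0..) |*t, i| {
        q.startPrelinkItem(); // +1 per item, before the worker is spawned
        t.* = try std.Thread.spawn(.{}, prelinkWorker, .{ &q, i == 0 });
    }
    q.finishPrelinkItem(); // matches the initial 1: spawning is complete
    for (threads) |t| t.join();
    std.debug.assert(q.prelink_wait_count == 0);
}
```

Because every `startPrelinkItem` is balanced by a `defer`red `finishPrelinkItem`, the counter drains to 0 regardless of whether any individual worker succeeds, which is exactly the property the old `pending_prelink_tasks` accounting lacked.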
