From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Tue, 16 Jan 2024 00:49:27 +0000 Subject: [PATCH] amdgpu: Forward -ENODATA Signed-off-by: Jan200101 --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 57f8f8b3cd8a..5763b51d8019 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -258,9 +258,8 @@ struct dma_fence *fence = NULL; int r; - /* Ignore soft recovered fences here */ r = drm_sched_entity_error(s_entity); - if (r && r != -ENODATA) + if (r) goto error; if (!fence && job->gang_submit) From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Tue, 16 Jan 2024 00:51:25 +0000 Subject: [PATCH] drm/amdgpu: Determine soft recovery deadline next to usage Otherwise we are determining this timeout based on a time before we go into some spinlock, which is bad. Signed-off-by: Joshua Ashton Signed-off-by: Jan200101 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 5d1a6e95b02e..e79ba4b171ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -434,8 +434,7 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, struct dma_fence *fence) { unsigned long flags; - - ktime_t deadline = ktime_add_us(ktime_get(), 10000); + ktime_t deadline; if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence) return false; @@ -446,6 +445,8 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, spin_unlock_irqrestore(fence->lock, flags); atomic_inc(&ring->adev->gpu_reset_counter); + + deadline = ktime_add_us(ktime_get(), 10000); while (!dma_fence_is_signaled(fence) && ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0) ring->funcs->soft_recovery(ring, vmid); From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Thu, 18 Jan 2024 21:45:39 +0000 Subject: [PATCH] drm/amdgpu: Bump soft recovery timeout to 1s Seems more reliable. Signed-off-by: Joshua Ashton Signed-off-by: Jan200101 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index e79ba4b171ae..1631c0c7c21b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -446,7 +446,7 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, atomic_inc(&ring->adev->gpu_reset_counter); - deadline = ktime_add_us(ktime_get(), 10000); + deadline = ktime_add_ms(ktime_get(), 1000); while (!dma_fence_is_signaled(fence) && ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0) ring->funcs->soft_recovery(ring, vmid); From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Thu, 18 Jan 2024 19:54:01 +0100 Subject: [PATCH] drm/amdgpu: Reset IH OVERFLOW_CLEAR bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows us to detect subsequent IH ring buffer overflows as well. Cc: Joshua Ashton Cc: Alex Deucher Cc: Christian König Cc: stable@vger.kernel.org Signed-off-by: Friedrich Vock Signed-off-by: Jan200101 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 2 ++ drivers/gpu/drm/amd/amdgpu/cik_ih.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/cz_ih.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/ih_v6_0.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/navi10_ih.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/si_ih.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 7 +++++++ 10 files changed, 63 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h index dd1c2eded6b9..ba52c4c92a54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h @@ -66,6 +66,8 @@ struct amdgpu_ih_ring { unsigned rptr; struct amdgpu_ih_regs ih_regs; + bool overflow; + /* For waiting on IH processing at checkpoint. */ wait_queue_head_t wait_process; uint64_t processed_timestamp; diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c index df385ffc9768..2cf8be061441 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c @@ -204,6 +204,13 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(mmIH_RB_CNTL); tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; WREG32(mmIH_RB_CNTL, tmp); + + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = true; } return (wptr & ih->ptr_mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c index b8c47e0cf37a..e5c4ed44bad9 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c @@ -216,6 +216,12 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev, tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32(mmIH_RB_CNTL, tmp); + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = true; out: return (wptr & ih->ptr_mask); diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c index aecad530b10a..075e5c1a5549 100644 --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c @@ -215,6 +215,12 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev, tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32(mmIH_RB_CNTL, tmp); + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = true; out: return (wptr & ih->ptr_mask); diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c index b02e1cef78a7..d855c20a0b66 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c @@ -418,6 +418,13 @@ static u32 ih_v6_0_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; out: return (wptr & ih->ptr_mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c index eec13cb5bf75..d211596ee66b 100644 --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c @@ -442,6 +442,13 @@ static u32 navi10_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; out: return (wptr & ih->ptr_mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c index 9a24f17a5750..398fbc296cac 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c @@ -119,6 +119,13 @@ static u32 si_ih_get_wptr(struct amdgpu_device *adev, tmp = RREG32(IH_RB_CNTL); tmp |= IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; WREG32(IH_RB_CNTL, tmp); + + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp &= ~IH_RB_CNTL__WPTR_OVERFLOW_CLEAR_MASK; + WREG32(IH_RB_CNTL, tmp); + ih->overflow = true; } return (wptr & ih->ptr_mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c index b08905d1c00f..0d9100425a4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c @@ -219,6 +219,13 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev, tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32(mmIH_RB_CNTL, tmp); + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32(mmIH_RB_CNTL, tmp); + ih->overflow = true; + out: return (wptr & ih->ptr_mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c index 1e83db0c5438..ee307d48186a 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c @@ -373,6 +373,13 @@ static u32 vega10_ih_get_wptr(struct amdgpu_device *adev, tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; + out: return (wptr & ih->ptr_mask); } diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c index 4d719df376a7..106fc3e7eb77 100644 --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c @@ -421,6 +421,13 @@ static u32 vega20_ih_get_wptr(struct amdgpu_device *adev, tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + ih->overflow = true; + out: return (wptr & ih->ptr_mask); } From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Thu, 18 Jan 2024 19:54:02 +0100 Subject: [PATCH] drm/amdgpu: Process fences on IH overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the IH ring buffer overflows, it's possible that fence signal events were lost. Check each ring for progress to prevent job timeouts/GPU hangs due to the fences staying unsignaled despite the work being done. Cc: Joshua Ashton Cc: Alex Deucher Cc: Christian König Cc: stable@vger.kernel.org Signed-off-by: Friedrich Vock Signed-off-by: Jan200101 --- drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index fceb3b384955..122f5d0000ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c @@ -205,6 +205,7 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) { unsigned int count; u32 wptr; + int i; if (!ih->enabled || adev->shutdown) return IRQ_NONE; @@ -223,6 +224,21 @@ int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih) ih->rptr &= ih->ptr_mask; } + /* If the ring buffer overflowed, we might have lost some fence + * signal interrupts. Check if there was any activity so the signal + * doesn't get lost. + */ + if (ih->overflow) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->fence_drv.initialized) + continue; + amdgpu_fence_process(ring); + } + ih->overflow = false; + } + amdgpu_ih_set_rptr(adev, ih); wake_up_all(&ih->wait_process);