aboutsummaryrefslogtreecommitdiff
path: root/SOURCES/CVE-2024-44961.patch
blob: 19e2808b74a8761f1413707c1c7d4e742663db53 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
From git@z Thu Jan  1 00:00:00 1970
Subject: [PATCH 1/2] drm/amdgpu: Pass amdgpu_job directly to
 amdgpu_ring_soft_recovery
From: Joshua Ashton <joshua@froggi.es>
Date: Sat, 13 Jan 2024 14:02:03 +0000
Message-Id: <20240113140206.2383133-1-joshua@froggi.es>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit

We will need this to change the karma in the future.

Signed-off-by: Joshua Ashton <joshua@froggi.es>

Cc: Friedrich Vock <friedrich.vock@gmx.de>
Cc: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: Christian König <christian.koenig@amd.com>
Cc: André Almeida <andrealmeid@igalia.com>
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 9 ++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 +--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 78476bc75b4e..c1af7ca25912 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -52,7 +52,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 	adev->job_hang = true;
 
 	if (amdgpu_gpu_recovery &&
-	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
+	    amdgpu_ring_soft_recovery(ring, job)) {
 		DRM_ERROR("ring %s timeout, but soft recovered\n",
 			  s_job->sched->name);
 		goto exit;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 45424ebf9681..25209ce54552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -425,14 +425,13 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
  * amdgpu_ring_soft_recovery - try to soft recover a ring lockup
  *
  * @ring: ring to try the recovery on
- * @vmid: VMID we try to get going again
- * @fence: timedout fence
+ * @job: the locked-up job
  *
  * Tries to get a ring proceeding again when it is stuck.
  */
-bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
-			       struct dma_fence *fence)
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, struct amdgpu_job *job)
 {
+	struct dma_fence *fence = job->base.s_fence->parent;
 	unsigned long flags;
 	ktime_t deadline;
 
@@ -452,7 +451,7 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
 	atomic_inc(&ring->adev->gpu_reset_counter);
 	while (!dma_fence_is_signaled(fence) &&
 	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
-		ring->funcs->soft_recovery(ring, vmid);
+		ring->funcs->soft_recovery(ring, job->vmid);
 
 	return dma_fence_is_signaled(fence);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index bbb53720a018..734df88f22d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -354,8 +354,7 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring);
 void amdgpu_ring_emit_reg_write_reg_wait_helper(struct amdgpu_ring *ring,
 						uint32_t reg0, uint32_t val0,
 						uint32_t reg1, uint32_t val1);
-bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
-			       struct dma_fence *fence);
+bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, struct amdgpu_job *job);
 
 static inline void amdgpu_ring_set_preempt_cond_exec(struct amdgpu_ring *ring,
 							bool cond_exec)
-- 
2.43.0

From git@z Thu Jan  1 00:00:00 1970
Subject: [PATCH 2/2] drm/amdgpu: Mark ctx as guilty in ring_soft_recovery
 path
From: Joshua Ashton <joshua@froggi.es>
Date: Sat, 13 Jan 2024 14:02:04 +0000
Message-Id: <20240113140206.2383133-2-joshua@froggi.es>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 8bit

We need to bump the karma of the drm_sched job in order for the context
that we just recovered to get correct feedback that it is guilty of
hanging.

Without this feedback, the application may keep pushing through the soft
recoveries, continually hanging the system with jobs that timeout.

There is an accompanying Mesa/RADV patch here
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27050
to properly handle device loss state when VRAM is not lost.

With these, I was able to run Counter-Strike 2 and launch an application
which can fault the GPU in a variety of ways, and still have Steam +
Counter-Strike 2 + Gamescope (compositor) stay up and continue
functioning on Steam Deck.

Signed-off-by: Joshua Ashton <joshua@froggi.es>

Cc: Friedrich Vock <friedrich.vock@gmx.de>
Cc: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Cc: Christian König <christian.koenig@amd.com>
Cc: André Almeida <andrealmeid@igalia.com>
Cc: stable@vger.kernel.org
Tested-by: Friedrich Vock <friedrich.vock@gmx.de>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 25209ce54552..e87cafb5b1c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -448,6 +448,8 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, struct amdgpu_job *job)
 		dma_fence_set_error(fence, -ENODATA);
 	spin_unlock_irqrestore(fence->lock, flags);
 
+	if (job->vm)
+		drm_sched_increase_karma(&job->base);
 	atomic_inc(&ring->adev->gpu_reset_counter);
 	while (!dma_fence_is_signaled(fence) &&
 	       ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0)
-- 
2.43.0