// SPDX-License-Identifier: MIT
/*
 * Copyright 2025 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include "ras.h"
#include "ras_process.h"

/* The poison event FIFO holds up to 128 ras_event_req records. */
#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req))

/* Interval, in milliseconds, between ECC polling passes of the process thread. */
#define RAS_POLLING_ECC_TIMEOUT  300

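/*
 * Push one event record into the poison event FIFO.
 * Returns 0 on success, -ENOSPC if the FIFO cannot take a full record.
 */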
static int ras_process_put_event(struct ras_core_context *ras_core,
		struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = kfifo_in_spinlocked(&ras_proc->event_fifo,
			req, sizeof(*req), &ras_proc->fifo_spinlock);
	if (ret != sizeof(*req)) {
		/* kfifo_in_spinlocked() returns the number of bytes copied;
		 * anything short of a full record means the FIFO is full.
		 */
		RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
		return -ENOSPC;
	}

	return 0;
}

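/* Queue a GPU reset request (e.g. GPU_RESET_CAUSE_RMA) as a standalone event. */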
static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
			uint32_t reset_cause)
{
	struct ras_event_req req = {0};

	req.reset = reset_cause;

	return ras_process_put_event(ras_core, &req);
}

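/*
 * Pop one event record from the FIFO. Returns the number of bytes
 * copied out, i.e. 0 when the FIFO is empty.
 */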
static int ras_process_get_event(struct ras_core_context *ras_core,
		struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	return kfifo_out_spinlocked(&ras_proc->event_fifo,
				req, sizeof(*req), &ras_proc->fifo_spinlock);
}

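/* Drain and discard every pending event record. */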
static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
{
	struct ras_event_req req;
	int ret;

	do {
		ret = ras_process_get_event(ras_core, &req);
	} while (ret);
}

/* Maximum number of 1 ms sleeps to wait for UMC error data to become ready. */
#define AMDGPU_RAS_WAITING_DATA_READY  200
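/*
 * Poll the UMC block until at least event_count new deferred errors have
 * been observed, or the data-ready wait window expires. If deferred errors
 * were seen and the GPU is flagged as RMA, queue a reset request.
 */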
static int ras_process_umc_event(struct ras_core_context *ras_core,
				uint32_t event_count)
{
	struct ras_ecc_count ecc_data;
	int ret = 0;
	uint32_t timeout = 0;
	uint32_t detected_de_count = 0;

	do {
		memset(&ecc_data, 0, sizeof(ecc_data));
		ret = ras_core_update_ecc_info(ras_core);
		if (ret)
			return ret;

		ret = ras_core_query_block_ecc_data(ras_core, RAS_BLOCK_ID__UMC, &ecc_data);
		if (ret)
			return ret;

		if (ecc_data.new_de_count) {
			/* Progress was made; rearm the wait window. */
			detected_de_count += ecc_data.new_de_count;
			timeout = 0;
		} else {
			/* No new deferred errors yet; if interrupts are
			 * pending, poll in 1 ms steps until the data shows
			 * up or the wait window runs out.
			 */
			if (!timeout && event_count)
				timeout = AMDGPU_RAS_WAITING_DATA_READY;

			if (timeout) {
				if (!--timeout)
					break;

				msleep(1);
			}
		}
	} while (detected_de_count < event_count);

	if (detected_de_count && ras_core_gpu_is_rma(ras_core))
		ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);

	return 0;
}

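/*
 * Drain queued poison-consumption events, notify listeners, and issue a
 * single combined GPU reset request if any event asked for one.
 */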
static int ras_process_non_umc_event(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	struct ras_event_req req;
	uint32_t event_count = kfifo_len(&ras_proc->event_fifo);
	uint32_t reset_flags = 0;
	int ret = 0, i;

	for (i = 0; i < event_count; i++) {
		memset(&req, 0, sizeof(req));
		/* Don't let the kfifo byte count leak into the return value. */
		if (!ras_process_get_event(ras_core, &req))
			continue;

		ras_core_event_notify(ras_core,
			RAS_EVENT_ID__POISON_CONSUMPTION, &req);

		reset_flags |= req.reset;

		if (req.reset == GPU_RESET_CAUSE_RMA)
			continue;

		if (req.reset)
			RAS_DEV_INFO(ras_core->dev,
				"{%llu} GPU reset for %s RAS poison consumption is issued!\n",
				req.seqno, ras_core_get_ras_block_name(req.block));
		else
			RAS_DEV_INFO(ras_core->dev,
				"{%llu} %s RAS poison consumption is issued!\n",
				req.seqno, ras_core_get_ras_block_name(req.block));
	}

	if (reset_flags) {
		ret = ras_core_event_notify(ras_core,
				RAS_EVENT_ID__RESET_GPU, &reset_flags);
		if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
			return -RAS_CORE_GPU_IN_MODE1_RESET;
	}

	return ret;
}

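/*
 * Main RAS event handler: process pending UMC interrupts first, then any
 * queued non-UMC poison events. On a mode-1 reset the pending event state
 * is discarded.
 */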
int ras_process_handle_ras_event(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	uint32_t umc_event_count;
	int ret;

	ret = ras_core_event_notify(ras_core,
			RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
	if (ret)
		return ret;

	ras_aca_clear_fatal_flag(ras_core);
	ras_umc_log_pending_bad_bank(ras_core);

	do {
		umc_event_count = atomic_read(&ras_proc->umc_interrupt_count);
		ret = ras_process_umc_event(ras_core, umc_event_count);
		if (ret == -RAS_CORE_GPU_IN_MODE1_RESET)
			break;

		if (umc_event_count)
			atomic_sub(umc_event_count, &ras_proc->umc_interrupt_count);
	} while (atomic_read(&ras_proc->umc_interrupt_count));

	if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) &&
			kfifo_len(&ras_proc->event_fifo))
		ret = ras_process_non_umc_event(ras_core);

	if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) {
		/* Clear poison fifo */
		ras_process_clear_event_fifo(ras_core);
		atomic_set(&ras_proc->umc_interrupt_count, 0);
	}

	ras_core_event_notify(ras_core,
			RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
	return ret;
}

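/* Wake-up condition for the process thread's interruptible wait. */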
static int thread_wait_condition(void *param)
{
	struct ras_process *ras_proc = (struct ras_process *)param;

	return (kthread_should_stop() ||
		atomic_read(&ras_proc->ras_interrupt_req));
}

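/*
 * Worker thread: wakes on interrupt requests or every
 * RAS_POLLING_ECC_TIMEOUT ms, then dispatches RAS event handling either
 * through the system-specific async hook or the default handler.
 */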
static int ras_process_thread(void *context)
{
	struct ras_core_context *ras_core = (struct ras_core_context *)context;
	struct ras_process *ras_proc = &ras_core->ras_proc;

	while (!kthread_should_stop()) {
		ras_wait_event_interruptible_timeout(&ras_proc->ras_process_wq,
			thread_wait_condition, ras_proc,
			msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));

		if (kthread_should_stop())
			break;

		if (!ras_core->is_initialized)
			continue;

		atomic_set(&ras_proc->ras_interrupt_req, 0);

		if (ras_core_gpu_in_reset(ras_core))
			continue;

		if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event)
			ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
		else
			ras_process_handle_ras_event(ras_core);
	}

	return 0;
}

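/* Allocate the event FIFO and start the RAS process thread. */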
int ras_process_init(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
	if (ret)
		return ret;

	spin_lock_init(&ras_proc->fifo_spinlock);

	init_waitqueue_head(&ras_proc->ras_process_wq);

	ras_proc->ras_process_thread = kthread_run(ras_process_thread,
							(void *)ras_core, "ras_process_thread");
	if (IS_ERR(ras_proc->ras_process_thread)) {
		/* kthread_run() returns an ERR_PTR on failure, never NULL. */
		RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
		ret = PTR_ERR(ras_proc->ras_process_thread);
		ras_proc->ras_process_thread = NULL;
		goto err;
	}

	return 0;

err:
	ras_process_fini(ras_core);
	return ret;
}

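/* Stop the process thread and release the event FIFO. */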
int ras_process_fini(struct ras_core_context *ras_core)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	if (ras_proc->ras_process_thread) {
		kthread_stop(ras_proc->ras_process_thread);
		ras_proc->ras_process_thread = NULL;
	}

	kfifo_free(&ras_proc->event_fifo);

	return 0;
}

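/*
 * UMC interrupts are only counted here; the error data is collected later
 * by the polling loop in ras_process_umc_event(), so the req payload is
 * unused and the record is not queued.
 */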
static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
			struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;

	atomic_inc(&ras_proc->umc_interrupt_count);
	atomic_inc(&ras_proc->ras_interrupt_req);

	wake_up(&ras_proc->ras_process_wq);
	return 0;
}

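/* Queue a non-UMC event record and kick the process thread. */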
static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
		struct ras_event_req *req)
{
	struct ras_process *ras_proc = &ras_core->ras_proc;
	int ret;

	ret = ras_process_put_event(ras_core, req);
	if (!ret) {
		atomic_inc(&ras_proc->ras_interrupt_req);
		wake_up(&ras_proc->ras_process_wq);
	}

	return ret;
}

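/*
 * Entry point for interrupt handlers to report a RAS event. UMC events
 * are counted for later polling; everything else is queued directly.
 */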
int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
	struct ras_event_req *req, bool is_umc)
{
	int ret;

	if (!ras_core)
		return -EINVAL;

	if (!ras_core->is_initialized)
		return -EPERM;

	if (is_umc)
		ret = ras_process_add_umc_interrupt_req(ras_core, req);
	else
		ret = ras_process_add_non_umc_interrupt_req(ras_core, req);

	return ret;
}