xref: /linux/drivers/accel/amdxdna/aie2_error.c (revision c06b6cde2a1c3bcbb561bd57bb6f34eae9030921)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
4  */
5 
6 #include <drm/drm_cache.h>
7 #include <drm/drm_device.h>
8 #include <drm/drm_print.h>
9 #include <drm/gpu_scheduler.h>
10 #include <linux/dma-mapping.h>
11 #include <linux/kthread.h>
12 #include <linux/kernel.h>
13 
14 #include "aie.h"
15 #include "aie2_msg_priv.h"
16 #include "aie2_pci.h"
17 #include "amdxdna_error.h"
18 #include "amdxdna_mailbox.h"
19 #include "amdxdna_pci_drv.h"
20 
/*
 * One async event registration: the slice of the shared message buffer it
 * owns, the response fields copied from the mailbox, and the work item that
 * processes the response.
 */
struct async_event {
	struct amdxdna_dev_hdl		*ndev;	/* device handle this event belongs to */
	struct async_event_msg_resp	resp;	/* type/status filled in by aie2_error_async_cb() */
	struct workqueue_struct		*wq;	/* workqueue that @work is queued on */
	struct work_struct		work;	/* runs aie2_error_worker() */
	u8				*buf;	/* CPU address of this event's buffer slice */
	dma_addr_t			addr;	/* device address of the same slice */
	u32				size;	/* size of the slice in bytes */
};
30 
/*
 * Container for all async events of a device. The events share a single
 * message buffer and a single workqueue.
 */
struct async_events {
	struct workqueue_struct		*wq;	/* ordered workqueue for all event work items */
	u8				*buf;	/* CPU address of the shared message buffer */
	dma_addr_t			addr;	/* device address of @buf */
	u32				size;	/* total size of @buf in bytes */
	u32				event_cnt;	/* number of entries in @event */
	struct async_event		event[] __counted_by(event_cnt);
};
39 
/*
 * The enums, structs and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error messages
 * from the device.
 */
46 
/*
 * Module type carried in struct aie_error.mod_type. It selects which event
 * lookup table aie_get_error_category() consults and indexes aie_err_mod_map.
 */
enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
	AIE_UNKNOWN_MOD,
};
53 
/*
 * Error category an event ID is classified into. Values are used as indices
 * into aie_cat_err_num_map, so AIE_ERROR_UNKNOWN must stay the last entry.
 */
enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE, added for better category */
	AIE_ERROR_UNKNOWN,
};
68 
/*
 * One error record as found in the payload of struct aie_err_info.
 *
 * Don't pack, unless XAIE side changed
 */
struct aie_error {
	__u8			row;		/* row reported with the error */
	__u8			col;		/* column reported with the error */
	__u32			mod_type;	/* compared against enum aie_module_type */
	__u8			event_id;	/* looked up in the event category tables */
};
76 
/*
 * Layout that aie2_error_worker() reads from the start of an event buffer:
 * a fixed header followed by @err_cnt error records.
 */
struct aie_err_info {
	u32			err_cnt;	/* number of records in @payload */
	u32			ret_code;	/* only logged by aie2_error_worker() */
	u32			rsvd;
	struct aie_error	payload[] __counted_by(err_cnt);
};
83 
/* One lookup-table entry mapping a device event ID to an error category. */
struct aie_event_category {
	u8			event_id;
	enum aie_error_category category;
};
88 
/* Positional initializer for struct aie_event_category: { event_id, category } */
#define EVENT_CATEGORY(id, cat) { id, cat }
/*
 * Event categories used by aie_get_error_category() for AIE_MEM_MOD errors
 * whose row is not 1.
 */
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U,  AIE_ERROR_ECC),
	EVENT_CATEGORY(90U,  AIE_ERROR_ECC),
	EVENT_CATEGORY(91U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U,  AIE_ERROR_DMA),
	EVENT_CATEGORY(98U,  AIE_ERROR_DMA),
	EVENT_CATEGORY(99U,  AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};
105 
/* Event categories used by aie_get_error_category() for AIE_CORE_MOD errors. */
static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};
122 
/*
 * Event categories used by aie_get_error_category() for AIE_MEM_MOD errors
 * reported from row 1.
 */
static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};
134 
/* Event categories used by aie_get_error_category() for AIE_PL_MOD errors. */
static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};
148 
/*
 * Translation from enum aie_error_category (used as the array index) to the
 * amdxdna error number encoded into last_async_err.err_code.
 */
static const enum amdxdna_error_num aie_cat_err_num_map[] = {
	[AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
	[AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
	[AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
	[AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
	[AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
	[AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
	[AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
	[AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
	[AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
	[AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
	[AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
};

/* The table size must track the enum so every category can be used as an index */
static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);
164 
/*
 * Translation from enum aie_module_type (used as the array index) to the
 * amdxdna error module encoded into last_async_err.err_code.
 */
static const enum amdxdna_error_module aie_err_mod_map[] = {
	[AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
	[AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
	[AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
	[AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
};

/* The table size must track the enum so every module type can be used as an index */
static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
173 
174 static enum aie_error_category
175 aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
176 {
177 	const struct aie_event_category *lut;
178 	int num_entry;
179 	int i;
180 
181 	switch (mod_type) {
182 	case AIE_PL_MOD:
183 		lut = aie_ml_shim_tile_event_cat;
184 		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
185 		break;
186 	case AIE_CORE_MOD:
187 		lut = aie_ml_core_event_cat;
188 		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
189 		break;
190 	case AIE_MEM_MOD:
191 		if (row == 1) {
192 			lut = aie_ml_mem_tile_event_cat;
193 			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
194 		} else {
195 			lut = aie_ml_mem_event_cat;
196 			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
197 		}
198 		break;
199 	default:
200 		return AIE_ERROR_UNKNOWN;
201 	}
202 
203 	for (i = 0; i < num_entry; i++) {
204 		if (event_id != lut[i].event_id)
205 			continue;
206 
207 		if (lut[i].category > AIE_ERROR_UNKNOWN)
208 			return AIE_ERROR_UNKNOWN;
209 
210 		return lut[i].category;
211 	}
212 
213 	return AIE_ERROR_UNKNOWN;
214 }
215 
216 static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
217 {
218 	struct aie_error *errs = err_info;
219 	enum amdxdna_error_module err_mod;
220 	enum aie_error_category aie_err;
221 	enum amdxdna_error_num err_num;
222 	struct aie_error *last_err;
223 
224 	last_err = &errs[num_err - 1];
225 	if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
226 		err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
227 		err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
228 	} else {
229 		aie_err = aie_get_error_category(last_err->row,
230 						 last_err->event_id,
231 						 last_err->mod_type);
232 		err_num = aie_cat_err_num_map[aie_err];
233 		err_mod = aie_err_mod_map[last_err->mod_type];
234 	}
235 
236 	ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
237 	ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
238 	ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
239 }
240 
241 static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
242 {
243 	struct aie_error *errs = err_info;
244 	u32 err_col = 0; /* assume that AIE has less than 32 columns */
245 	int i;
246 
247 	/* Get err column bitmap */
248 	for (i = 0; i < num_err; i++) {
249 		struct aie_error *err = &errs[i];
250 		enum aie_error_category cat;
251 
252 		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
253 		XDNA_ERR(ndev->aie.xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
254 			 err->row, err->col, err->mod_type,
255 			 err->event_id, cat);
256 
257 		if (err->col >= 32) {
258 			XDNA_WARN(ndev->aie.xdna, "Invalid column number");
259 			break;
260 		}
261 
262 		err_col |= (1 << err->col);
263 	}
264 
265 	return err_col;
266 }
267 
/*
 * Callback passed to aie2_register_asyn_event_msg() for an async event.
 *
 * @handle: the struct async_event given at registration time
 * @data:   response located in I/O memory; skipped when NULL
 * @size:   not used by this callback
 *
 * Copies the response type and status into the event and queues the event's
 * work item on its workqueue. Always returns 0.
 */
static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status in the end, so that no lock for here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	/* Work is queued even without data; the worker checks resp.status itself */
	queue_work(e->wq, &e->work);
	return 0;
}
280 
281 static int aie2_error_event_send(struct async_event *e)
282 {
283 	drm_clflush_virt_range(e->buf, e->size); /* device can access */
284 	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
285 					    aie2_error_async_cb);
286 }
287 
288 static void aie2_error_worker(struct work_struct *err_work)
289 {
290 	struct aie_err_info *info;
291 	struct amdxdna_dev *xdna;
292 	struct async_event *e;
293 	u32 max_err;
294 	u32 err_col;
295 
296 	e = container_of(err_work, struct async_event, work);
297 
298 	xdna = e->ndev->aie.xdna;
299 
300 	if (e->resp.status == MAX_AIE2_STATUS_CODE)
301 		return;
302 
303 	e->resp.status = MAX_AIE2_STATUS_CODE;
304 
305 	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
306 			     e->buf, 0x100, false);
307 
308 	info = (struct aie_err_info *)e->buf;
309 	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);
310 
311 	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
312 	if (unlikely(info->err_cnt > max_err)) {
313 		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
314 		return;
315 	}
316 	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
317 	if (!err_col) {
318 		XDNA_WARN(xdna, "Did not get error column");
319 		return;
320 	}
321 
322 	mutex_lock(&xdna->dev_lock);
323 	aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);
324 
325 	/* Re-sent this event to firmware */
326 	if (aie2_error_event_send(e))
327 		XDNA_WARN(xdna, "Unable to register async event");
328 	mutex_unlock(&xdna->dev_lock);
329 }
330 
/*
 * Tear down the async events of a device: destroy the workqueue, free the
 * shared message buffer and free the container.
 *
 * Expects the caller to hold xdna->dev_lock; the lock is released around
 * destroy_workqueue() and taken again before the buffers are freed.
 */
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->aie.xdna;
	struct async_events *events;

	events = ndev->async_events;

	/*
	 * aie2_error_worker() takes dev_lock, so the lock is dropped while the
	 * workqueue is destroyed — presumably to avoid deadlocking against
	 * pending work; confirm against destroy_workqueue() semantics.
	 */
	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	amdxdna_free_msg_buffer(xdna, events->size, events->buf, events->addr);
	kfree(events);
}
345 
346 int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
347 {
348 	struct amdxdna_dev *xdna = ndev->aie.xdna;
349 	u32 total_col = ndev->total_col;
350 	u32 total_size = ASYNC_BUF_SIZE * total_col;
351 	struct async_events *events;
352 	int i, ret;
353 
354 	events = kzalloc_flex(*events, event, total_col);
355 	if (!events)
356 		return -ENOMEM;
357 
358 	events->buf = amdxdna_alloc_msg_buffer(xdna, &total_size, &events->addr);
359 	if (IS_ERR(events->buf)) {
360 		ret = PTR_ERR(events->buf);
361 		goto free_events;
362 	}
363 	events->size = total_size;
364 	events->event_cnt = total_col;
365 
366 	events->wq = alloc_ordered_workqueue("async_wq", 0);
367 	if (!events->wq) {
368 		ret = -ENOMEM;
369 		goto free_buf;
370 	}
371 
372 	for (i = 0; i < events->event_cnt; i++) {
373 		struct async_event *e = &events->event[i];
374 		u32 offset = i * ASYNC_BUF_SIZE;
375 
376 		e->ndev = ndev;
377 		e->wq = events->wq;
378 		e->buf = &events->buf[offset];
379 		e->addr = events->addr + offset;
380 		e->size = ASYNC_BUF_SIZE;
381 		e->resp.status = MAX_AIE2_STATUS_CODE;
382 		INIT_WORK(&e->work, aie2_error_worker);
383 
384 		ret = aie2_error_event_send(e);
385 		if (ret)
386 			goto free_wq;
387 	}
388 
389 	ndev->async_events = events;
390 
391 	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
392 		 events->event_cnt, events->size);
393 	return 0;
394 
395 free_wq:
396 	destroy_workqueue(events->wq);
397 free_buf:
398 	amdxdna_free_msg_buffer(xdna, events->size, events->buf, events->addr);
399 free_events:
400 	kfree(events);
401 	return ret;
402 }
403 
404 int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
405 {
406 	struct amdxdna_dev *xdna = ndev->aie.xdna;
407 
408 	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
409 
410 	if (!args->num_element)
411 		return -EINVAL;
412 
413 	args->num_element = 1;
414 	args->element_size = min(args->element_size, sizeof(ndev->last_async_err));
415 	if (copy_to_user(u64_to_user_ptr(args->buffer),
416 			 &ndev->last_async_err, args->element_size))
417 		return -EFAULT;
418 
419 	return 0;
420 }
421