xref: /linux/drivers/accel/amdxdna/aie2_error.c (revision 2c1ed907520c50326b8f604907a8478b27881a2e)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>

#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

struct async_event {
	struct amdxdna_dev_hdl		*ndev;
	struct async_event_msg_resp	resp;
	struct workqueue_struct		*wq;
	struct work_struct		work;
	u8				*buf;
	dma_addr_t			addr;
	u32				size;
};

struct async_events {
	struct workqueue_struct		*wq;
	u8				*buf;
	dma_addr_t			addr;
	u32				size;
	u32				event_cnt;
	struct async_event		event[] __counted_by(event_cnt);
};

/*
 * The enum, struct and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages reported by the device.
 */

enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
};

enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE, added for better categorization */
	AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
	__u8			row;
	__u8			col;
	__u32			mod_type;
	__u8			event_id;
};

struct aie_err_info {
	u32			err_cnt;
	u32			ret_code;
	u32			rsvd;
	struct aie_error	payload[] __counted_by(err_cnt);
};

struct aie_event_category {
	u8			event_id;
	enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U,  AIE_ERROR_ECC),
	EVENT_CATEGORY(90U,  AIE_ERROR_ECC),
	EVENT_CATEGORY(91U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U,  AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U,  AIE_ERROR_DMA),
	EVENT_CATEGORY(98U,  AIE_ERROR_DMA),
	EVENT_CATEGORY(99U,  AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

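/*
 * Map one reported event to a coarse error category using the lookup
 * tables above. For the memory module, row 1 selects the mem-tile table,
 * other rows use the regular memory table. Events that are not listed
 * fall back to AIE_ERROR_UNKNOWN.
 */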
static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
	const struct aie_event_category *lut;
	int num_entry;
	int i;

	switch (mod_type) {
	case AIE_PL_MOD:
		lut = aie_ml_shim_tile_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
		break;
	case AIE_CORE_MOD:
		lut = aie_ml_core_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
		break;
	case AIE_MEM_MOD:
		if (row == 1) {
			lut = aie_ml_mem_tile_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
		} else {
			lut = aie_ml_mem_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
		}
		break;
	default:
		return AIE_ERROR_UNKNOWN;
	}

	for (i = 0; i < num_entry; i++) {
		if (event_id != lut[i].event_id)
			continue;

		return lut[i].category;
	}

	return AIE_ERROR_UNKNOWN;
}

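/*
 * Log every error reported by the firmware and fold them into a bitmap of
 * affected columns, one bit per column. Columns of 32 or above are treated
 * as invalid and stop the scan.
 */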
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has fewer than 32 columns */
	int i;

	/* Build the error column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		err_col |= (1 << err->col);
	}

	return err_col;
}

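/*
 * Callback invoked when the firmware responds to the registered async
 * event message. Only copy the response and kick the worker here; decoding
 * happens in process context. The wmb() makes sure the type field is
 * visible before the status update.
 */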
static int aie2_error_async_cb(void *handle, const u32 *data, size_t size)
{
	struct async_event_msg_resp *resp;
	struct async_event *e = handle;

	if (data) {
		resp = (struct async_event_msg_resp *)data;
		e->resp.type = resp->type;
		wmb(); /* Update status last, so no locking is needed here */
		e->resp.status = resp->status;
	}
	queue_work(e->wq, &e->work);
	return 0;
}

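/* Flush the event buffer from CPU caches, then (re)register it with the firmware. */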
static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}

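/*
 * Work handler: decode the error info the firmware wrote into the event
 * buffer, log each error, then re-arm the event so further errors are
 * still delivered. resp.status doubles as the "handled" marker; it is
 * reset to MAX_AIE2_STATUS_CODE once the event has been consumed.
 */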
static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);

	xdna = e->ndev->xdna;

	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}
	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	/* Re-send this event to firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}

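/*
 * Register every per-column async error event with the firmware.
 * The caller must hold dev_lock.
 */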
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_event *e;
	int i, ret;

	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
	for (i = 0; i < ndev->async_events->event_cnt; i++) {
		e = &ndev->async_events->event[i];
		ret = aie2_error_event_send(e);
		if (ret)
			return ret;
	}

	return 0;
}

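/*
 * dev_lock is dropped around destroy_workqueue() because the error worker
 * takes dev_lock itself; draining it with the lock held could deadlock.
 */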
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
	kfree(events);
}

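/*
 * Allocate one async event per column. All events share one non-coherent
 * DMA buffer, split into ASYNC_BUF_SIZE slices, which the firmware fills
 * with error information.
 */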
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 total_col = ndev->total_col;
	u32 total_size = ASYNC_BUF_SIZE * total_col;
	struct async_events *events;
	int i, ret;

	events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
	if (!events)
		return -ENOMEM;

	events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
					    DMA_FROM_DEVICE, GFP_KERNEL);
	if (!events->buf) {
		ret = -ENOMEM;
		goto free_events;
	}
	events->size = total_size;
	events->event_cnt = total_col;

	events->wq = alloc_ordered_workqueue("async_wq", 0);
	if (!events->wq) {
		ret = -ENOMEM;
		goto free_buf;
	}

	for (i = 0; i < events->event_cnt; i++) {
		struct async_event *e = &events->event[i];
		u32 offset = i * ASYNC_BUF_SIZE;

		e->ndev = ndev;
		e->wq = events->wq;
		e->buf = &events->buf[offset];
		e->addr = events->addr + offset;
		e->size = ASYNC_BUF_SIZE;
		e->resp.status = MAX_AIE2_STATUS_CODE;
		INIT_WORK(&e->work, aie2_error_worker);
	}

	ndev->async_events = events;

	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
		 events->event_cnt, events->size);
	return 0;

free_buf:
	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
free_events:
	kfree(events);
	return ret;
}
361