// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>

#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_error.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

struct async_event {
	struct amdxdna_dev_hdl *ndev;
	struct async_event_msg_resp resp;
	struct workqueue_struct *wq;
	struct work_struct work;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
};

struct async_events {
	struct workqueue_struct *wq;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
	u32 event_cnt;
	struct async_event event[] __counted_by(event_cnt);
};

/*
 * The enum, structs, and lookup tables below are ported from the XAIE
 * util header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages coming from the device.
 */

enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
	AIE_UNKNOWN_MOD,
};

enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE; added for better categorization */
	AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
	__u8 row;
	__u8 col;
	__u32 mod_type;
	__u8 event_id;
};

struct aie_err_info {
	u32 err_cnt;
	u32 ret_code;
	u32 rsvd;
	struct aie_error payload[] __counted_by(err_cnt);
};

struct aie_event_category {
	u8 event_id;
	enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U, AIE_ERROR_ECC),
	EVENT_CATEGORY(90U, AIE_ERROR_ECC),
	EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U, AIE_ERROR_DMA),
	EVENT_CATEGORY(98U, AIE_ERROR_DMA),
	EVENT_CATEGORY(99U, AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

static const enum amdxdna_error_num aie_cat_err_num_map[] = {
	[AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
	[AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
	[AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
	[AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
	[AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
	[AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
	[AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
	[AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
	[AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
	[AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
	[AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
};

static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);

static const enum amdxdna_error_module aie_err_mod_map[] = {
	[AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
	[AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
	[AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
	[AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
};

static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);

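/*
 * Map a device event ID to an error category. The lookup table is
 * selected by module type; for memory modules, row 1 uses the mem-tile
 * table and all other rows use the regular mem table.
 */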
static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
	const struct aie_event_category *lut;
	int num_entry;
	int i;

	switch (mod_type) {
	case AIE_PL_MOD:
		lut = aie_ml_shim_tile_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
		break;
	case AIE_CORE_MOD:
		lut = aie_ml_core_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
		break;
	case AIE_MEM_MOD:
		if (row == 1) {
			lut = aie_ml_mem_tile_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
		} else {
			lut = aie_ml_mem_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
		}
		break;
	default:
		return AIE_ERROR_UNKNOWN;
	}

	for (i = 0; i < num_entry; i++) {
		if (event_id != lut[i].event_id)
			continue;

		if (lut[i].category > AIE_ERROR_UNKNOWN)
			return AIE_ERROR_UNKNOWN;

		return lut[i].category;
	}

	return AIE_ERROR_UNKNOWN;
}

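/*
 * Cache the most recent error in ndev->last_async_err: encode the error
 * number and module, timestamp it, and record the failing row/column so
 * it can later be reported through aie2_get_array_async_error().
 */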
static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	enum amdxdna_error_module err_mod;
	enum aie_error_category aie_err;
	enum amdxdna_error_num err_num;
	struct aie_error *last_err;

	last_err = &errs[num_err - 1];
	if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
		err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
		err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
	} else {
		aie_err = aie_get_error_category(last_err->row,
						 last_err->event_id,
						 last_err->mod_type);
		err_num = aie_cat_err_num_map[aie_err];
		err_mod = aie_err_mod_map[last_err->mod_type];
	}

	ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
	ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
	ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
}

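/*
 * Log each reported error and collect the affected columns into a
 * bitmap. A zero return means no valid error column was seen.
 */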
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that the AIE has fewer than 32 columns */
	int i;

	/* Get err column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		err_col |= (1 << err->col);
	}

	return err_col;
}

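/*
 * Mailbox callback for an async event response. Snapshot the response
 * type and status from the mailbox I/O memory, then punt the actual
 * handling to the ordered workqueue.
 */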
static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status last, so no lock is needed here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	queue_work(e->wq, &e->work);
	return 0;
}

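/*
 * Arm (or re-arm) one async event slot: flush the CPU cache for the
 * event buffer so the device sees consistent data, then register the
 * buffer and callback with the firmware.
 */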
static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}

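/*
 * Workqueue handler for one async event. Validate the response, decode
 * and log the error payload, record the last error under dev_lock, and
 * re-arm the event slot with the firmware.
 */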
static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);

	xdna = e->ndev->xdna;

	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}
	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);

	/* Re-send this event to the firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}

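/*
 * Called with dev_lock held. The lock is dropped around
 * destroy_workqueue() because the error worker also takes dev_lock;
 * flushing the workqueue while holding it could deadlock.
 */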
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	aie2_free_msg_buffer(ndev, events->size, events->buf, events->addr);
	kfree(events);
}

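/*
 * Allocate one async event slot per device column, carve the shared
 * message buffer into per-slot chunks, and arm each slot with the
 * firmware. On failure, unwind everything allocated so far.
 */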
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 total_col = ndev->total_col;
	u32 total_size = ASYNC_BUF_SIZE * total_col;
	struct async_events *events;
	int i, ret;

	events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
	if (!events)
		return -ENOMEM;

	events->buf = aie2_alloc_msg_buffer(ndev, &total_size, &events->addr);
	if (!events->buf) {
		ret = -ENOMEM;
		goto free_events;
	}
	events->size = total_size;
	events->event_cnt = total_col;

	events->wq = alloc_ordered_workqueue("async_wq", 0);
	if (!events->wq) {
		ret = -ENOMEM;
		goto free_buf;
	}

	for (i = 0; i < events->event_cnt; i++) {
		struct async_event *e = &events->event[i];
		u32 offset = i * ASYNC_BUF_SIZE;

		e->ndev = ndev;
		e->wq = events->wq;
		e->buf = &events->buf[offset];
		e->addr = events->addr + offset;
		e->size = ASYNC_BUF_SIZE;
		e->resp.status = MAX_AIE2_STATUS_CODE;
		INIT_WORK(&e->work, aie2_error_worker);

		ret = aie2_error_event_send(e);
		if (ret)
			goto free_wq;
	}

	ndev->async_events = events;

	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
		 events->event_cnt, events->size);
	return 0;

free_wq:
	destroy_workqueue(events->wq);
free_buf:
	aie2_free_msg_buffer(ndev, events->size, events->buf, events->addr);
free_events:
	kfree(events);
	return ret;
}

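/*
 * Copy the last recorded async error to the user buffer as a single
 * array element. The caller must hold dev_lock.
 */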
int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
{
	struct amdxdna_dev *xdna = ndev->xdna;

	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));

	args->num_element = 1;
	args->element_size = sizeof(ndev->last_async_err);
	if (copy_to_user(u64_to_user_ptr(args->buffer),
			 &ndev->last_async_err, args->element_size))
		return -EFAULT;

	return 0;
}