// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>

#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_error.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

struct async_event {
	struct amdxdna_dev_hdl *ndev;
	struct async_event_msg_resp resp;
	struct workqueue_struct *wq;
	struct work_struct work;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
};

struct async_events {
	struct workqueue_struct *wq;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
	u32 event_cnt;
	struct async_event event[] __counted_by(event_cnt);
};

/*
 * The enum, struct and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages from the device.
 */

enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
	AIE_UNKNOWN_MOD,
};

enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE, added for better categorization */
	AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
	__u8 row;
	__u8 col;
	__u32 mod_type;
	__u8 event_id;
};

struct aie_err_info {
	u32 err_cnt;
	u32 ret_code;
	u32 rsvd;
	struct aie_error payload[] __counted_by(err_cnt);
};

struct aie_event_category {
	u8 event_id;
	enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U, AIE_ERROR_ECC),
	EVENT_CATEGORY(90U, AIE_ERROR_ECC),
	EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U, AIE_ERROR_DMA),
	EVENT_CATEGORY(98U, AIE_ERROR_DMA),
	EVENT_CATEGORY(99U, AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

static const enum amdxdna_error_num aie_cat_err_num_map[] = {
	[AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
	[AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
	[AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
	[AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
	[AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
	[AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
	[AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
	[AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
	[AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
	[AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
	[AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
};

static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);

static const enum amdxdna_error_module aie_err_mod_map[] = {
	[AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
	[AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
	[AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
	[AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
};

static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
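/*
 * Map a reported event ID to an error category. The lookup table is picked
 * by module type; for memory modules, row 1 selects the mem-tile table and
 * all other rows the core-tile memory table. Event IDs not found in the
 * selected table decode as AIE_ERROR_UNKNOWN.
 */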
static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
	const struct aie_event_category *lut;
	int num_entry;
	int i;

	switch (mod_type) {
	case AIE_PL_MOD:
		lut = aie_ml_shim_tile_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
		break;
	case AIE_CORE_MOD:
		lut = aie_ml_core_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
		break;
	case AIE_MEM_MOD:
		if (row == 1) {
			lut = aie_ml_mem_tile_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
		} else {
			lut = aie_ml_mem_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
		}
		break;
	default:
		return AIE_ERROR_UNKNOWN;
	}

	for (i = 0; i < num_entry; i++) {
		if (event_id != lut[i].event_id)
			continue;

		if (lut[i].category > AIE_ERROR_UNKNOWN)
			return AIE_ERROR_UNKNOWN;

		return lut[i].category;
	}

	return AIE_ERROR_UNKNOWN;
}

static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	enum amdxdna_error_module err_mod;
	enum aie_error_category aie_err;
	enum amdxdna_error_num err_num;
	struct aie_error *last_err;

	last_err = &errs[num_err - 1];
	if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
		err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
		err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
	} else {
		aie_err = aie_get_error_category(last_err->row,
						 last_err->event_id,
						 last_err->mod_type);
		err_num = aie_cat_err_num_map[aie_err];
		err_mod = aie_err_mod_map[last_err->mod_type];
	}

	ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
	ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
	ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
}
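/*
 * Log every reported error and collect the affected columns into a bitmap
 * (bit N set means column N reported an error). Returns 0 if no valid
 * error column was found.
 */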
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has at most 32 columns */
	int i;

	/* Get err column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		err_col |= (1 << err->col);
	}

	return err_col;
}

static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status last, so no locking is needed here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	queue_work(e->wq, &e->work);
	return 0;
}

static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}

static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);

	xdna = e->ndev->xdna;

	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}
	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);

	/* Re-send this event to firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}

void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	/* The error worker takes dev_lock, so drop it while draining the workqueue */
	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
	kfree(events);
}
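/*
 * Allocate one ASYNC_BUF_SIZE slot per device column in a single
 * non-coherent DMA buffer and register each slot with firmware as an async
 * event message buffer. All events share one ordered workqueue.
 */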
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 total_col = ndev->total_col;
	u32 total_size = ASYNC_BUF_SIZE * total_col;
	struct async_events *events;
	int i, ret;

	events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
	if (!events)
		return -ENOMEM;

	events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
					    DMA_FROM_DEVICE, GFP_KERNEL);
	if (!events->buf) {
		ret = -ENOMEM;
		goto free_events;
	}
	events->size = total_size;
	events->event_cnt = total_col;

	events->wq = alloc_ordered_workqueue("async_wq", 0);
	if (!events->wq) {
		ret = -ENOMEM;
		goto free_buf;
	}

	for (i = 0; i < events->event_cnt; i++) {
		struct async_event *e = &events->event[i];
		u32 offset = i * ASYNC_BUF_SIZE;

		e->ndev = ndev;
		e->wq = events->wq;
		e->buf = &events->buf[offset];
		e->addr = events->addr + offset;
		e->size = ASYNC_BUF_SIZE;
		e->resp.status = MAX_AIE2_STATUS_CODE;
		INIT_WORK(&e->work, aie2_error_worker);

		ret = aie2_error_event_send(e);
		if (ret)
			goto free_wq;
	}

	ndev->async_events = events;

	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
		 events->event_cnt, events->size);
	return 0;

free_wq:
	destroy_workqueue(events->wq);
free_buf:
	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
free_events:
	kfree(events);
	return ret;
}

int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
{
	struct amdxdna_dev *xdna = ndev->xdna;

	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));

	args->num_element = 1;
	args->element_size = sizeof(ndev->last_async_err);
	if (copy_to_user(u64_to_user_ptr(args->buffer),
			 &ndev->last_async_err, args->element_size))
		return -EFAULT;

	return 0;
}