1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 4 */ 5 6 #include <drm/drm_cache.h> 7 #include <drm/drm_device.h> 8 #include <drm/drm_print.h> 9 #include <drm/gpu_scheduler.h> 10 #include <linux/dma-mapping.h> 11 #include <linux/kthread.h> 12 #include <linux/kernel.h> 13 14 #include "aie.h" 15 #include "aie2_msg_priv.h" 16 #include "aie2_pci.h" 17 #include "amdxdna_error.h" 18 #include "amdxdna_mailbox.h" 19 #include "amdxdna_pci_drv.h" 20 21 struct async_event { 22 struct amdxdna_dev_hdl *ndev; 23 struct async_event_msg_resp resp; 24 struct workqueue_struct *wq; 25 struct work_struct work; 26 u8 *buf; 27 dma_addr_t addr; 28 u32 size; 29 }; 30 31 struct async_events { 32 struct workqueue_struct *wq; 33 u8 *buf; 34 dma_addr_t addr; 35 u32 size; 36 u32 event_cnt; 37 struct async_event event[] __counted_by(event_cnt); 38 }; 39 40 /* 41 * Below enum, struct and lookup tables are porting from XAIE util header file. 42 * 43 * Below data is defined by AIE device and it is used for decode error message 44 * from the device. 45 */ 46 47 enum aie_module_type { 48 AIE_MEM_MOD = 0, 49 AIE_CORE_MOD, 50 AIE_PL_MOD, 51 AIE_UNKNOWN_MOD, 52 }; 53 54 enum aie_error_category { 55 AIE_ERROR_SATURATION = 0, 56 AIE_ERROR_FP, 57 AIE_ERROR_STREAM, 58 AIE_ERROR_ACCESS, 59 AIE_ERROR_BUS, 60 AIE_ERROR_INSTRUCTION, 61 AIE_ERROR_ECC, 62 AIE_ERROR_LOCK, 63 AIE_ERROR_DMA, 64 AIE_ERROR_MEM_PARITY, 65 /* Unknown is not from XAIE, added for better category */ 66 AIE_ERROR_UNKNOWN, 67 }; 68 69 /* Don't pack, unless XAIE side changed */ 70 struct aie_error { 71 __u8 row; 72 __u8 col; 73 __u32 mod_type; 74 __u8 event_id; 75 }; 76 77 struct aie_err_info { 78 u32 err_cnt; 79 u32 ret_code; 80 u32 rsvd; 81 struct aie_error payload[] __counted_by(err_cnt); 82 }; 83 84 struct aie_event_category { 85 u8 event_id; 86 enum aie_error_category category; 87 }; 88 89 #define EVENT_CATEGORY(id, cat) { id, cat } 90 static const struct aie_event_category aie_ml_mem_event_cat[] = { 91 EVENT_CATEGORY(88U, AIE_ERROR_ECC), 92 EVENT_CATEGORY(90U, AIE_ERROR_ECC), 93 EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY), 94 EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY), 95 EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY), 96 EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY), 97 EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY), 98 EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY), 99 EVENT_CATEGORY(97U, AIE_ERROR_DMA), 100 EVENT_CATEGORY(98U, AIE_ERROR_DMA), 101 EVENT_CATEGORY(99U, AIE_ERROR_DMA), 102 EVENT_CATEGORY(100U, AIE_ERROR_DMA), 103 EVENT_CATEGORY(101U, AIE_ERROR_LOCK), 104 }; 105 106 static const struct aie_event_category aie_ml_core_event_cat[] = { 107 EVENT_CATEGORY(55U, AIE_ERROR_ACCESS), 108 EVENT_CATEGORY(56U, AIE_ERROR_STREAM), 109 EVENT_CATEGORY(57U, AIE_ERROR_STREAM), 110 EVENT_CATEGORY(58U, AIE_ERROR_BUS), 111 EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION), 112 EVENT_CATEGORY(60U, AIE_ERROR_ACCESS), 113 EVENT_CATEGORY(62U, AIE_ERROR_ECC), 114 EVENT_CATEGORY(64U, AIE_ERROR_ECC), 115 EVENT_CATEGORY(65U, AIE_ERROR_ACCESS), 116 EVENT_CATEGORY(66U, AIE_ERROR_ACCESS), 117 EVENT_CATEGORY(67U, AIE_ERROR_LOCK), 118 EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION), 119 EVENT_CATEGORY(71U, AIE_ERROR_STREAM), 120 EVENT_CATEGORY(72U, AIE_ERROR_BUS), 121 }; 122 123 static const struct aie_event_category aie_ml_mem_tile_event_cat[] = { 124 EVENT_CATEGORY(130U, AIE_ERROR_ECC), 125 EVENT_CATEGORY(132U, AIE_ERROR_ECC), 126 EVENT_CATEGORY(133U, AIE_ERROR_DMA), 127 EVENT_CATEGORY(134U, AIE_ERROR_DMA), 128 EVENT_CATEGORY(135U, AIE_ERROR_STREAM), 129 EVENT_CATEGORY(136U, AIE_ERROR_STREAM), 130 EVENT_CATEGORY(137U, AIE_ERROR_STREAM), 131 EVENT_CATEGORY(138U, AIE_ERROR_BUS), 132 EVENT_CATEGORY(139U, AIE_ERROR_LOCK), 133 }; 134 135 static const struct aie_event_category aie_ml_shim_tile_event_cat[] = { 136 EVENT_CATEGORY(64U, AIE_ERROR_BUS), 137 EVENT_CATEGORY(65U, AIE_ERROR_STREAM), 138 EVENT_CATEGORY(66U, AIE_ERROR_STREAM), 139 EVENT_CATEGORY(67U, AIE_ERROR_BUS), 140 EVENT_CATEGORY(68U, AIE_ERROR_BUS), 141 EVENT_CATEGORY(69U, AIE_ERROR_BUS), 142 EVENT_CATEGORY(70U, AIE_ERROR_BUS), 143 EVENT_CATEGORY(71U, AIE_ERROR_BUS), 144 EVENT_CATEGORY(72U, AIE_ERROR_DMA), 145 EVENT_CATEGORY(73U, AIE_ERROR_DMA), 146 EVENT_CATEGORY(74U, AIE_ERROR_LOCK), 147 }; 148 149 static const enum amdxdna_error_num aie_cat_err_num_map[] = { 150 [AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION, 151 [AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP, 152 [AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM, 153 [AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS, 154 [AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS, 155 [AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION, 156 [AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC, 157 [AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK, 158 [AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA, 159 [AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY, 160 [AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN, 161 }; 162 163 static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1); 164 165 static const enum amdxdna_error_module aie_err_mod_map[] = { 166 [AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY, 167 [AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE, 168 [AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL, 169 [AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN, 170 }; 171 172 static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1); 173 174 static enum aie_error_category 175 aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) 176 { 177 const struct aie_event_category *lut; 178 int num_entry; 179 int i; 180 181 switch (mod_type) { 182 case AIE_PL_MOD: 183 lut = aie_ml_shim_tile_event_cat; 184 num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat); 185 break; 186 case AIE_CORE_MOD: 187 lut = aie_ml_core_event_cat; 188 num_entry = ARRAY_SIZE(aie_ml_core_event_cat); 189 break; 190 case AIE_MEM_MOD: 191 if (row == 1) { 192 lut = aie_ml_mem_tile_event_cat; 193 num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat); 194 } else { 195 lut = aie_ml_mem_event_cat; 196 num_entry = ARRAY_SIZE(aie_ml_mem_event_cat); 197 } 198 break; 199 default: 200 return AIE_ERROR_UNKNOWN; 201 } 202 203 for (i = 0; i < num_entry; i++) { 204 if (event_id != lut[i].event_id) 205 continue; 206 207 if (lut[i].category > AIE_ERROR_UNKNOWN) 208 return AIE_ERROR_UNKNOWN; 209 210 return lut[i].category; 211 } 212 213 return AIE_ERROR_UNKNOWN; 214 } 215 216 static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) 217 { 218 struct aie_error *errs = err_info; 219 enum amdxdna_error_module err_mod; 220 enum aie_error_category aie_err; 221 enum amdxdna_error_num err_num; 222 struct aie_error *last_err; 223 224 last_err = &errs[num_err - 1]; 225 if (last_err->mod_type >= AIE_UNKNOWN_MOD) { 226 err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN]; 227 err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD]; 228 } else { 229 aie_err = aie_get_error_category(last_err->row, 230 last_err->event_id, 231 last_err->mod_type); 232 err_num = aie_cat_err_num_map[aie_err]; 233 err_mod = aie_err_mod_map[last_err->mod_type]; 234 } 235 236 ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod); 237 ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real()); 238 ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col); 239 } 240 241 static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) 242 { 243 struct aie_error *errs = err_info; 244 u32 err_col = 0; /* assume that AIE has less than 32 columns */ 245 int i; 246 247 /* Get err column bitmap */ 248 for (i = 0; i < num_err; i++) { 249 struct aie_error *err = &errs[i]; 250 enum aie_error_category cat; 251 252 cat = aie_get_error_category(err->row, err->event_id, err->mod_type); 253 XDNA_ERR(ndev->aie.xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d", 254 err->row, err->col, err->mod_type, 255 err->event_id, cat); 256 257 if (err->col >= 32) { 258 XDNA_WARN(ndev->aie.xdna, "Invalid column number"); 259 break; 260 } 261 262 err_col |= (1 << err->col); 263 } 264 265 return err_col; 266 } 267 268 static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size) 269 { 270 struct async_event *e = handle; 271 272 if (data) { 273 e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type)); 274 wmb(); /* Update status in the end, so that no lock for here */ 275 e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status)); 276 } 277 queue_work(e->wq, &e->work); 278 return 0; 279 } 280 281 static int aie2_error_event_send(struct async_event *e) 282 { 283 drm_clflush_virt_range(e->buf, e->size); /* device can access */ 284 return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e, 285 aie2_error_async_cb); 286 } 287 288 static void aie2_error_worker(struct work_struct *err_work) 289 { 290 struct aie_err_info *info; 291 struct amdxdna_dev *xdna; 292 struct async_event *e; 293 u32 max_err; 294 u32 err_col; 295 296 e = container_of(err_work, struct async_event, work); 297 298 xdna = e->ndev->aie.xdna; 299 300 if (e->resp.status == MAX_AIE2_STATUS_CODE) 301 return; 302 303 e->resp.status = MAX_AIE2_STATUS_CODE; 304 305 print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4, 306 e->buf, 0x100, false); 307 308 info = (struct aie_err_info *)e->buf; 309 XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code); 310 311 max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error); 312 if (unlikely(info->err_cnt > max_err)) { 313 WARN_ONCE(1, "Error count too large %d\n", info->err_cnt); 314 return; 315 } 316 err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt); 317 if (!err_col) { 318 XDNA_WARN(xdna, "Did not get error column"); 319 return; 320 } 321 322 mutex_lock(&xdna->dev_lock); 323 aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt); 324 325 /* Re-sent this event to firmware */ 326 if (aie2_error_event_send(e)) 327 XDNA_WARN(xdna, "Unable to register async event"); 328 mutex_unlock(&xdna->dev_lock); 329 } 330 331 void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev) 332 { 333 struct amdxdna_dev *xdna = ndev->aie.xdna; 334 struct async_events *events; 335 336 events = ndev->async_events; 337 338 mutex_unlock(&xdna->dev_lock); 339 destroy_workqueue(events->wq); 340 mutex_lock(&xdna->dev_lock); 341 342 amdxdna_free_msg_buffer(xdna, events->size, events->buf, events->addr); 343 kfree(events); 344 } 345 346 int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) 347 { 348 struct amdxdna_dev *xdna = ndev->aie.xdna; 349 u32 total_col = ndev->total_col; 350 u32 total_size = ASYNC_BUF_SIZE * total_col; 351 struct async_events *events; 352 int i, ret; 353 354 events = kzalloc_flex(*events, event, total_col); 355 if (!events) 356 return -ENOMEM; 357 358 events->buf = amdxdna_alloc_msg_buffer(xdna, &total_size, &events->addr); 359 if (IS_ERR(events->buf)) { 360 ret = PTR_ERR(events->buf); 361 goto free_events; 362 } 363 events->size = total_size; 364 events->event_cnt = total_col; 365 366 events->wq = alloc_ordered_workqueue("async_wq", 0); 367 if (!events->wq) { 368 ret = -ENOMEM; 369 goto free_buf; 370 } 371 372 for (i = 0; i < events->event_cnt; i++) { 373 struct async_event *e = &events->event[i]; 374 u32 offset = i * ASYNC_BUF_SIZE; 375 376 e->ndev = ndev; 377 e->wq = events->wq; 378 e->buf = &events->buf[offset]; 379 e->addr = events->addr + offset; 380 e->size = ASYNC_BUF_SIZE; 381 e->resp.status = MAX_AIE2_STATUS_CODE; 382 INIT_WORK(&e->work, aie2_error_worker); 383 384 ret = aie2_error_event_send(e); 385 if (ret) 386 goto free_wq; 387 } 388 389 ndev->async_events = events; 390 391 XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x", 392 events->event_cnt, events->size); 393 return 0; 394 395 free_wq: 396 destroy_workqueue(events->wq); 397 free_buf: 398 amdxdna_free_msg_buffer(xdna, events->size, events->buf, events->addr); 399 free_events: 400 kfree(events); 401 return ret; 402 } 403 404 int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args) 405 { 406 struct amdxdna_dev *xdna = ndev->aie.xdna; 407 408 drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); 409 410 if (!args->num_element) 411 return -EINVAL; 412 413 args->num_element = 1; 414 args->element_size = min(args->element_size, sizeof(ndev->last_async_err)); 415 if (copy_to_user(u64_to_user_ptr(args->buffer), 416 &ndev->last_async_err, args->element_size)) 417 return -EFAULT; 418 419 return 0; 420 } 421