// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
 */

#include <drm/drm_cache.h>
#include <drm/drm_device.h>
#include <drm/drm_print.h>
#include <drm/gpu_scheduler.h>
#include <linux/dma-mapping.h>
#include <linux/kthread.h>
#include <linux/kernel.h>

#include "aie2_msg_priv.h"
#include "aie2_pci.h"
#include "amdxdna_mailbox.h"
#include "amdxdna_pci_drv.h"

struct async_event {
	struct amdxdna_dev_hdl *ndev;
	struct async_event_msg_resp resp;
	struct workqueue_struct *wq;
	struct work_struct work;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
};

struct async_events {
	struct workqueue_struct *wq;
	u8 *buf;
	dma_addr_t addr;
	u32 size;
	u32 event_cnt;
	struct async_event event[] __counted_by(event_cnt);
};

/*
 * The enum, structs and lookup tables below are ported from the XAIE util
 * header file.
 *
 * This data is defined by the AIE device and is used to decode error
 * messages reported by the device.
 */

enum aie_module_type {
	AIE_MEM_MOD = 0,
	AIE_CORE_MOD,
	AIE_PL_MOD,
};

enum aie_error_category {
	AIE_ERROR_SATURATION = 0,
	AIE_ERROR_FP,
	AIE_ERROR_STREAM,
	AIE_ERROR_ACCESS,
	AIE_ERROR_BUS,
	AIE_ERROR_INSTRUCTION,
	AIE_ERROR_ECC,
	AIE_ERROR_LOCK,
	AIE_ERROR_DMA,
	AIE_ERROR_MEM_PARITY,
	/* Unknown is not from XAIE; added as a catch-all category */
	AIE_ERROR_UNKNOWN,
};

/* Don't pack, unless the XAIE side changes */
struct aie_error {
	__u8 row;
	__u8 col;
	__u32 mod_type;
	__u8 event_id;
};

struct aie_err_info {
	u32 err_cnt;
	u32 ret_code;
	u32 rsvd;
	struct aie_error payload[] __counted_by(err_cnt);
};

struct aie_event_category {
	u8 event_id;
	enum aie_error_category category;
};

#define EVENT_CATEGORY(id, cat) { id, cat }
static const struct aie_event_category aie_ml_mem_event_cat[] = {
	EVENT_CATEGORY(88U, AIE_ERROR_ECC),
	EVENT_CATEGORY(90U, AIE_ERROR_ECC),
	EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY),
	EVENT_CATEGORY(97U, AIE_ERROR_DMA),
	EVENT_CATEGORY(98U, AIE_ERROR_DMA),
	EVENT_CATEGORY(99U, AIE_ERROR_DMA),
	EVENT_CATEGORY(100U, AIE_ERROR_DMA),
	EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_core_event_cat[] = {
	EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(58U, AIE_ERROR_BUS),
	EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(62U, AIE_ERROR_ECC),
	EVENT_CATEGORY(64U, AIE_ERROR_ECC),
	EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
	EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
	EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
	EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(72U, AIE_ERROR_BUS),
};

static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
	EVENT_CATEGORY(130U, AIE_ERROR_ECC),
	EVENT_CATEGORY(132U, AIE_ERROR_ECC),
	EVENT_CATEGORY(133U, AIE_ERROR_DMA),
	EVENT_CATEGORY(134U, AIE_ERROR_DMA),
	EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(138U, AIE_ERROR_BUS),
	EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
};

static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
	EVENT_CATEGORY(64U, AIE_ERROR_BUS),
	EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
	EVENT_CATEGORY(67U, AIE_ERROR_BUS),
	EVENT_CATEGORY(68U, AIE_ERROR_BUS),
	EVENT_CATEGORY(69U, AIE_ERROR_BUS),
	EVENT_CATEGORY(70U, AIE_ERROR_BUS),
	EVENT_CATEGORY(71U, AIE_ERROR_BUS),
	EVENT_CATEGORY(72U, AIE_ERROR_DMA),
	EVENT_CATEGORY(73U, AIE_ERROR_DMA),
	EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
};

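/*
 * Map a raw event ID reported by the device to a coarse error category.
 * The mem-tile row (row 1) uses a different event numbering than the memory
 * module of a core tile, hence the row-based table selection for
 * AIE_MEM_MOD.
 */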
static enum aie_error_category
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
{
	const struct aie_event_category *lut;
	int num_entry;
	int i;

	switch (mod_type) {
	case AIE_PL_MOD:
		lut = aie_ml_shim_tile_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
		break;
	case AIE_CORE_MOD:
		lut = aie_ml_core_event_cat;
		num_entry = ARRAY_SIZE(aie_ml_core_event_cat);
		break;
	case AIE_MEM_MOD:
		if (row == 1) {
			lut = aie_ml_mem_tile_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
		} else {
			lut = aie_ml_mem_event_cat;
			num_entry = ARRAY_SIZE(aie_ml_mem_event_cat);
		}
		break;
	default:
		return AIE_ERROR_UNKNOWN;
	}

	for (i = 0; i < num_entry; i++) {
		if (event_id != lut[i].event_id)
			continue;

		return lut[i].category;
	}

	return AIE_ERROR_UNKNOWN;
}

static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	u32 err_col = 0; /* assume that AIE has at most 32 columns */
	int i;

	/* Build the error column bitmap */
	for (i = 0; i < num_err; i++) {
		struct aie_error *err = &errs[i];
		enum aie_error_category cat;

		cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
		XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
			 err->row, err->col, err->mod_type,
			 err->event_id, cat);

		if (err->col >= 32) {
			XDNA_WARN(ndev->xdna, "Invalid column number");
			break;
		}

		err_col |= (1 << err->col);
	}

	return err_col;
}

static int aie2_error_async_cb(void *handle, void __iomem *data, size_t size)
{
	struct async_event *e = handle;

	if (data) {
		e->resp.type = readl(data + offsetof(struct async_event_msg_resp, type));
		wmb(); /* Update status last, so no lock is needed here */
		e->resp.status = readl(data + offsetof(struct async_event_msg_resp, status));
	}
	queue_work(e->wq, &e->work);
	return 0;
}

static int aie2_error_event_send(struct async_event *e)
{
	drm_clflush_virt_range(e->buf, e->size); /* device can access */
	return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
					    aie2_error_async_cb);
}

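/*
 * Handle one async error event: decode the DMA buffer written by the
 * firmware, log each reported tile error, and re-register the event buffer
 * so further errors can be delivered. resp.status doubles as a "new data"
 * flag and is reset to MAX_AIE2_STATUS_CODE once the event has been handled.
 */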
static void aie2_error_worker(struct work_struct *err_work)
{
	struct aie_err_info *info;
	struct amdxdna_dev *xdna;
	struct async_event *e;
	u32 max_err;
	u32 err_col;

	e = container_of(err_work, struct async_event, work);

	xdna = e->ndev->xdna;

	if (e->resp.status == MAX_AIE2_STATUS_CODE)
		return;

	e->resp.status = MAX_AIE2_STATUS_CODE;

	print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
			     e->buf, 0x100, false);

	info = (struct aie_err_info *)e->buf;
	XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code);

	max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
	if (unlikely(info->err_cnt > max_err)) {
		WARN_ONCE(1, "Error count too large %d\n", info->err_cnt);
		return;
	}
	err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
	if (!err_col) {
		XDNA_WARN(xdna, "Did not get error column");
		return;
	}

	mutex_lock(&xdna->dev_lock);
	/* Re-send this event to firmware */
	if (aie2_error_event_send(e))
		XDNA_WARN(xdna, "Unable to register async event");
	mutex_unlock(&xdna->dev_lock);
}

int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_event *e;
	int i, ret;

	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
	for (i = 0; i < ndev->async_events->event_cnt; i++) {
		e = &ndev->async_events->event[i];
		ret = aie2_error_event_send(e);
		if (ret)
			return ret;
	}

	return 0;
}

void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	struct async_events *events;

	events = ndev->async_events;

	/*
	 * Drop dev_lock while destroying the workqueue: the error worker
	 * takes dev_lock when re-registering events, so flushing it while
	 * holding the lock could deadlock.
	 */
	mutex_unlock(&xdna->dev_lock);
	destroy_workqueue(events->wq);
	mutex_lock(&xdna->dev_lock);

	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
	kfree(events);
}

int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
{
	struct amdxdna_dev *xdna = ndev->xdna;
	u32 total_col = ndev->total_col;
	u32 total_size = ASYNC_BUF_SIZE * total_col;
	struct async_events *events;
	int i, ret;

	events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL);
	if (!events)
		return -ENOMEM;

	events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr,
					    DMA_FROM_DEVICE, GFP_KERNEL);
	if (!events->buf) {
		ret = -ENOMEM;
		goto free_events;
	}
	events->size = total_size;
	events->event_cnt = total_col;

	events->wq = alloc_ordered_workqueue("async_wq", 0);
	if (!events->wq) {
		ret = -ENOMEM;
		goto free_buf;
	}

	for (i = 0; i < events->event_cnt; i++) {
		struct async_event *e = &events->event[i];
		u32 offset = i * ASYNC_BUF_SIZE;

		e->ndev = ndev;
		e->wq = events->wq;
		e->buf = &events->buf[offset];
		e->addr = events->addr + offset;
		e->size = ASYNC_BUF_SIZE;
		e->resp.status = MAX_AIE2_STATUS_CODE;
		INIT_WORK(&e->work, aie2_error_worker);
	}

	ndev->async_events = events;

	XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
		 events->event_cnt, events->size);
	return 0;

free_buf:
	dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
			     events->addr, DMA_FROM_DEVICE);
free_events:
	kfree(events);
	return ret;
}