1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 4 */ 5 6 #include <drm/drm_cache.h> 7 #include <drm/drm_device.h> 8 #include <drm/drm_print.h> 9 #include <drm/gpu_scheduler.h> 10 #include <linux/dma-mapping.h> 11 #include <linux/kthread.h> 12 #include <linux/kernel.h> 13 14 #include "aie2_msg_priv.h" 15 #include "aie2_pci.h" 16 #include "amdxdna_mailbox.h" 17 #include "amdxdna_pci_drv.h" 18 19 struct async_event { 20 struct amdxdna_dev_hdl *ndev; 21 struct async_event_msg_resp resp; 22 struct workqueue_struct *wq; 23 struct work_struct work; 24 u8 *buf; 25 dma_addr_t addr; 26 u32 size; 27 }; 28 29 struct async_events { 30 struct workqueue_struct *wq; 31 u8 *buf; 32 dma_addr_t addr; 33 u32 size; 34 u32 event_cnt; 35 struct async_event event[] __counted_by(event_cnt); 36 }; 37 38 /* 39 * Below enum, struct and lookup tables are porting from XAIE util header file. 40 * 41 * Below data is defined by AIE device and it is used for decode error message 42 * from the device. 43 */ 44 45 enum aie_module_type { 46 AIE_MEM_MOD = 0, 47 AIE_CORE_MOD, 48 AIE_PL_MOD, 49 }; 50 51 enum aie_error_category { 52 AIE_ERROR_SATURATION = 0, 53 AIE_ERROR_FP, 54 AIE_ERROR_STREAM, 55 AIE_ERROR_ACCESS, 56 AIE_ERROR_BUS, 57 AIE_ERROR_INSTRUCTION, 58 AIE_ERROR_ECC, 59 AIE_ERROR_LOCK, 60 AIE_ERROR_DMA, 61 AIE_ERROR_MEM_PARITY, 62 /* Unknown is not from XAIE, added for better category */ 63 AIE_ERROR_UNKNOWN, 64 }; 65 66 /* Don't pack, unless XAIE side changed */ 67 struct aie_error { 68 __u8 row; 69 __u8 col; 70 __u32 mod_type; 71 __u8 event_id; 72 }; 73 74 struct aie_err_info { 75 u32 err_cnt; 76 u32 ret_code; 77 u32 rsvd; 78 struct aie_error payload[] __counted_by(err_cnt); 79 }; 80 81 struct aie_event_category { 82 u8 event_id; 83 enum aie_error_category category; 84 }; 85 86 #define EVENT_CATEGORY(id, cat) { id, cat } 87 static const struct aie_event_category aie_ml_mem_event_cat[] = { 88 EVENT_CATEGORY(88U, AIE_ERROR_ECC), 89 EVENT_CATEGORY(90U, AIE_ERROR_ECC), 90 EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY), 91 EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY), 92 EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY), 93 EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY), 94 EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY), 95 EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY), 96 EVENT_CATEGORY(97U, AIE_ERROR_DMA), 97 EVENT_CATEGORY(98U, AIE_ERROR_DMA), 98 EVENT_CATEGORY(99U, AIE_ERROR_DMA), 99 EVENT_CATEGORY(100U, AIE_ERROR_DMA), 100 EVENT_CATEGORY(101U, AIE_ERROR_LOCK), 101 }; 102 103 static const struct aie_event_category aie_ml_core_event_cat[] = { 104 EVENT_CATEGORY(55U, AIE_ERROR_ACCESS), 105 EVENT_CATEGORY(56U, AIE_ERROR_STREAM), 106 EVENT_CATEGORY(57U, AIE_ERROR_STREAM), 107 EVENT_CATEGORY(58U, AIE_ERROR_BUS), 108 EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION), 109 EVENT_CATEGORY(60U, AIE_ERROR_ACCESS), 110 EVENT_CATEGORY(62U, AIE_ERROR_ECC), 111 EVENT_CATEGORY(64U, AIE_ERROR_ECC), 112 EVENT_CATEGORY(65U, AIE_ERROR_ACCESS), 113 EVENT_CATEGORY(66U, AIE_ERROR_ACCESS), 114 EVENT_CATEGORY(67U, AIE_ERROR_LOCK), 115 EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION), 116 EVENT_CATEGORY(71U, AIE_ERROR_STREAM), 117 EVENT_CATEGORY(72U, AIE_ERROR_BUS), 118 }; 119 120 static const struct aie_event_category aie_ml_mem_tile_event_cat[] = { 121 EVENT_CATEGORY(130U, AIE_ERROR_ECC), 122 EVENT_CATEGORY(132U, AIE_ERROR_ECC), 123 EVENT_CATEGORY(133U, AIE_ERROR_DMA), 124 EVENT_CATEGORY(134U, AIE_ERROR_DMA), 125 EVENT_CATEGORY(135U, AIE_ERROR_STREAM), 126 EVENT_CATEGORY(136U, AIE_ERROR_STREAM), 127 EVENT_CATEGORY(137U, AIE_ERROR_STREAM), 128 EVENT_CATEGORY(138U, AIE_ERROR_BUS), 129 EVENT_CATEGORY(139U, AIE_ERROR_LOCK), 130 }; 131 132 static const struct aie_event_category aie_ml_shim_tile_event_cat[] = { 133 EVENT_CATEGORY(64U, AIE_ERROR_BUS), 134 EVENT_CATEGORY(65U, AIE_ERROR_STREAM), 135 EVENT_CATEGORY(66U, AIE_ERROR_STREAM), 136 EVENT_CATEGORY(67U, AIE_ERROR_BUS), 137 EVENT_CATEGORY(68U, AIE_ERROR_BUS), 138 EVENT_CATEGORY(69U, AIE_ERROR_BUS), 139 EVENT_CATEGORY(70U, AIE_ERROR_BUS), 140 EVENT_CATEGORY(71U, AIE_ERROR_BUS), 141 EVENT_CATEGORY(72U, AIE_ERROR_DMA), 142 EVENT_CATEGORY(73U, AIE_ERROR_DMA), 143 EVENT_CATEGORY(74U, AIE_ERROR_LOCK), 144 }; 145 146 static enum aie_error_category 147 aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) 148 { 149 const struct aie_event_category *lut; 150 int num_entry; 151 int i; 152 153 switch (mod_type) { 154 case AIE_PL_MOD: 155 lut = aie_ml_shim_tile_event_cat; 156 num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat); 157 break; 158 case AIE_CORE_MOD: 159 lut = aie_ml_core_event_cat; 160 num_entry = ARRAY_SIZE(aie_ml_core_event_cat); 161 break; 162 case AIE_MEM_MOD: 163 if (row == 1) { 164 lut = aie_ml_mem_tile_event_cat; 165 num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat); 166 } else { 167 lut = aie_ml_mem_event_cat; 168 num_entry = ARRAY_SIZE(aie_ml_mem_event_cat); 169 } 170 break; 171 default: 172 return AIE_ERROR_UNKNOWN; 173 } 174 175 for (i = 0; i < num_entry; i++) { 176 if (event_id != lut[i].event_id) 177 continue; 178 179 return lut[i].category; 180 } 181 182 return AIE_ERROR_UNKNOWN; 183 } 184 185 static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) 186 { 187 struct aie_error *errs = err_info; 188 u32 err_col = 0; /* assume that AIE has less than 32 columns */ 189 int i; 190 191 /* Get err column bitmap */ 192 for (i = 0; i < num_err; i++) { 193 struct aie_error *err = &errs[i]; 194 enum aie_error_category cat; 195 196 cat = aie_get_error_category(err->row, err->event_id, err->mod_type); 197 XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d", 198 err->row, err->col, err->mod_type, 199 err->event_id, cat); 200 201 if (err->col >= 32) { 202 XDNA_WARN(ndev->xdna, "Invalid column number"); 203 break; 204 } 205 206 err_col |= (1 << err->col); 207 } 208 209 return err_col; 210 } 211 212 static int aie2_error_async_cb(void *handle, const u32 *data, size_t size) 213 { 214 struct async_event_msg_resp *resp; 215 struct async_event *e = handle; 216 217 if (data) { 218 resp = (struct async_event_msg_resp *)data; 219 e->resp.type = resp->type; 220 wmb(); /* Update status in the end, so that no lock for here */ 221 e->resp.status = resp->status; 222 } 223 queue_work(e->wq, &e->work); 224 return 0; 225 } 226 227 static int aie2_error_event_send(struct async_event *e) 228 { 229 drm_clflush_virt_range(e->buf, e->size); /* device can access */ 230 return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e, 231 aie2_error_async_cb); 232 } 233 234 static void aie2_error_worker(struct work_struct *err_work) 235 { 236 struct aie_err_info *info; 237 struct amdxdna_dev *xdna; 238 struct async_event *e; 239 u32 max_err; 240 u32 err_col; 241 242 e = container_of(err_work, struct async_event, work); 243 244 xdna = e->ndev->xdna; 245 246 if (e->resp.status == MAX_AIE2_STATUS_CODE) 247 return; 248 249 e->resp.status = MAX_AIE2_STATUS_CODE; 250 251 print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4, 252 e->buf, 0x100, false); 253 254 info = (struct aie_err_info *)e->buf; 255 XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code); 256 257 max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error); 258 if (unlikely(info->err_cnt > max_err)) { 259 WARN_ONCE(1, "Error count too large %d\n", info->err_cnt); 260 return; 261 } 262 err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt); 263 if (!err_col) { 264 XDNA_WARN(xdna, "Did not get error column"); 265 return; 266 } 267 268 mutex_lock(&xdna->dev_lock); 269 /* Re-sent this event to firmware */ 270 if (aie2_error_event_send(e)) 271 XDNA_WARN(xdna, "Unable to register async event"); 272 mutex_unlock(&xdna->dev_lock); 273 } 274 275 int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev) 276 { 277 struct amdxdna_dev *xdna = ndev->xdna; 278 struct async_event *e; 279 int i, ret; 280 281 drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); 282 for (i = 0; i < ndev->async_events->event_cnt; i++) { 283 e = &ndev->async_events->event[i]; 284 ret = aie2_error_event_send(e); 285 if (ret) 286 return ret; 287 } 288 289 return 0; 290 } 291 292 void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev) 293 { 294 struct amdxdna_dev *xdna = ndev->xdna; 295 struct async_events *events; 296 297 events = ndev->async_events; 298 299 mutex_unlock(&xdna->dev_lock); 300 destroy_workqueue(events->wq); 301 mutex_lock(&xdna->dev_lock); 302 303 dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, 304 events->addr, DMA_FROM_DEVICE); 305 kfree(events); 306 } 307 308 int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) 309 { 310 struct amdxdna_dev *xdna = ndev->xdna; 311 u32 total_col = ndev->total_col; 312 u32 total_size = ASYNC_BUF_SIZE * total_col; 313 struct async_events *events; 314 int i, ret; 315 316 events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL); 317 if (!events) 318 return -ENOMEM; 319 320 events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr, 321 DMA_FROM_DEVICE, GFP_KERNEL); 322 if (!events->buf) { 323 ret = -ENOMEM; 324 goto free_events; 325 } 326 events->size = total_size; 327 events->event_cnt = total_col; 328 329 events->wq = alloc_ordered_workqueue("async_wq", 0); 330 if (!events->wq) { 331 ret = -ENOMEM; 332 goto free_buf; 333 } 334 335 for (i = 0; i < events->event_cnt; i++) { 336 struct async_event *e = &events->event[i]; 337 u32 offset = i * ASYNC_BUF_SIZE; 338 339 e->ndev = ndev; 340 e->wq = events->wq; 341 e->buf = &events->buf[offset]; 342 e->addr = events->addr + offset; 343 e->size = ASYNC_BUF_SIZE; 344 e->resp.status = MAX_AIE2_STATUS_CODE; 345 INIT_WORK(&e->work, aie2_error_worker); 346 } 347 348 ndev->async_events = events; 349 350 XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x", 351 events->event_cnt, events->size); 352 return 0; 353 354 free_buf: 355 dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, 356 events->addr, DMA_FROM_DEVICE); 357 free_events: 358 kfree(events); 359 return ret; 360 } 361