1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2022 Intel Corporation 4 */ 5 6 #include "xe_guc_ct.h" 7 8 #include <linux/bitfield.h> 9 #include <linux/circ_buf.h> 10 #include <linux/delay.h> 11 #include <linux/fault-inject.h> 12 13 #include <kunit/static_stub.h> 14 15 #include <drm/drm_managed.h> 16 17 #include "abi/guc_actions_abi.h" 18 #include "abi/guc_actions_sriov_abi.h" 19 #include "abi/guc_klvs_abi.h" 20 #include "xe_bo.h" 21 #include "xe_devcoredump.h" 22 #include "xe_device.h" 23 #include "xe_gt.h" 24 #include "xe_gt_pagefault.h" 25 #include "xe_gt_printk.h" 26 #include "xe_gt_sriov_pf_control.h" 27 #include "xe_gt_sriov_pf_monitor.h" 28 #include "xe_gt_sriov_printk.h" 29 #include "xe_gt_tlb_invalidation.h" 30 #include "xe_guc.h" 31 #include "xe_guc_log.h" 32 #include "xe_guc_relay.h" 33 #include "xe_guc_submit.h" 34 #include "xe_map.h" 35 #include "xe_pm.h" 36 #include "xe_trace_guc.h" 37 38 static void receive_g2h(struct xe_guc_ct *ct); 39 static void g2h_worker_func(struct work_struct *w); 40 static void safe_mode_worker_func(struct work_struct *w); 41 static void ct_exit_safe_mode(struct xe_guc_ct *ct); 42 43 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 44 enum { 45 /* Internal states, not error conditions */ 46 CT_DEAD_STATE_REARM, /* 0x0001 */ 47 CT_DEAD_STATE_CAPTURE, /* 0x0002 */ 48 49 /* Error conditions */ 50 CT_DEAD_SETUP, /* 0x0004 */ 51 CT_DEAD_H2G_WRITE, /* 0x0008 */ 52 CT_DEAD_H2G_HAS_ROOM, /* 0x0010 */ 53 CT_DEAD_G2H_READ, /* 0x0020 */ 54 CT_DEAD_G2H_RECV, /* 0x0040 */ 55 CT_DEAD_G2H_RELEASE, /* 0x0080 */ 56 CT_DEAD_DEADLOCK, /* 0x0100 */ 57 CT_DEAD_PROCESS_FAILED, /* 0x0200 */ 58 CT_DEAD_FAST_G2H, /* 0x0400 */ 59 CT_DEAD_PARSE_G2H_RESPONSE, /* 0x0800 */ 60 CT_DEAD_PARSE_G2H_UNKNOWN, /* 0x1000 */ 61 CT_DEAD_PARSE_G2H_ORIGIN, /* 0x2000 */ 62 CT_DEAD_PARSE_G2H_TYPE, /* 0x4000 */ 63 CT_DEAD_CRASH, /* 0x8000 */ 64 }; 65 66 static void ct_dead_worker_func(struct work_struct *w); 67 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code); 68 69 #define CT_DEAD(ct, ctb, reason_code) ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code) 70 #else 71 #define CT_DEAD(ct, ctb, reason) \ 72 do { \ 73 struct guc_ctb *_ctb = (ctb); \ 74 if (_ctb) \ 75 _ctb->info.broken = true; \ 76 } while (0) 77 #endif 78 79 /* Used when a CT send wants to block and / or receive data */ 80 struct g2h_fence { 81 u32 *response_buffer; 82 u32 seqno; 83 u32 response_data; 84 u16 response_len; 85 u16 error; 86 u16 hint; 87 u16 reason; 88 bool cancel; 89 bool retry; 90 bool fail; 91 bool done; 92 }; 93 94 #define make_u64(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo))) 95 96 static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer) 97 { 98 memset(g2h_fence, 0, sizeof(*g2h_fence)); 99 g2h_fence->response_buffer = response_buffer; 100 g2h_fence->seqno = ~0x0; 101 } 102 103 static void g2h_fence_cancel(struct g2h_fence *g2h_fence) 104 { 105 g2h_fence->cancel = true; 106 g2h_fence->fail = true; 107 g2h_fence->done = true; 108 } 109 110 static bool g2h_fence_needs_alloc(struct g2h_fence *g2h_fence) 111 { 112 return g2h_fence->seqno == ~0x0; 113 } 114 115 static struct xe_guc * 116 ct_to_guc(struct xe_guc_ct *ct) 117 { 118 return container_of(ct, struct xe_guc, ct); 119 } 120 121 static struct xe_gt * 122 ct_to_gt(struct xe_guc_ct *ct) 123 { 124 return container_of(ct, struct xe_gt, uc.guc.ct); 125 } 126 127 static struct xe_device * 128 ct_to_xe(struct xe_guc_ct *ct) 129 { 130 return gt_to_xe(ct_to_gt(ct)); 131 } 132 133 /** 134 * DOC: GuC CTB Blob 135 * 136 * We allocate single blob to hold both CTB descriptors and buffers: 137 * 138 * +--------+-----------------------------------------------+------+ 139 * | offset | contents | size | 140 * +========+===============================================+======+ 141 * | 0x0000 | H2G CTB Descriptor (send) | | 142 * +--------+-----------------------------------------------+ 4K | 143 * | 0x0800 | G2H CTB Descriptor (g2h) | | 144 * +--------+-----------------------------------------------+------+ 145 * | 0x1000 | H2G CT Buffer (send) | n*4K | 146 * | | | | 147 * +--------+-----------------------------------------------+------+ 148 * | 0x1000 | G2H CT Buffer (g2h) | m*4K | 149 * | + n*4K | | | 150 * +--------+-----------------------------------------------+------+ 151 * 152 * Size of each ``CT Buffer`` must be multiple of 4K. 153 * We don't expect too many messages in flight at any time, unless we are 154 * using the GuC submission. In that case each request requires a minimum 155 * 2 dwords which gives us a maximum 256 queue'd requests. Hopefully this 156 * enough space to avoid backpressure on the driver. We increase the size 157 * of the receive buffer (relative to the send) to ensure a G2H response 158 * CTB has a landing spot. 159 * 160 * In addition to submissions, the G2H buffer needs to be able to hold 161 * enough space for recoverable page fault notifications. The number of 162 * page faults is interrupt driven and can be as much as the number of 163 * compute resources available. However, most of the actual work for these 164 * is in a separate page fault worker thread. Therefore we only need to 165 * make sure the queue has enough space to handle all of the submissions 166 * and responses and an extra buffer for incoming page faults. 167 */ 168 169 #define CTB_DESC_SIZE ALIGN(sizeof(struct guc_ct_buffer_desc), SZ_2K) 170 #define CTB_H2G_BUFFER_SIZE (SZ_4K) 171 #define CTB_G2H_BUFFER_SIZE (SZ_128K) 172 #define G2H_ROOM_BUFFER_SIZE (CTB_G2H_BUFFER_SIZE / 2) 173 174 /** 175 * xe_guc_ct_queue_proc_time_jiffies - Return maximum time to process a full 176 * CT command queue 177 * @ct: the &xe_guc_ct. Unused at this moment but will be used in the future. 178 * 179 * Observation is that a 4KiB buffer full of commands takes a little over a 180 * second to process. Use that to calculate maximum time to process a full CT 181 * command queue. 182 * 183 * Return: Maximum time to process a full CT queue in jiffies. 184 */ 185 long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct) 186 { 187 BUILD_BUG_ON(!IS_ALIGNED(CTB_H2G_BUFFER_SIZE, SZ_4)); 188 return (CTB_H2G_BUFFER_SIZE / SZ_4K) * HZ; 189 } 190 191 static size_t guc_ct_size(void) 192 { 193 return 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE + 194 CTB_G2H_BUFFER_SIZE; 195 } 196 197 static void guc_ct_fini(struct drm_device *drm, void *arg) 198 { 199 struct xe_guc_ct *ct = arg; 200 201 ct_exit_safe_mode(ct); 202 destroy_workqueue(ct->g2h_wq); 203 xa_destroy(&ct->fence_lookup); 204 } 205 206 static void primelockdep(struct xe_guc_ct *ct) 207 { 208 if (!IS_ENABLED(CONFIG_LOCKDEP)) 209 return; 210 211 fs_reclaim_acquire(GFP_KERNEL); 212 might_lock(&ct->lock); 213 fs_reclaim_release(GFP_KERNEL); 214 } 215 216 int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct) 217 { 218 struct xe_device *xe = ct_to_xe(ct); 219 struct xe_gt *gt = ct_to_gt(ct); 220 int err; 221 222 xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE)); 223 224 ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM); 225 if (!ct->g2h_wq) 226 return -ENOMEM; 227 228 spin_lock_init(&ct->fast_lock); 229 xa_init(&ct->fence_lookup); 230 INIT_WORK(&ct->g2h_worker, g2h_worker_func); 231 INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func); 232 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 233 spin_lock_init(&ct->dead.lock); 234 INIT_WORK(&ct->dead.worker, ct_dead_worker_func); 235 #endif 236 init_waitqueue_head(&ct->wq); 237 init_waitqueue_head(&ct->g2h_fence_wq); 238 239 err = drmm_mutex_init(&xe->drm, &ct->lock); 240 if (err) 241 return err; 242 243 primelockdep(ct); 244 245 err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct); 246 if (err) 247 return err; 248 249 xe_gt_assert(gt, ct->state == XE_GUC_CT_STATE_NOT_INITIALIZED); 250 ct->state = XE_GUC_CT_STATE_DISABLED; 251 return 0; 252 } 253 ALLOW_ERROR_INJECTION(xe_guc_ct_init_noalloc, ERRNO); /* See xe_pci_probe() */ 254 255 int xe_guc_ct_init(struct xe_guc_ct *ct) 256 { 257 struct xe_device *xe = ct_to_xe(ct); 258 struct xe_gt *gt = ct_to_gt(ct); 259 struct xe_tile *tile = gt_to_tile(gt); 260 struct xe_bo *bo; 261 262 bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(), 263 XE_BO_FLAG_SYSTEM | 264 XE_BO_FLAG_GGTT | 265 XE_BO_FLAG_GGTT_INVALIDATE | 266 XE_BO_FLAG_PINNED_NORESTORE); 267 if (IS_ERR(bo)) 268 return PTR_ERR(bo); 269 270 ct->bo = bo; 271 return 0; 272 } 273 ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */ 274 275 #define desc_read(xe_, guc_ctb__, field_) \ 276 xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \ 277 struct guc_ct_buffer_desc, field_) 278 279 #define desc_write(xe_, guc_ctb__, field_, val_) \ 280 xe_map_wr_field(xe_, &guc_ctb__->desc, 0, \ 281 struct guc_ct_buffer_desc, field_, val_) 282 283 static void guc_ct_ctb_h2g_init(struct xe_device *xe, struct guc_ctb *h2g, 284 struct iosys_map *map) 285 { 286 h2g->info.size = CTB_H2G_BUFFER_SIZE / sizeof(u32); 287 h2g->info.resv_space = 0; 288 h2g->info.tail = 0; 289 h2g->info.head = 0; 290 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head, 291 h2g->info.size) - 292 h2g->info.resv_space; 293 h2g->info.broken = false; 294 295 h2g->desc = *map; 296 xe_map_memset(xe, &h2g->desc, 0, 0, sizeof(struct guc_ct_buffer_desc)); 297 298 h2g->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2); 299 } 300 301 static void guc_ct_ctb_g2h_init(struct xe_device *xe, struct guc_ctb *g2h, 302 struct iosys_map *map) 303 { 304 g2h->info.size = CTB_G2H_BUFFER_SIZE / sizeof(u32); 305 g2h->info.resv_space = G2H_ROOM_BUFFER_SIZE / sizeof(u32); 306 g2h->info.head = 0; 307 g2h->info.tail = 0; 308 g2h->info.space = CIRC_SPACE(g2h->info.tail, g2h->info.head, 309 g2h->info.size) - 310 g2h->info.resv_space; 311 g2h->info.broken = false; 312 313 g2h->desc = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE); 314 xe_map_memset(xe, &g2h->desc, 0, 0, sizeof(struct guc_ct_buffer_desc)); 315 316 g2h->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2 + 317 CTB_H2G_BUFFER_SIZE); 318 } 319 320 static int guc_ct_ctb_h2g_register(struct xe_guc_ct *ct) 321 { 322 struct xe_guc *guc = ct_to_guc(ct); 323 u32 desc_addr, ctb_addr, size; 324 int err; 325 326 desc_addr = xe_bo_ggtt_addr(ct->bo); 327 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2; 328 size = ct->ctbs.h2g.info.size * sizeof(u32); 329 330 err = xe_guc_self_cfg64(guc, 331 GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY, 332 desc_addr); 333 if (err) 334 return err; 335 336 err = xe_guc_self_cfg64(guc, 337 GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY, 338 ctb_addr); 339 if (err) 340 return err; 341 342 return xe_guc_self_cfg32(guc, 343 GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY, 344 size); 345 } 346 347 static int guc_ct_ctb_g2h_register(struct xe_guc_ct *ct) 348 { 349 struct xe_guc *guc = ct_to_guc(ct); 350 u32 desc_addr, ctb_addr, size; 351 int err; 352 353 desc_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE; 354 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2 + 355 CTB_H2G_BUFFER_SIZE; 356 size = ct->ctbs.g2h.info.size * sizeof(u32); 357 358 err = xe_guc_self_cfg64(guc, 359 GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY, 360 desc_addr); 361 if (err) 362 return err; 363 364 err = xe_guc_self_cfg64(guc, 365 GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY, 366 ctb_addr); 367 if (err) 368 return err; 369 370 return xe_guc_self_cfg32(guc, 371 GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY, 372 size); 373 } 374 375 static int guc_ct_control_toggle(struct xe_guc_ct *ct, bool enable) 376 { 377 u32 request[HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN] = { 378 FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) | 379 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | 380 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, 381 GUC_ACTION_HOST2GUC_CONTROL_CTB), 382 FIELD_PREP(HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL, 383 enable ? GUC_CTB_CONTROL_ENABLE : 384 GUC_CTB_CONTROL_DISABLE), 385 }; 386 int ret = xe_guc_mmio_send(ct_to_guc(ct), request, ARRAY_SIZE(request)); 387 388 return ret > 0 ? -EPROTO : ret; 389 } 390 391 static void guc_ct_change_state(struct xe_guc_ct *ct, 392 enum xe_guc_ct_state state) 393 { 394 struct xe_gt *gt = ct_to_gt(ct); 395 struct g2h_fence *g2h_fence; 396 unsigned long idx; 397 398 mutex_lock(&ct->lock); /* Serialise dequeue_one_g2h() */ 399 spin_lock_irq(&ct->fast_lock); /* Serialise CT fast-path */ 400 401 xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 || 402 state == XE_GUC_CT_STATE_STOPPED); 403 404 if (ct->g2h_outstanding) 405 xe_pm_runtime_put(ct_to_xe(ct)); 406 ct->g2h_outstanding = 0; 407 ct->state = state; 408 409 xe_gt_dbg(gt, "GuC CT communication channel %s\n", 410 state == XE_GUC_CT_STATE_STOPPED ? "stopped" : 411 str_enabled_disabled(state == XE_GUC_CT_STATE_ENABLED)); 412 413 spin_unlock_irq(&ct->fast_lock); 414 415 /* cancel all in-flight send-recv requests */ 416 xa_for_each(&ct->fence_lookup, idx, g2h_fence) 417 g2h_fence_cancel(g2h_fence); 418 419 /* make sure guc_ct_send_recv() will see g2h_fence changes */ 420 smp_mb(); 421 wake_up_all(&ct->g2h_fence_wq); 422 423 /* 424 * Lockdep doesn't like this under the fast lock and he destroy only 425 * needs to be serialized with the send path which ct lock provides. 426 */ 427 xa_destroy(&ct->fence_lookup); 428 429 mutex_unlock(&ct->lock); 430 } 431 432 static bool ct_needs_safe_mode(struct xe_guc_ct *ct) 433 { 434 return !pci_dev_msi_enabled(to_pci_dev(ct_to_xe(ct)->drm.dev)); 435 } 436 437 static bool ct_restart_safe_mode_worker(struct xe_guc_ct *ct) 438 { 439 if (!ct_needs_safe_mode(ct)) 440 return false; 441 442 queue_delayed_work(ct->g2h_wq, &ct->safe_mode_worker, HZ / 10); 443 return true; 444 } 445 446 static void safe_mode_worker_func(struct work_struct *w) 447 { 448 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, safe_mode_worker.work); 449 450 receive_g2h(ct); 451 452 if (!ct_restart_safe_mode_worker(ct)) 453 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode canceled\n"); 454 } 455 456 static void ct_enter_safe_mode(struct xe_guc_ct *ct) 457 { 458 if (ct_restart_safe_mode_worker(ct)) 459 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode enabled\n"); 460 } 461 462 static void ct_exit_safe_mode(struct xe_guc_ct *ct) 463 { 464 if (cancel_delayed_work_sync(&ct->safe_mode_worker)) 465 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode disabled\n"); 466 } 467 468 int xe_guc_ct_enable(struct xe_guc_ct *ct) 469 { 470 struct xe_device *xe = ct_to_xe(ct); 471 struct xe_gt *gt = ct_to_gt(ct); 472 int err; 473 474 xe_gt_assert(gt, !xe_guc_ct_enabled(ct)); 475 476 xe_map_memset(xe, &ct->bo->vmap, 0, 0, xe_bo_size(ct->bo)); 477 guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap); 478 guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap); 479 480 err = guc_ct_ctb_h2g_register(ct); 481 if (err) 482 goto err_out; 483 484 err = guc_ct_ctb_g2h_register(ct); 485 if (err) 486 goto err_out; 487 488 err = guc_ct_control_toggle(ct, true); 489 if (err) 490 goto err_out; 491 492 guc_ct_change_state(ct, XE_GUC_CT_STATE_ENABLED); 493 494 smp_mb(); 495 wake_up_all(&ct->wq); 496 497 if (ct_needs_safe_mode(ct)) 498 ct_enter_safe_mode(ct); 499 500 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 501 /* 502 * The CT has now been reset so the dumper can be re-armed 503 * after any existing dead state has been dumped. 504 */ 505 spin_lock_irq(&ct->dead.lock); 506 if (ct->dead.reason) { 507 ct->dead.reason |= (1 << CT_DEAD_STATE_REARM); 508 queue_work(system_unbound_wq, &ct->dead.worker); 509 } 510 spin_unlock_irq(&ct->dead.lock); 511 #endif 512 513 return 0; 514 515 err_out: 516 xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err)); 517 CT_DEAD(ct, NULL, SETUP); 518 519 return err; 520 } 521 522 static void stop_g2h_handler(struct xe_guc_ct *ct) 523 { 524 cancel_work_sync(&ct->g2h_worker); 525 } 526 527 /** 528 * xe_guc_ct_disable - Set GuC to disabled state 529 * @ct: the &xe_guc_ct 530 * 531 * Set GuC CT to disabled state and stop g2h handler. No outstanding g2h expected 532 * in this transition. 533 */ 534 void xe_guc_ct_disable(struct xe_guc_ct *ct) 535 { 536 guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED); 537 ct_exit_safe_mode(ct); 538 stop_g2h_handler(ct); 539 } 540 541 /** 542 * xe_guc_ct_stop - Set GuC to stopped state 543 * @ct: the &xe_guc_ct 544 * 545 * Set GuC CT to stopped state, stop g2h handler, and clear any outstanding g2h 546 */ 547 void xe_guc_ct_stop(struct xe_guc_ct *ct) 548 { 549 if (!xe_guc_ct_initialized(ct)) 550 return; 551 552 guc_ct_change_state(ct, XE_GUC_CT_STATE_STOPPED); 553 stop_g2h_handler(ct); 554 } 555 556 static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len) 557 { 558 struct guc_ctb *h2g = &ct->ctbs.h2g; 559 560 lockdep_assert_held(&ct->lock); 561 562 if (cmd_len > h2g->info.space) { 563 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head); 564 565 if (h2g->info.head > h2g->info.size) { 566 struct xe_device *xe = ct_to_xe(ct); 567 u32 desc_status = desc_read(xe, h2g, status); 568 569 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 570 571 xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u)\n", 572 h2g->info.head, h2g->info.size); 573 CT_DEAD(ct, h2g, H2G_HAS_ROOM); 574 return false; 575 } 576 577 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head, 578 h2g->info.size) - 579 h2g->info.resv_space; 580 if (cmd_len > h2g->info.space) 581 return false; 582 } 583 584 return true; 585 } 586 587 static bool g2h_has_room(struct xe_guc_ct *ct, u32 g2h_len) 588 { 589 if (!g2h_len) 590 return true; 591 592 lockdep_assert_held(&ct->fast_lock); 593 594 return ct->ctbs.g2h.info.space > g2h_len; 595 } 596 597 static int has_room(struct xe_guc_ct *ct, u32 cmd_len, u32 g2h_len) 598 { 599 lockdep_assert_held(&ct->lock); 600 601 if (!g2h_has_room(ct, g2h_len) || !h2g_has_room(ct, cmd_len)) 602 return -EBUSY; 603 604 return 0; 605 } 606 607 static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len) 608 { 609 lockdep_assert_held(&ct->lock); 610 ct->ctbs.h2g.info.space -= cmd_len; 611 } 612 613 static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h) 614 { 615 xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space); 616 xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) || 617 (g2h_len && num_g2h)); 618 619 if (g2h_len) { 620 lockdep_assert_held(&ct->fast_lock); 621 622 if (!ct->g2h_outstanding) 623 xe_pm_runtime_get_noresume(ct_to_xe(ct)); 624 625 ct->ctbs.g2h.info.space -= g2h_len; 626 ct->g2h_outstanding += num_g2h; 627 } 628 } 629 630 static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) 631 { 632 bool bad = false; 633 634 lockdep_assert_held(&ct->fast_lock); 635 636 bad = ct->ctbs.g2h.info.space + g2h_len > 637 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space; 638 bad |= !ct->g2h_outstanding; 639 640 if (bad) { 641 xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n", 642 ct->ctbs.g2h.info.space, g2h_len, 643 ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space, 644 ct->ctbs.g2h.info.space + g2h_len, 645 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space, 646 ct->g2h_outstanding); 647 CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE); 648 return; 649 } 650 651 ct->ctbs.g2h.info.space += g2h_len; 652 if (!--ct->g2h_outstanding) 653 xe_pm_runtime_put(ct_to_xe(ct)); 654 } 655 656 static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) 657 { 658 spin_lock_irq(&ct->fast_lock); 659 __g2h_release_space(ct, g2h_len); 660 spin_unlock_irq(&ct->fast_lock); 661 } 662 663 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 664 static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) 665 { 666 unsigned int slot = fence % ARRAY_SIZE(ct->fast_req); 667 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) 668 unsigned long entries[SZ_32]; 669 unsigned int n; 670 671 n = stack_trace_save(entries, ARRAY_SIZE(entries), 1); 672 673 /* May be called under spinlock, so avoid sleeping */ 674 ct->fast_req[slot].stack = stack_depot_save(entries, n, GFP_NOWAIT); 675 #endif 676 ct->fast_req[slot].fence = fence; 677 ct->fast_req[slot].action = action; 678 } 679 #else 680 static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) 681 { 682 } 683 #endif 684 685 /* 686 * The CT protocol accepts a 16 bits fence. This field is fully owned by the 687 * driver, the GuC will just copy it to the reply message. Since we need to 688 * be able to distinguish between replies to REQUEST and FAST_REQUEST messages, 689 * we use one bit of the seqno as an indicator for that and a rolling counter 690 * for the remaining 15 bits. 691 */ 692 #define CT_SEQNO_MASK GENMASK(14, 0) 693 #define CT_SEQNO_UNTRACKED BIT(15) 694 static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence) 695 { 696 u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK; 697 698 if (!is_g2h_fence) 699 seqno |= CT_SEQNO_UNTRACKED; 700 701 return seqno; 702 } 703 704 #define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */ 705 706 static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len, 707 u32 ct_fence_value, bool want_response) 708 { 709 struct xe_device *xe = ct_to_xe(ct); 710 struct xe_gt *gt = ct_to_gt(ct); 711 struct guc_ctb *h2g = &ct->ctbs.h2g; 712 u32 cmd[H2G_CT_HEADERS]; 713 u32 tail = h2g->info.tail; 714 u32 full_len; 715 struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds, 716 tail * sizeof(u32)); 717 u32 desc_status; 718 719 full_len = len + GUC_CTB_HDR_LEN; 720 721 lockdep_assert_held(&ct->lock); 722 xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN); 723 724 desc_status = desc_read(xe, h2g, status); 725 if (desc_status) { 726 xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status); 727 goto corrupted; 728 } 729 730 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) { 731 u32 desc_tail = desc_read(xe, h2g, tail); 732 u32 desc_head = desc_read(xe, h2g, head); 733 734 if (tail != desc_tail) { 735 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH); 736 xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail); 737 goto corrupted; 738 } 739 740 if (tail > h2g->info.size) { 741 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 742 xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n", 743 tail, h2g->info.size); 744 goto corrupted; 745 } 746 747 if (desc_head >= h2g->info.size) { 748 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 749 xe_gt_err(gt, "CT write: invalid head offset %u >= %u)\n", 750 desc_head, h2g->info.size); 751 goto corrupted; 752 } 753 } 754 755 /* Command will wrap, zero fill (NOPs), return and check credits again */ 756 if (tail + full_len > h2g->info.size) { 757 xe_map_memset(xe, &map, 0, 0, 758 (h2g->info.size - tail) * sizeof(u32)); 759 h2g_reserve_space(ct, (h2g->info.size - tail)); 760 h2g->info.tail = 0; 761 desc_write(xe, h2g, tail, h2g->info.tail); 762 763 return -EAGAIN; 764 } 765 766 /* 767 * dw0: CT header (including fence) 768 * dw1: HXG header (including action code) 769 * dw2+: action data 770 */ 771 cmd[0] = FIELD_PREP(GUC_CTB_MSG_0_FORMAT, GUC_CTB_FORMAT_HXG) | 772 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) | 773 FIELD_PREP(GUC_CTB_MSG_0_FENCE, ct_fence_value); 774 if (want_response) { 775 cmd[1] = 776 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | 777 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | 778 GUC_HXG_EVENT_MSG_0_DATA0, action[0]); 779 } else { 780 fast_req_track(ct, ct_fence_value, 781 FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, action[0])); 782 783 cmd[1] = 784 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) | 785 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | 786 GUC_HXG_EVENT_MSG_0_DATA0, action[0]); 787 } 788 789 /* H2G header in cmd[1] replaces action[0] so: */ 790 --len; 791 ++action; 792 793 /* Write H2G ensuring visible before descriptor update */ 794 xe_map_memcpy_to(xe, &map, 0, cmd, H2G_CT_HEADERS * sizeof(u32)); 795 xe_map_memcpy_to(xe, &map, H2G_CT_HEADERS * sizeof(u32), action, len * sizeof(u32)); 796 xe_device_wmb(xe); 797 798 /* Update local copies */ 799 h2g->info.tail = (tail + full_len) % h2g->info.size; 800 h2g_reserve_space(ct, full_len); 801 802 /* Update descriptor */ 803 desc_write(xe, h2g, tail, h2g->info.tail); 804 805 trace_xe_guc_ctb_h2g(xe, gt->info.id, *(action - 1), full_len, 806 desc_read(xe, h2g, head), h2g->info.tail); 807 808 return 0; 809 810 corrupted: 811 CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE); 812 return -EPIPE; 813 } 814 815 static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, 816 u32 len, u32 g2h_len, u32 num_g2h, 817 struct g2h_fence *g2h_fence) 818 { 819 struct xe_gt *gt __maybe_unused = ct_to_gt(ct); 820 u16 seqno; 821 int ret; 822 823 xe_gt_assert(gt, xe_guc_ct_initialized(ct)); 824 xe_gt_assert(gt, !g2h_len || !g2h_fence); 825 xe_gt_assert(gt, !num_g2h || !g2h_fence); 826 xe_gt_assert(gt, !g2h_len || num_g2h); 827 xe_gt_assert(gt, g2h_len || !num_g2h); 828 lockdep_assert_held(&ct->lock); 829 830 if (unlikely(ct->ctbs.h2g.info.broken)) { 831 ret = -EPIPE; 832 goto out; 833 } 834 835 if (ct->state == XE_GUC_CT_STATE_DISABLED) { 836 ret = -ENODEV; 837 goto out; 838 } 839 840 if (ct->state == XE_GUC_CT_STATE_STOPPED) { 841 ret = -ECANCELED; 842 goto out; 843 } 844 845 xe_gt_assert(gt, xe_guc_ct_enabled(ct)); 846 847 if (g2h_fence) { 848 g2h_len = GUC_CTB_HXG_MSG_MAX_LEN; 849 num_g2h = 1; 850 851 if (g2h_fence_needs_alloc(g2h_fence)) { 852 g2h_fence->seqno = next_ct_seqno(ct, true); 853 ret = xa_err(xa_store(&ct->fence_lookup, 854 g2h_fence->seqno, g2h_fence, 855 GFP_ATOMIC)); 856 if (ret) 857 goto out; 858 } 859 860 seqno = g2h_fence->seqno; 861 } else { 862 seqno = next_ct_seqno(ct, false); 863 } 864 865 if (g2h_len) 866 spin_lock_irq(&ct->fast_lock); 867 retry: 868 ret = has_room(ct, len + GUC_CTB_HDR_LEN, g2h_len); 869 if (unlikely(ret)) 870 goto out_unlock; 871 872 ret = h2g_write(ct, action, len, seqno, !!g2h_fence); 873 if (unlikely(ret)) { 874 if (ret == -EAGAIN) 875 goto retry; 876 goto out_unlock; 877 } 878 879 __g2h_reserve_space(ct, g2h_len, num_g2h); 880 xe_guc_notify(ct_to_guc(ct)); 881 out_unlock: 882 if (g2h_len) 883 spin_unlock_irq(&ct->fast_lock); 884 out: 885 return ret; 886 } 887 888 static void kick_reset(struct xe_guc_ct *ct) 889 { 890 xe_gt_reset_async(ct_to_gt(ct)); 891 } 892 893 static int dequeue_one_g2h(struct xe_guc_ct *ct); 894 895 static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len, 896 u32 g2h_len, u32 num_g2h, 897 struct g2h_fence *g2h_fence) 898 { 899 struct xe_device *xe = ct_to_xe(ct); 900 struct xe_gt *gt = ct_to_gt(ct); 901 unsigned int sleep_period_ms = 1; 902 int ret; 903 904 xe_gt_assert(gt, !g2h_len || !g2h_fence); 905 lockdep_assert_held(&ct->lock); 906 xe_device_assert_mem_access(ct_to_xe(ct)); 907 908 try_again: 909 ret = __guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, 910 g2h_fence); 911 912 /* 913 * We wait to try to restore credits for about 1 second before bailing. 914 * In the case of H2G credits we have no choice but just to wait for the 915 * GuC to consume H2Gs in the channel so we use a wait / sleep loop. In 916 * the case of G2H we process any G2H in the channel, hopefully freeing 917 * credits as we consume the G2H messages. 918 */ 919 if (unlikely(ret == -EBUSY && 920 !h2g_has_room(ct, len + GUC_CTB_HDR_LEN))) { 921 struct guc_ctb *h2g = &ct->ctbs.h2g; 922 923 if (sleep_period_ms == 1024) 924 goto broken; 925 926 trace_xe_guc_ct_h2g_flow_control(xe, h2g->info.head, h2g->info.tail, 927 h2g->info.size, 928 h2g->info.space, 929 len + GUC_CTB_HDR_LEN); 930 msleep(sleep_period_ms); 931 sleep_period_ms <<= 1; 932 933 goto try_again; 934 } else if (unlikely(ret == -EBUSY)) { 935 struct xe_device *xe = ct_to_xe(ct); 936 struct guc_ctb *g2h = &ct->ctbs.g2h; 937 938 trace_xe_guc_ct_g2h_flow_control(xe, g2h->info.head, 939 desc_read(xe, g2h, tail), 940 g2h->info.size, 941 g2h->info.space, 942 g2h_fence ? 943 GUC_CTB_HXG_MSG_MAX_LEN : 944 g2h_len); 945 946 #define g2h_avail(ct) \ 947 (desc_read(ct_to_xe(ct), (&ct->ctbs.g2h), tail) != ct->ctbs.g2h.info.head) 948 if (!wait_event_timeout(ct->wq, !ct->g2h_outstanding || 949 g2h_avail(ct), HZ)) 950 goto broken; 951 #undef g2h_avail 952 953 ret = dequeue_one_g2h(ct); 954 if (ret < 0) { 955 if (ret != -ECANCELED) 956 xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)", 957 ERR_PTR(ret)); 958 goto broken; 959 } 960 961 goto try_again; 962 } 963 964 return ret; 965 966 broken: 967 xe_gt_err(gt, "No forward process on H2G, reset required\n"); 968 CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK); 969 970 return -EDEADLK; 971 } 972 973 static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len, 974 u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence) 975 { 976 int ret; 977 978 xe_gt_assert(ct_to_gt(ct), !g2h_len || !g2h_fence); 979 980 mutex_lock(&ct->lock); 981 ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence); 982 mutex_unlock(&ct->lock); 983 984 return ret; 985 } 986 987 int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len, 988 u32 g2h_len, u32 num_g2h) 989 { 990 int ret; 991 992 ret = guc_ct_send(ct, action, len, g2h_len, num_g2h, NULL); 993 if (ret == -EDEADLK) 994 kick_reset(ct); 995 996 return ret; 997 } 998 999 int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len, 1000 u32 g2h_len, u32 num_g2h) 1001 { 1002 int ret; 1003 1004 ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, NULL); 1005 if (ret == -EDEADLK) 1006 kick_reset(ct); 1007 1008 return ret; 1009 } 1010 1011 int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, u32 len) 1012 { 1013 int ret; 1014 1015 lockdep_assert_held(&ct->lock); 1016 1017 ret = guc_ct_send_locked(ct, action, len, 0, 0, NULL); 1018 if (ret == -EDEADLK) 1019 kick_reset(ct); 1020 1021 return ret; 1022 } 1023 1024 /* 1025 * Check if a GT reset is in progress or will occur and if GT reset brought the 1026 * CT back up. Randomly picking 5 seconds for an upper limit to do a GT a reset. 1027 */ 1028 static bool retry_failure(struct xe_guc_ct *ct, int ret) 1029 { 1030 if (!(ret == -EDEADLK || ret == -EPIPE || ret == -ENODEV)) 1031 return false; 1032 1033 #define ct_alive(ct) \ 1034 (xe_guc_ct_enabled(ct) && !ct->ctbs.h2g.info.broken && \ 1035 !ct->ctbs.g2h.info.broken) 1036 if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5)) 1037 return false; 1038 #undef ct_alive 1039 1040 return true; 1041 } 1042 1043 static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len, 1044 u32 *response_buffer, bool no_fail) 1045 { 1046 struct xe_gt *gt = ct_to_gt(ct); 1047 struct g2h_fence g2h_fence; 1048 int ret = 0; 1049 1050 /* 1051 * We use a fence to implement blocking sends / receiving response data. 1052 * The seqno of the fence is sent in the H2G, returned in the G2H, and 1053 * an xarray is used as storage media with the seqno being to key. 1054 * Fields in the fence hold success, failure, retry status and the 1055 * response data. Safe to allocate on the stack as the xarray is the 1056 * only reference and it cannot be present after this function exits. 1057 */ 1058 retry: 1059 g2h_fence_init(&g2h_fence, response_buffer); 1060 retry_same_fence: 1061 ret = guc_ct_send(ct, action, len, 0, 0, &g2h_fence); 1062 if (unlikely(ret == -ENOMEM)) { 1063 /* Retry allocation /w GFP_KERNEL */ 1064 ret = xa_err(xa_store(&ct->fence_lookup, g2h_fence.seqno, 1065 &g2h_fence, GFP_KERNEL)); 1066 if (ret) 1067 return ret; 1068 1069 goto retry_same_fence; 1070 } else if (unlikely(ret)) { 1071 if (ret == -EDEADLK) 1072 kick_reset(ct); 1073 1074 if (no_fail && retry_failure(ct, ret)) 1075 goto retry_same_fence; 1076 1077 if (!g2h_fence_needs_alloc(&g2h_fence)) 1078 xa_erase(&ct->fence_lookup, g2h_fence.seqno); 1079 1080 return ret; 1081 } 1082 1083 ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ); 1084 if (!ret) { 1085 LNL_FLUSH_WORK(&ct->g2h_worker); 1086 if (g2h_fence.done) { 1087 xe_gt_warn(gt, "G2H fence %u, action %04x, done\n", 1088 g2h_fence.seqno, action[0]); 1089 ret = 1; 1090 } 1091 } 1092 1093 /* 1094 * Ensure we serialize with completion side to prevent UAF with fence going out of scope on 1095 * the stack, since we have no clue if it will fire after the timeout before we can erase 1096 * from the xa. Also we have some dependent loads and stores below for which we need the 1097 * correct ordering, and we lack the needed barriers. 1098 */ 1099 mutex_lock(&ct->lock); 1100 if (!ret) { 1101 xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s", 1102 g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done)); 1103 xa_erase(&ct->fence_lookup, g2h_fence.seqno); 1104 mutex_unlock(&ct->lock); 1105 return -ETIME; 1106 } 1107 1108 if (g2h_fence.retry) { 1109 xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n", 1110 action[0], g2h_fence.reason); 1111 mutex_unlock(&ct->lock); 1112 goto retry; 1113 } 1114 if (g2h_fence.fail) { 1115 if (g2h_fence.cancel) { 1116 xe_gt_dbg(gt, "H2G request %#x canceled!\n", action[0]); 1117 ret = -ECANCELED; 1118 goto unlock; 1119 } 1120 xe_gt_err(gt, "H2G request %#x failed: error %#x hint %#x\n", 1121 action[0], g2h_fence.error, g2h_fence.hint); 1122 ret = -EIO; 1123 } 1124 1125 if (ret > 0) 1126 ret = response_buffer ? g2h_fence.response_len : g2h_fence.response_data; 1127 1128 unlock: 1129 mutex_unlock(&ct->lock); 1130 1131 return ret; 1132 } 1133 1134 /** 1135 * xe_guc_ct_send_recv - Send and receive HXG to the GuC 1136 * @ct: the &xe_guc_ct 1137 * @action: the dword array with `HXG Request`_ message (can't be NULL) 1138 * @len: length of the `HXG Request`_ message (in dwords, can't be 0) 1139 * @response_buffer: placeholder for the `HXG Response`_ message (can be NULL) 1140 * 1141 * Send a `HXG Request`_ message to the GuC over CT communication channel and 1142 * blocks until GuC replies with a `HXG Response`_ message. 1143 * 1144 * For non-blocking communication with GuC use xe_guc_ct_send(). 1145 * 1146 * Note: The size of &response_buffer must be at least GUC_CTB_MAX_DWORDS_. 1147 * 1148 * Return: response length (in dwords) if &response_buffer was not NULL, or 1149 * DATA0 from `HXG Response`_ if &response_buffer was NULL, or 1150 * a negative error code on failure. 1151 */ 1152 int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len, 1153 u32 *response_buffer) 1154 { 1155 KUNIT_STATIC_STUB_REDIRECT(xe_guc_ct_send_recv, ct, action, len, response_buffer); 1156 return guc_ct_send_recv(ct, action, len, response_buffer, false); 1157 } 1158 ALLOW_ERROR_INJECTION(xe_guc_ct_send_recv, ERRNO); 1159 1160 int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action, 1161 u32 len, u32 *response_buffer) 1162 { 1163 return guc_ct_send_recv(ct, action, len, response_buffer, true); 1164 } 1165 1166 static u32 *msg_to_hxg(u32 *msg) 1167 { 1168 return msg + GUC_CTB_MSG_MIN_LEN; 1169 } 1170 1171 static u32 msg_len_to_hxg_len(u32 len) 1172 { 1173 return len - GUC_CTB_MSG_MIN_LEN; 1174 } 1175 1176 static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len) 1177 { 1178 u32 *hxg = msg_to_hxg(msg); 1179 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1180 1181 lockdep_assert_held(&ct->lock); 1182 1183 switch (action) { 1184 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE: 1185 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE: 1186 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE: 1187 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1188 g2h_release_space(ct, len); 1189 } 1190 1191 return 0; 1192 } 1193 1194 static int guc_crash_process_msg(struct xe_guc_ct *ct, u32 action) 1195 { 1196 struct xe_gt *gt = ct_to_gt(ct); 1197 1198 if (action == XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED) 1199 xe_gt_err(gt, "GuC Crash dump notification\n"); 1200 else if (action == XE_GUC_ACTION_NOTIFY_EXCEPTION) 1201 xe_gt_err(gt, "GuC Exception notification\n"); 1202 else 1203 xe_gt_err(gt, "Unknown GuC crash notification: 0x%04X\n", action); 1204 1205 CT_DEAD(ct, NULL, CRASH); 1206 1207 kick_reset(ct); 1208 1209 return 0; 1210 } 1211 1212 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 1213 static void fast_req_report(struct xe_guc_ct *ct, u16 fence) 1214 { 1215 u16 fence_min = U16_MAX, fence_max = 0; 1216 struct xe_gt *gt = ct_to_gt(ct); 1217 bool found = false; 1218 unsigned int n; 1219 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) 1220 char *buf; 1221 #endif 1222 1223 lockdep_assert_held(&ct->lock); 1224 1225 for (n = 0; n < ARRAY_SIZE(ct->fast_req); n++) { 1226 if (ct->fast_req[n].fence < fence_min) 1227 fence_min = ct->fast_req[n].fence; 1228 if (ct->fast_req[n].fence > fence_max) 1229 fence_max = ct->fast_req[n].fence; 1230 1231 if (ct->fast_req[n].fence != fence) 1232 continue; 1233 found = true; 1234 1235 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) 1236 buf = kmalloc(SZ_4K, GFP_NOWAIT); 1237 if (buf && stack_depot_snprint(ct->fast_req[n].stack, buf, SZ_4K, 0)) 1238 xe_gt_err(gt, "Fence 0x%x was used by action %#04x sent at:\n%s", 1239 fence, ct->fast_req[n].action, buf); 1240 else 1241 xe_gt_err(gt, "Fence 0x%x was used by action %#04x [failed to retrieve stack]\n", 1242 fence, ct->fast_req[n].action); 1243 kfree(buf); 1244 #else 1245 xe_gt_err(gt, "Fence 0x%x was used by action %#04x\n", 1246 fence, ct->fast_req[n].action); 1247 #endif 1248 break; 1249 } 1250 1251 if (!found) 1252 xe_gt_warn(gt, "Fence 0x%x not found - tracking buffer wrapped? [range = 0x%x -> 0x%x, next = 0x%X]\n", 1253 fence, fence_min, fence_max, ct->fence_seqno); 1254 } 1255 #else 1256 static void fast_req_report(struct xe_guc_ct *ct, u16 fence) 1257 { 1258 } 1259 #endif 1260 1261 static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) 1262 { 1263 struct xe_gt *gt = ct_to_gt(ct); 1264 u32 *hxg = msg_to_hxg(msg); 1265 u32 hxg_len = msg_len_to_hxg_len(len); 1266 u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]); 1267 u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]); 1268 struct g2h_fence *g2h_fence; 1269 1270 lockdep_assert_held(&ct->lock); 1271 1272 /* 1273 * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup. 1274 * Those messages should never fail, so if we do get an error back it 1275 * means we're likely doing an illegal operation and the GuC is 1276 * rejecting it. We have no way to inform the code that submitted the 1277 * H2G that the message was rejected, so we need to escalate the 1278 * failure to trigger a reset. 1279 */ 1280 if (fence & CT_SEQNO_UNTRACKED) { 1281 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) 1282 xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n", 1283 fence, 1284 FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]), 1285 FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0])); 1286 else 1287 xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n", 1288 type, fence); 1289 1290 fast_req_report(ct, fence); 1291 1292 CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE); 1293 1294 return -EPROTO; 1295 } 1296 1297 g2h_fence = xa_erase(&ct->fence_lookup, fence); 1298 if (unlikely(!g2h_fence)) { 1299 /* Don't tear down channel, as send could've timed out */ 1300 /* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */ 1301 xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence); 1302 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); 1303 return 0; 1304 } 1305 1306 xe_gt_assert(gt, fence == g2h_fence->seqno); 1307 1308 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) { 1309 g2h_fence->fail = true; 1310 g2h_fence->error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]); 1311 g2h_fence->hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]); 1312 } else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) { 1313 g2h_fence->retry = true; 1314 g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]); 1315 } else if (g2h_fence->response_buffer) { 1316 g2h_fence->response_len = hxg_len; 1317 memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32)); 1318 } else { 1319 g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]); 1320 } 1321 1322 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); 1323 1324 g2h_fence->done = true; 1325 smp_mb(); 1326 1327 wake_up_all(&ct->g2h_fence_wq); 1328 1329 return 0; 1330 } 1331 1332 static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) 1333 { 1334 struct xe_gt *gt = ct_to_gt(ct); 1335 u32 *hxg = msg_to_hxg(msg); 1336 u32 origin, type; 1337 int ret; 1338 1339 lockdep_assert_held(&ct->lock); 1340 1341 origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg[0]); 1342 if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) { 1343 xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n", 1344 origin); 1345 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN); 1346 1347 return -EPROTO; 1348 } 1349 1350 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]); 1351 switch (type) { 1352 case GUC_HXG_TYPE_EVENT: 1353 ret = parse_g2h_event(ct, msg, len); 1354 break; 1355 case GUC_HXG_TYPE_RESPONSE_SUCCESS: 1356 case GUC_HXG_TYPE_RESPONSE_FAILURE: 1357 case GUC_HXG_TYPE_NO_RESPONSE_RETRY: 1358 ret = parse_g2h_response(ct, msg, len); 1359 break; 1360 default: 1361 xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n", 1362 type); 1363 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE); 1364 1365 ret = -EOPNOTSUPP; 1366 } 1367 1368 return ret; 1369 } 1370 1371 static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) 1372 { 1373 struct xe_guc *guc = ct_to_guc(ct); 1374 struct xe_gt *gt = ct_to_gt(ct); 1375 u32 hxg_len = msg_len_to_hxg_len(len); 1376 u32 *hxg = msg_to_hxg(msg); 1377 u32 action, adj_len; 1378 u32 *payload; 1379 int ret = 0; 1380 1381 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT) 1382 return 0; 1383 1384 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1385 payload = hxg + GUC_HXG_EVENT_MSG_MIN_LEN; 1386 adj_len = hxg_len - GUC_HXG_EVENT_MSG_MIN_LEN; 1387 1388 switch (action) { 1389 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE: 1390 ret = xe_guc_sched_done_handler(guc, payload, adj_len); 1391 break; 1392 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE: 1393 ret = xe_guc_deregister_done_handler(guc, payload, adj_len); 1394 break; 1395 case XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION: 1396 ret = xe_guc_exec_queue_reset_handler(guc, payload, adj_len); 1397 break; 1398 case XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION: 1399 ret = xe_guc_exec_queue_reset_failure_handler(guc, payload, 1400 adj_len); 1401 break; 1402 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE: 1403 /* Selftest only at the moment */ 1404 break; 1405 case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION: 1406 ret = xe_guc_error_capture_handler(guc, payload, adj_len); 1407 break; 1408 case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE: 1409 /* FIXME: Handle this */ 1410 break; 1411 case XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR: 1412 ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload, 1413 adj_len); 1414 break; 1415 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: 1416 ret = xe_guc_pagefault_handler(guc, payload, adj_len); 1417 break; 1418 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1419 ret = xe_guc_tlb_invalidation_done_handler(guc, payload, 1420 adj_len); 1421 break; 1422 case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY: 1423 ret = xe_guc_access_counter_notify_handler(guc, payload, 1424 adj_len); 1425 break; 1426 case XE_GUC_ACTION_GUC2PF_RELAY_FROM_VF: 1427 ret = xe_guc_relay_process_guc2pf(&guc->relay, hxg, hxg_len); 1428 break; 1429 case XE_GUC_ACTION_GUC2VF_RELAY_FROM_PF: 1430 ret = xe_guc_relay_process_guc2vf(&guc->relay, hxg, hxg_len); 1431 break; 1432 case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY: 1433 ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len); 1434 break; 1435 case GUC_ACTION_GUC2PF_ADVERSE_EVENT: 1436 ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len); 1437 break; 1438 case XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED: 1439 case XE_GUC_ACTION_NOTIFY_EXCEPTION: 1440 ret = guc_crash_process_msg(ct, action); 1441 break; 1442 default: 1443 xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action); 1444 } 1445 1446 if (ret) { 1447 xe_gt_err(gt, "G2H action %#04x failed (%pe) len %u msg %*ph\n", 1448 action, ERR_PTR(ret), hxg_len, (int)sizeof(u32) * hxg_len, hxg); 1449 CT_DEAD(ct, NULL, PROCESS_FAILED); 1450 } 1451 1452 return 0; 1453 } 1454 1455 static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path) 1456 { 1457 struct xe_device *xe = ct_to_xe(ct); 1458 struct xe_gt *gt = ct_to_gt(ct); 1459 struct guc_ctb *g2h = &ct->ctbs.g2h; 1460 u32 tail, head, len, desc_status; 1461 s32 avail; 1462 u32 action; 1463 u32 *hxg; 1464 1465 xe_gt_assert(gt, xe_guc_ct_initialized(ct)); 1466 lockdep_assert_held(&ct->fast_lock); 1467 1468 if (ct->state == XE_GUC_CT_STATE_DISABLED) 1469 return -ENODEV; 1470 1471 if (ct->state == XE_GUC_CT_STATE_STOPPED) 1472 return -ECANCELED; 1473 1474 if (g2h->info.broken) 1475 return -EPIPE; 1476 1477 xe_gt_assert(gt, xe_guc_ct_enabled(ct)); 1478 1479 desc_status = desc_read(xe, g2h, status); 1480 if (desc_status) { 1481 if (desc_status & GUC_CTB_STATUS_DISABLED) { 1482 /* 1483 * Potentially valid if a CLIENT_RESET request resulted in 1484 * contexts/engines being reset. But should never happen as 1485 * no contexts should be active when CLIENT_RESET is sent. 1486 */ 1487 xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n"); 1488 desc_status &= ~GUC_CTB_STATUS_DISABLED; 1489 } 1490 1491 if (desc_status) { 1492 xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status); 1493 goto corrupted; 1494 } 1495 } 1496 1497 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) { 1498 u32 desc_tail = desc_read(xe, g2h, tail); 1499 /* 1500 u32 desc_head = desc_read(xe, g2h, head); 1501 1502 * info.head and desc_head are updated back-to-back at the end of 1503 * this function and nowhere else. Hence, they cannot be different 1504 * unless two g2h_read calls are running concurrently. Which is not 1505 * possible because it is guarded by ct->fast_lock. And yet, some 1506 * discrete platforms are regularly hitting this error :(. 1507 * 1508 * desc_head rolling backwards shouldn't cause any noticeable 1509 * problems - just a delay in GuC being allowed to proceed past that 1510 * point in the queue. So for now, just disable the error until it 1511 * can be root caused. 1512 * 1513 if (g2h->info.head != desc_head) { 1514 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH); 1515 xe_gt_err(gt, "CT read: head was modified %u != %u\n", 1516 desc_head, g2h->info.head); 1517 goto corrupted; 1518 } 1519 */ 1520 1521 if (g2h->info.head > g2h->info.size) { 1522 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 1523 xe_gt_err(gt, "CT read: head out of range: %u vs %u\n", 1524 g2h->info.head, g2h->info.size); 1525 goto corrupted; 1526 } 1527 1528 if (desc_tail >= g2h->info.size) { 1529 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 1530 xe_gt_err(gt, "CT read: invalid tail offset %u >= %u)\n", 1531 desc_tail, g2h->info.size); 1532 goto corrupted; 1533 } 1534 } 1535 1536 /* Calculate DW available to read */ 1537 tail = desc_read(xe, g2h, tail); 1538 avail = tail - g2h->info.head; 1539 if (unlikely(avail == 0)) 1540 return 0; 1541 1542 if (avail < 0) 1543 avail += g2h->info.size; 1544 1545 /* Read header */ 1546 xe_map_memcpy_from(xe, msg, &g2h->cmds, sizeof(u32) * g2h->info.head, 1547 sizeof(u32)); 1548 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN; 1549 if (len > avail) { 1550 xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n", 1551 avail, len); 1552 goto corrupted; 1553 } 1554 1555 head = (g2h->info.head + 1) % g2h->info.size; 1556 avail = len - 1; 1557 1558 /* Read G2H message */ 1559 if (avail + head > g2h->info.size) { 1560 u32 avail_til_wrap = g2h->info.size - head; 1561 1562 xe_map_memcpy_from(xe, msg + 1, 1563 &g2h->cmds, sizeof(u32) * head, 1564 avail_til_wrap * sizeof(u32)); 1565 xe_map_memcpy_from(xe, msg + 1 + avail_til_wrap, 1566 &g2h->cmds, 0, 1567 (avail - avail_til_wrap) * sizeof(u32)); 1568 } else { 1569 xe_map_memcpy_from(xe, msg + 1, 1570 &g2h->cmds, sizeof(u32) * head, 1571 avail * sizeof(u32)); 1572 } 1573 1574 hxg = msg_to_hxg(msg); 1575 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1576 1577 if (fast_path) { 1578 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT) 1579 return 0; 1580 1581 switch (action) { 1582 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: 1583 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1584 break; /* Process these in fast-path */ 1585 default: 1586 return 0; 1587 } 1588 } 1589 1590 /* Update local / descriptor header */ 1591 g2h->info.head = (head + avail) % g2h->info.size; 1592 desc_write(xe, g2h, head, g2h->info.head); 1593 1594 trace_xe_guc_ctb_g2h(xe, ct_to_gt(ct)->info.id, 1595 action, len, g2h->info.head, tail); 1596 1597 return len; 1598 1599 corrupted: 1600 CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ); 1601 return -EPROTO; 1602 } 1603 1604 static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len) 1605 { 1606 struct xe_gt *gt = ct_to_gt(ct); 1607 struct xe_guc *guc = ct_to_guc(ct); 1608 u32 hxg_len = msg_len_to_hxg_len(len); 1609 u32 *hxg = msg_to_hxg(msg); 1610 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1611 u32 *payload = hxg + GUC_HXG_MSG_MIN_LEN; 1612 u32 adj_len = hxg_len - GUC_HXG_MSG_MIN_LEN; 1613 int ret = 0; 1614 1615 switch (action) { 1616 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: 1617 ret = xe_guc_pagefault_handler(guc, payload, adj_len); 1618 break; 1619 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1620 __g2h_release_space(ct, len); 1621 ret = xe_guc_tlb_invalidation_done_handler(guc, payload, 1622 adj_len); 1623 break; 1624 default: 1625 xe_gt_warn(gt, "NOT_POSSIBLE"); 1626 } 1627 1628 if (ret) { 1629 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n", 1630 action, ERR_PTR(ret)); 1631 CT_DEAD(ct, NULL, FAST_G2H); 1632 } 1633 } 1634 1635 /** 1636 * xe_guc_ct_fast_path - process critical G2H in the IRQ handler 1637 * @ct: GuC CT object 1638 * 1639 * Anything related to page faults is critical for performance, process these 1640 * critical G2H in the IRQ. This is safe as these handlers either just wake up 1641 * waiters or queue another worker. 1642 */ 1643 void xe_guc_ct_fast_path(struct xe_guc_ct *ct) 1644 { 1645 struct xe_device *xe = ct_to_xe(ct); 1646 bool ongoing; 1647 int len; 1648 1649 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct)); 1650 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL) 1651 return; 1652 1653 spin_lock(&ct->fast_lock); 1654 do { 1655 len = g2h_read(ct, ct->fast_msg, true); 1656 if (len > 0) 1657 g2h_fast_path(ct, ct->fast_msg, len); 1658 } while (len > 0); 1659 spin_unlock(&ct->fast_lock); 1660 1661 if (ongoing) 1662 xe_pm_runtime_put(xe); 1663 } 1664 1665 /* Returns less than zero on error, 0 on done, 1 on more available */ 1666 static int dequeue_one_g2h(struct xe_guc_ct *ct) 1667 { 1668 int len; 1669 int ret; 1670 1671 lockdep_assert_held(&ct->lock); 1672 1673 spin_lock_irq(&ct->fast_lock); 1674 len = g2h_read(ct, ct->msg, false); 1675 spin_unlock_irq(&ct->fast_lock); 1676 if (len <= 0) 1677 return len; 1678 1679 ret = parse_g2h_msg(ct, ct->msg, len); 1680 if (unlikely(ret < 0)) 1681 return ret; 1682 1683 ret = process_g2h_msg(ct, ct->msg, len); 1684 if (unlikely(ret < 0)) 1685 return ret; 1686 1687 return 1; 1688 } 1689 1690 static void receive_g2h(struct xe_guc_ct *ct) 1691 { 1692 bool ongoing; 1693 int ret; 1694 1695 /* 1696 * Normal users must always hold mem_access.ref around CT calls. However 1697 * during the runtime pm callbacks we rely on CT to talk to the GuC, but 1698 * at this stage we can't rely on mem_access.ref and even the 1699 * callback_task will be different than current. For such cases we just 1700 * need to ensure we always process the responses from any blocking 1701 * ct_send requests or where we otherwise expect some response when 1702 * initiated from those callbacks (which will need to wait for the below 1703 * dequeue_one_g2h()). The dequeue_one_g2h() will gracefully fail if 1704 * the device has suspended to the point that the CT communication has 1705 * been disabled. 1706 * 1707 * If we are inside the runtime pm callback, we can be the only task 1708 * still issuing CT requests (since that requires having the 1709 * mem_access.ref). It seems like it might in theory be possible to 1710 * receive unsolicited events from the GuC just as we are 1711 * suspending-resuming, but those will currently anyway be lost when 1712 * eventually exiting from suspend, hence no need to wake up the device 1713 * here. If we ever need something stronger than get_if_ongoing() then 1714 * we need to be careful with blocking the pm callbacks from getting CT 1715 * responses, if the worker here is blocked on those callbacks 1716 * completing, creating a deadlock. 1717 */ 1718 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct)); 1719 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL) 1720 return; 1721 1722 do { 1723 mutex_lock(&ct->lock); 1724 ret = dequeue_one_g2h(ct); 1725 mutex_unlock(&ct->lock); 1726 1727 if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) { 1728 xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret); 1729 CT_DEAD(ct, NULL, G2H_RECV); 1730 kick_reset(ct); 1731 } 1732 } while (ret == 1); 1733 1734 if (ongoing) 1735 xe_pm_runtime_put(ct_to_xe(ct)); 1736 } 1737 1738 static void g2h_worker_func(struct work_struct *w) 1739 { 1740 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, g2h_worker); 1741 1742 receive_g2h(ct); 1743 } 1744 1745 static void xe_fixup_u64_in_cmds(struct xe_device *xe, struct iosys_map *cmds, 1746 u32 size, u32 idx, s64 shift) 1747 { 1748 u32 hi, lo; 1749 u64 offset; 1750 1751 lo = xe_map_rd_ring_u32(xe, cmds, idx, size); 1752 hi = xe_map_rd_ring_u32(xe, cmds, idx + 1, size); 1753 offset = make_u64(hi, lo); 1754 offset += shift; 1755 lo = lower_32_bits(offset); 1756 hi = upper_32_bits(offset); 1757 xe_map_wr_ring_u32(xe, cmds, idx, size, lo); 1758 xe_map_wr_ring_u32(xe, cmds, idx + 1, size, hi); 1759 } 1760 1761 /* 1762 * Shift any GGTT addresses within a single message left within CTB from 1763 * before post-migration recovery. 1764 * @ct: pointer to CT struct of the target GuC 1765 * @cmds: iomap buffer containing CT messages 1766 * @head: start of the target message within the buffer 1767 * @len: length of the target message 1768 * @size: size of the commands buffer 1769 * @shift: the address shift to be added to each GGTT reference 1770 * Return: true if the message was fixed or needed no fixups, false on failure 1771 */ 1772 static bool ct_fixup_ggtt_in_message(struct xe_guc_ct *ct, 1773 struct iosys_map *cmds, u32 head, 1774 u32 len, u32 size, s64 shift) 1775 { 1776 struct xe_gt *gt = ct_to_gt(ct); 1777 struct xe_device *xe = ct_to_xe(ct); 1778 u32 msg[GUC_HXG_MSG_MIN_LEN]; 1779 u32 action, i, n; 1780 1781 xe_gt_assert(gt, len >= GUC_HXG_MSG_MIN_LEN); 1782 1783 msg[0] = xe_map_rd_ring_u32(xe, cmds, head, size); 1784 action = FIELD_GET(GUC_HXG_REQUEST_MSG_0_ACTION, msg[0]); 1785 1786 xe_gt_sriov_dbg_verbose(gt, "fixing H2G %#x\n", action); 1787 1788 switch (action) { 1789 case XE_GUC_ACTION_REGISTER_CONTEXT: 1790 if (len != XE_GUC_REGISTER_CONTEXT_MSG_LEN) 1791 goto err_len; 1792 xe_fixup_u64_in_cmds(xe, cmds, size, head + 1793 XE_GUC_REGISTER_CONTEXT_DATA_5_WQ_DESC_ADDR_LOWER, 1794 shift); 1795 xe_fixup_u64_in_cmds(xe, cmds, size, head + 1796 XE_GUC_REGISTER_CONTEXT_DATA_7_WQ_BUF_BASE_LOWER, 1797 shift); 1798 xe_fixup_u64_in_cmds(xe, cmds, size, head + 1799 XE_GUC_REGISTER_CONTEXT_DATA_10_HW_LRC_ADDR, shift); 1800 break; 1801 case XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC: 1802 if (len < XE_GUC_REGISTER_CONTEXT_MULTI_LRC_MSG_MIN_LEN) 1803 goto err_len; 1804 n = xe_map_rd_ring_u32(xe, cmds, head + 1805 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_10_NUM_CTXS, size); 1806 if (len != XE_GUC_REGISTER_CONTEXT_MULTI_LRC_MSG_MIN_LEN + 2 * n) 1807 goto err_len; 1808 xe_fixup_u64_in_cmds(xe, cmds, size, head + 1809 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_5_WQ_DESC_ADDR_LOWER, 1810 shift); 1811 xe_fixup_u64_in_cmds(xe, cmds, size, head + 1812 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_7_WQ_BUF_BASE_LOWER, 1813 shift); 1814 for (i = 0; i < n; i++) 1815 xe_fixup_u64_in_cmds(xe, cmds, size, head + 1816 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_11_HW_LRC_ADDR 1817 + 2 * i, shift); 1818 break; 1819 default: 1820 break; 1821 } 1822 return true; 1823 1824 err_len: 1825 xe_gt_err(gt, "Skipped G2G %#x message fixups, unexpected length (%u)\n", action, len); 1826 return false; 1827 } 1828 1829 /* 1830 * Apply fixups to the next outgoing CT message within given CTB 1831 * @ct: the &xe_guc_ct struct instance representing the target GuC 1832 * @h2g: the &guc_ctb struct instance of the target buffer 1833 * @shift: shift to be added to all GGTT addresses within the CTB 1834 * @mhead: pointer to an integer storing message start position; the 1835 * position is changed to next message before this function return 1836 * @avail: size of the area available for parsing, that is length 1837 * of all remaining messages stored within the CTB 1838 * Return: size of the area available for parsing after one message 1839 * has been parsed, that is length remaining from the updated mhead 1840 */ 1841 static int ct_fixup_ggtt_in_buffer(struct xe_guc_ct *ct, struct guc_ctb *h2g, 1842 s64 shift, u32 *mhead, s32 avail) 1843 { 1844 struct xe_gt *gt = ct_to_gt(ct); 1845 struct xe_device *xe = ct_to_xe(ct); 1846 u32 msg[GUC_HXG_MSG_MIN_LEN]; 1847 u32 size = h2g->info.size; 1848 u32 head = *mhead; 1849 u32 len; 1850 1851 xe_gt_assert(gt, avail >= (s32)GUC_CTB_MSG_MIN_LEN); 1852 1853 /* Read header */ 1854 msg[0] = xe_map_rd_ring_u32(xe, &h2g->cmds, head, size); 1855 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN; 1856 1857 if (unlikely(len > (u32)avail)) { 1858 xe_gt_err(gt, "H2G channel broken on read, avail=%d, len=%d, fixups skipped\n", 1859 avail, len); 1860 return 0; 1861 } 1862 1863 head = (head + GUC_CTB_MSG_MIN_LEN) % size; 1864 if (!ct_fixup_ggtt_in_message(ct, &h2g->cmds, head, msg_len_to_hxg_len(len), size, shift)) 1865 return 0; 1866 *mhead = (head + msg_len_to_hxg_len(len)) % size; 1867 1868 return avail - len; 1869 } 1870 1871 /** 1872 * xe_guc_ct_fixup_messages_with_ggtt - Fixup any pending H2G CTB messages 1873 * @ct: pointer to CT struct of the target GuC 1874 * @ggtt_shift: shift to be added to all GGTT addresses within the CTB 1875 * 1876 * Messages in GuC to Host CTB are owned by GuC and any fixups in them 1877 * are made by GuC. But content of the Host to GuC CTB is owned by the 1878 * KMD, so fixups to GGTT references in any pending messages need to be 1879 * applied here. 1880 * This function updates GGTT offsets in payloads of pending H2G CTB 1881 * messages (messages which were not consumed by GuC before the VF got 1882 * paused). 1883 */ 1884 void xe_guc_ct_fixup_messages_with_ggtt(struct xe_guc_ct *ct, s64 ggtt_shift) 1885 { 1886 struct guc_ctb *h2g = &ct->ctbs.h2g; 1887 struct xe_guc *guc = ct_to_guc(ct); 1888 struct xe_gt *gt = guc_to_gt(guc); 1889 u32 head, tail, size; 1890 s32 avail; 1891 1892 if (unlikely(h2g->info.broken)) 1893 return; 1894 1895 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head); 1896 head = h2g->info.head; 1897 tail = READ_ONCE(h2g->info.tail); 1898 size = h2g->info.size; 1899 1900 if (unlikely(head > size)) 1901 goto corrupted; 1902 1903 if (unlikely(tail >= size)) 1904 goto corrupted; 1905 1906 avail = tail - head; 1907 1908 /* beware of buffer wrap case */ 1909 if (unlikely(avail < 0)) 1910 avail += size; 1911 xe_gt_dbg(gt, "available %d (%u:%u:%u)\n", avail, head, tail, size); 1912 xe_gt_assert(gt, avail >= 0); 1913 1914 while (avail > 0) 1915 avail = ct_fixup_ggtt_in_buffer(ct, h2g, ggtt_shift, &head, avail); 1916 1917 return; 1918 1919 corrupted: 1920 xe_gt_err(gt, "Corrupted H2G descriptor head=%u tail=%u size=%u, fixups not applied\n", 1921 head, tail, size); 1922 h2g->info.broken = true; 1923 } 1924 1925 static struct xe_guc_ct_snapshot *guc_ct_snapshot_alloc(struct xe_guc_ct *ct, bool atomic, 1926 bool want_ctb) 1927 { 1928 struct xe_guc_ct_snapshot *snapshot; 1929 1930 snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL); 1931 if (!snapshot) 1932 return NULL; 1933 1934 if (ct->bo && want_ctb) { 1935 snapshot->ctb_size = xe_bo_size(ct->bo); 1936 snapshot->ctb = kmalloc(snapshot->ctb_size, atomic ? GFP_ATOMIC : GFP_KERNEL); 1937 } 1938 1939 return snapshot; 1940 } 1941 1942 static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb, 1943 struct guc_ctb_snapshot *snapshot) 1944 { 1945 xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0, 1946 sizeof(struct guc_ct_buffer_desc)); 1947 memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info)); 1948 } 1949 1950 static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot, 1951 struct drm_printer *p) 1952 { 1953 drm_printf(p, "\tsize: %d\n", snapshot->info.size); 1954 drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space); 1955 drm_printf(p, "\thead: %d\n", snapshot->info.head); 1956 drm_printf(p, "\ttail: %d\n", snapshot->info.tail); 1957 drm_printf(p, "\tspace: %d\n", snapshot->info.space); 1958 drm_printf(p, "\tbroken: %d\n", snapshot->info.broken); 1959 drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head); 1960 drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail); 1961 drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status); 1962 } 1963 1964 static struct xe_guc_ct_snapshot *guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic, 1965 bool want_ctb) 1966 { 1967 struct xe_device *xe = ct_to_xe(ct); 1968 struct xe_guc_ct_snapshot *snapshot; 1969 1970 snapshot = guc_ct_snapshot_alloc(ct, atomic, want_ctb); 1971 if (!snapshot) { 1972 xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n"); 1973 return NULL; 1974 } 1975 1976 if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) { 1977 snapshot->ct_enabled = true; 1978 snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding); 1979 guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, &snapshot->h2g); 1980 guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, &snapshot->g2h); 1981 } 1982 1983 if (ct->bo && snapshot->ctb) 1984 xe_map_memcpy_from(xe, snapshot->ctb, &ct->bo->vmap, 0, snapshot->ctb_size); 1985 1986 return snapshot; 1987 } 1988 1989 /** 1990 * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state. 1991 * @ct: GuC CT object. 1992 * 1993 * This can be printed out in a later stage like during dev_coredump 1994 * analysis. This is safe to be called during atomic context. 1995 * 1996 * Returns: a GuC CT snapshot object that must be freed by the caller 1997 * by using `xe_guc_ct_snapshot_free`. 1998 */ 1999 struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct) 2000 { 2001 return guc_ct_snapshot_capture(ct, true, true); 2002 } 2003 2004 /** 2005 * xe_guc_ct_snapshot_print - Print out a given GuC CT snapshot. 2006 * @snapshot: GuC CT snapshot object. 2007 * @p: drm_printer where it will be printed out. 2008 * 2009 * This function prints out a given GuC CT snapshot object. 2010 */ 2011 void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, 2012 struct drm_printer *p) 2013 { 2014 if (!snapshot) 2015 return; 2016 2017 if (snapshot->ct_enabled) { 2018 drm_puts(p, "H2G CTB (all sizes in DW):\n"); 2019 guc_ctb_snapshot_print(&snapshot->h2g, p); 2020 2021 drm_puts(p, "G2H CTB (all sizes in DW):\n"); 2022 guc_ctb_snapshot_print(&snapshot->g2h, p); 2023 drm_printf(p, "\tg2h outstanding: %d\n", 2024 snapshot->g2h_outstanding); 2025 2026 if (snapshot->ctb) { 2027 drm_printf(p, "[CTB].length: 0x%zx\n", snapshot->ctb_size); 2028 xe_print_blob_ascii85(p, "[CTB].data", '\n', 2029 snapshot->ctb, 0, snapshot->ctb_size); 2030 } 2031 } else { 2032 drm_puts(p, "CT disabled\n"); 2033 } 2034 } 2035 2036 /** 2037 * xe_guc_ct_snapshot_free - Free all allocated objects for a given snapshot. 2038 * @snapshot: GuC CT snapshot object. 2039 * 2040 * This function free all the memory that needed to be allocated at capture 2041 * time. 2042 */ 2043 void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot) 2044 { 2045 if (!snapshot) 2046 return; 2047 2048 kfree(snapshot->ctb); 2049 kfree(snapshot); 2050 } 2051 2052 /** 2053 * xe_guc_ct_print - GuC CT Print. 2054 * @ct: GuC CT. 2055 * @p: drm_printer where it will be printed out. 2056 * @want_ctb: Should the full CTB content be dumped (vs just the headers) 2057 * 2058 * This function will quickly capture a snapshot of the CT state 2059 * and immediately print it out. 2060 */ 2061 void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb) 2062 { 2063 struct xe_guc_ct_snapshot *snapshot; 2064 2065 snapshot = guc_ct_snapshot_capture(ct, false, want_ctb); 2066 xe_guc_ct_snapshot_print(snapshot, p); 2067 xe_guc_ct_snapshot_free(snapshot); 2068 } 2069 2070 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 2071 2072 #ifdef CONFIG_FUNCTION_ERROR_INJECTION 2073 /* 2074 * This is a helper function which assists the driver in identifying if a fault 2075 * injection test is currently active, allowing it to reduce unnecessary debug 2076 * output. Typically, the function returns zero, but the fault injection 2077 * framework can alter this to return an error. Since faults are injected 2078 * through this function, it's important to ensure the compiler doesn't optimize 2079 * it into an inline function. To avoid such optimization, the 'noinline' 2080 * attribute is applied. Compiler optimizes the static function defined in the 2081 * header file as an inline function. 2082 */ 2083 noinline int xe_is_injection_active(void) { return 0; } 2084 ALLOW_ERROR_INJECTION(xe_is_injection_active, ERRNO); 2085 #else 2086 int xe_is_injection_active(void) { return 0; } 2087 #endif 2088 2089 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code) 2090 { 2091 struct xe_guc_log_snapshot *snapshot_log; 2092 struct xe_guc_ct_snapshot *snapshot_ct; 2093 struct xe_guc *guc = ct_to_guc(ct); 2094 unsigned long flags; 2095 bool have_capture; 2096 2097 if (ctb) 2098 ctb->info.broken = true; 2099 /* 2100 * Huge dump is getting generated when injecting error for guc CT/MMIO 2101 * functions. So, let us suppress the dump when fault is injected. 2102 */ 2103 if (xe_is_injection_active()) 2104 return; 2105 2106 /* Ignore further errors after the first dump until a reset */ 2107 if (ct->dead.reported) 2108 return; 2109 2110 spin_lock_irqsave(&ct->dead.lock, flags); 2111 2112 /* And only capture one dump at a time */ 2113 have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE); 2114 ct->dead.reason |= (1 << reason_code) | 2115 (1 << CT_DEAD_STATE_CAPTURE); 2116 2117 spin_unlock_irqrestore(&ct->dead.lock, flags); 2118 2119 if (have_capture) 2120 return; 2121 2122 snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true); 2123 snapshot_ct = xe_guc_ct_snapshot_capture((ct)); 2124 2125 spin_lock_irqsave(&ct->dead.lock, flags); 2126 2127 if (ct->dead.snapshot_log || ct->dead.snapshot_ct) { 2128 xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n"); 2129 xe_guc_log_snapshot_free(snapshot_log); 2130 xe_guc_ct_snapshot_free(snapshot_ct); 2131 } else { 2132 ct->dead.snapshot_log = snapshot_log; 2133 ct->dead.snapshot_ct = snapshot_ct; 2134 } 2135 2136 spin_unlock_irqrestore(&ct->dead.lock, flags); 2137 2138 queue_work(system_unbound_wq, &(ct)->dead.worker); 2139 } 2140 2141 static void ct_dead_print(struct xe_dead_ct *dead) 2142 { 2143 struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead); 2144 struct xe_device *xe = ct_to_xe(ct); 2145 struct xe_gt *gt = ct_to_gt(ct); 2146 static int g_count; 2147 struct drm_printer ip = xe_gt_info_printer(gt); 2148 struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count); 2149 2150 if (!dead->reason) { 2151 xe_gt_err(gt, "CTB is dead for no reason!?\n"); 2152 return; 2153 } 2154 2155 /* Can't generate a genuine core dump at this point, so just do the good bits */ 2156 drm_puts(&lp, "**** Xe Device Coredump ****\n"); 2157 drm_printf(&lp, "Reason: CTB is dead - 0x%X\n", dead->reason); 2158 xe_device_snapshot_print(xe, &lp); 2159 2160 drm_printf(&lp, "**** GT #%d ****\n", gt->info.id); 2161 drm_printf(&lp, "\tTile: %d\n", gt->tile->id); 2162 2163 drm_puts(&lp, "**** GuC Log ****\n"); 2164 xe_guc_log_snapshot_print(dead->snapshot_log, &lp); 2165 2166 drm_puts(&lp, "**** GuC CT ****\n"); 2167 xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp); 2168 2169 drm_puts(&lp, "Done.\n"); 2170 } 2171 2172 static void ct_dead_worker_func(struct work_struct *w) 2173 { 2174 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker); 2175 2176 if (!ct->dead.reported) { 2177 ct->dead.reported = true; 2178 ct_dead_print(&ct->dead); 2179 } 2180 2181 spin_lock_irq(&ct->dead.lock); 2182 2183 xe_guc_log_snapshot_free(ct->dead.snapshot_log); 2184 ct->dead.snapshot_log = NULL; 2185 xe_guc_ct_snapshot_free(ct->dead.snapshot_ct); 2186 ct->dead.snapshot_ct = NULL; 2187 2188 if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) { 2189 /* A reset has occurred so re-arm the error reporting */ 2190 ct->dead.reason = 0; 2191 ct->dead.reported = false; 2192 } 2193 2194 spin_unlock_irq(&ct->dead.lock); 2195 } 2196 #endif 2197