// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_guc_ct.h"

#include <linux/bitfield.h>
#include <linux/circ_buf.h>
#include <linux/delay.h>
#include <linux/fault-inject.h>

#include <kunit/static_stub.h>

#include <drm/drm_managed.h>

#include "abi/guc_actions_abi.h"
#include "abi/guc_actions_sriov_abi.h"
#include "abi/guc_klvs_abi.h"
#include "xe_bo.h"
#include "xe_devcoredump.h"
#include "xe_device.h"
#include "xe_gt.h"
#include "xe_gt_pagefault.h"
#include "xe_gt_printk.h"
#include "xe_gt_sriov_pf_control.h"
#include "xe_gt_sriov_pf_monitor.h"
#include "xe_gt_sriov_printk.h"
#include "xe_gt_tlb_invalidation.h"
#include "xe_guc.h"
#include "xe_guc_log.h"
#include "xe_guc_relay.h"
#include "xe_guc_submit.h"
#include "xe_map.h"
#include "xe_pm.h"
#include "xe_trace_guc.h"

static void receive_g2h(struct xe_guc_ct *ct);
static void g2h_worker_func(struct work_struct *w);
static void safe_mode_worker_func(struct work_struct *w);
static void ct_exit_safe_mode(struct xe_guc_ct *ct);

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
enum {
	/* Internal states, not error conditions */
	CT_DEAD_STATE_REARM,			/* 0x0001 */
	CT_DEAD_STATE_CAPTURE,			/* 0x0002 */

	/* Error conditions */
	CT_DEAD_SETUP,				/* 0x0004 */
	CT_DEAD_H2G_WRITE,			/* 0x0008 */
	CT_DEAD_H2G_HAS_ROOM,			/* 0x0010 */
	CT_DEAD_G2H_READ,			/* 0x0020 */
	CT_DEAD_G2H_RECV,			/* 0x0040 */
	CT_DEAD_G2H_RELEASE,			/* 0x0080 */
	CT_DEAD_DEADLOCK,			/* 0x0100 */
	CT_DEAD_PROCESS_FAILED,			/* 0x0200 */
	CT_DEAD_FAST_G2H,			/* 0x0400 */
	CT_DEAD_PARSE_G2H_RESPONSE,		/* 0x0800 */
	CT_DEAD_PARSE_G2H_UNKNOWN,		/* 0x1000 */
	CT_DEAD_PARSE_G2H_ORIGIN,		/* 0x2000 */
	CT_DEAD_PARSE_G2H_TYPE,			/* 0x4000 */
	CT_DEAD_CRASH,				/* 0x8000 */
};

static void ct_dead_worker_func(struct work_struct *w);
static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code);

#define CT_DEAD(ct, ctb, reason_code)		ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code)
#else
#define CT_DEAD(ct, ctb, reason)			\
	do {						\
		struct guc_ctb *_ctb = (ctb);		\
		if (_ctb)				\
			_ctb->info.broken = true;	\
	} while (0)
#endif

/* Used when a CT send wants to block and / or receive data */
struct g2h_fence {
	u32 *response_buffer;
	u32 seqno;
	u32 response_data;
	u16 response_len;
	u16 error;
	u16 hint;
	u16 reason;
	bool cancel;
	bool retry;
	bool fail;
	bool done;
};

#define make_u64(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo)))

static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer)
{
	g2h_fence->response_buffer = response_buffer;
	g2h_fence->response_data = 0;
	g2h_fence->response_len = 0;
	g2h_fence->fail = false;
	g2h_fence->retry = false;
	g2h_fence->done = false;
	g2h_fence->seqno = ~0x0;
}

static void g2h_fence_cancel(struct g2h_fence *g2h_fence)
{
	g2h_fence->cancel = true;
	g2h_fence->fail = true;
	g2h_fence->done = true;
}

static bool g2h_fence_needs_alloc(struct g2h_fence *g2h_fence)
{
	return g2h_fence->seqno == ~0x0;
}

static struct xe_guc *
ct_to_guc(struct xe_guc_ct *ct)
{
	return container_of(ct, struct xe_guc, ct);
}

static struct xe_gt *
ct_to_gt(struct xe_guc_ct *ct)
{
	return container_of(ct, struct xe_gt, uc.guc.ct);
}

static struct xe_device *
ct_to_xe(struct xe_guc_ct *ct)
{
	return gt_to_xe(ct_to_gt(ct));
}

/**
 * DOC: GuC CTB Blob
 *
 * We allocate a single blob to hold both CTB descriptors and buffers:
 *
 *      +--------+-----------------------------------------------+------+
 *      | offset |                     contents                  | size |
 *      +========+===============================================+======+
 *      | 0x0000 | H2G CTB Descriptor (send)                     |      |
 *      +--------+-----------------------------------------------+  4K  |
 *      | 0x0800 | G2H CTB Descriptor (g2h)                      |      |
 *      +--------+-----------------------------------------------+------+
 *      | 0x1000 | H2G CT Buffer (send)                          | n*4K |
 *      |        |                                               |      |
 *      +--------+-----------------------------------------------+------+
 *      | 0x1000 | G2H CT Buffer (g2h)                           | m*4K |
 *      | + n*4K |                                               |      |
 *      +--------+-----------------------------------------------+------+
 *
 * Size of each ``CT Buffer`` must be multiple of 4K.
 * We don't expect too many messages in flight at any time, unless we are
 * using the GuC submission. In that case each request requires a minimum
 * 2 dwords which gives us a maximum 256 queued requests. Hopefully this is
 * enough space to avoid backpressure on the driver. We increase the size
 * of the receive buffer (relative to the send) to ensure a G2H response
 * CTB has a landing spot.
 *
 * In addition to submissions, the G2H buffer needs to be able to hold
 * enough space for recoverable page fault notifications. The number of
 * page faults is interrupt driven and can be as much as the number of
 * compute resources available. However, most of the actual work for these
 * is in a separate page fault worker thread. Therefore we only need to
 * make sure the queue has enough space to handle all of the submissions
 * and responses and an extra buffer for incoming page faults.
 */

#define CTB_DESC_SIZE		ALIGN(sizeof(struct guc_ct_buffer_desc), SZ_2K)
#define CTB_H2G_BUFFER_SIZE	(SZ_4K)
#define CTB_G2H_BUFFER_SIZE	(SZ_128K)
#define G2H_ROOM_BUFFER_SIZE	(CTB_G2H_BUFFER_SIZE / 2)
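
/*
 * Illustrative sizing only (a sketch, not authoritative): with the defines
 * above, CTB_DESC_SIZE works out to 2K, so the two descriptors fill the
 * first 4K of the blob and guc_ct_size() below evaluates to:
 *
 *	2 * SZ_2K + SZ_4K + SZ_128K = 136K
 *
 * The H2G buffer then starts at offset 2 * CTB_DESC_SIZE and the G2H buffer
 * follows it at 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE, matching the layout
 * table in the DOC section above and the register helpers further down.
 */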

/**
 * xe_guc_ct_queue_proc_time_jiffies - Return maximum time to process a full
 * CT command queue
 * @ct: the &xe_guc_ct. Unused at this moment but will be used in the future.
 *
 * Observation is that a 4KiB buffer full of commands takes a little over a
 * second to process. Use that to calculate maximum time to process a full CT
 * command queue.
 *
 * Return: Maximum time to process a full CT queue in jiffies.
 */
long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct)
{
	BUILD_BUG_ON(!IS_ALIGNED(CTB_H2G_BUFFER_SIZE, SZ_4));
	return (CTB_H2G_BUFFER_SIZE / SZ_4K) * HZ;
}

static size_t guc_ct_size(void)
{
	return 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE +
		CTB_G2H_BUFFER_SIZE;
}

static void guc_ct_fini(struct drm_device *drm, void *arg)
{
	struct xe_guc_ct *ct = arg;

	ct_exit_safe_mode(ct);
	destroy_workqueue(ct->g2h_wq);
	xa_destroy(&ct->fence_lookup);
}

static void primelockdep(struct xe_guc_ct *ct)
{
	if (!IS_ENABLED(CONFIG_LOCKDEP))
		return;

	fs_reclaim_acquire(GFP_KERNEL);
	might_lock(&ct->lock);
	fs_reclaim_release(GFP_KERNEL);
}

int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	int err;

	xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));

	ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM);
	if (!ct->g2h_wq)
		return -ENOMEM;

	spin_lock_init(&ct->fast_lock);
	xa_init(&ct->fence_lookup);
	INIT_WORK(&ct->g2h_worker, g2h_worker_func);
	INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func);
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
	spin_lock_init(&ct->dead.lock);
	INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
#endif
	init_waitqueue_head(&ct->wq);
	init_waitqueue_head(&ct->g2h_fence_wq);

	err = drmm_mutex_init(&xe->drm, &ct->lock);
	if (err)
		return err;

	primelockdep(ct);

	err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct);
	if (err)
		return err;

	xe_gt_assert(gt, ct->state == XE_GUC_CT_STATE_NOT_INITIALIZED);
	ct->state = XE_GUC_CT_STATE_DISABLED;
	return 0;
}
ALLOW_ERROR_INJECTION(xe_guc_ct_init_noalloc, ERRNO); /* See xe_pci_probe() */

int xe_guc_ct_init(struct xe_guc_ct *ct)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_bo *bo;

	bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(),
					  XE_BO_FLAG_SYSTEM |
					  XE_BO_FLAG_GGTT |
					  XE_BO_FLAG_GGTT_INVALIDATE |
					  XE_BO_FLAG_PINNED_NORESTORE);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	ct->bo = bo;
	return 0;
}
ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */

#define desc_read(xe_, guc_ctb__, field_)			\
	xe_map_rd_field(xe_, &guc_ctb__->desc, 0,		\
			struct guc_ct_buffer_desc, field_)

#define desc_write(xe_, guc_ctb__, field_, val_)		\
	xe_map_wr_field(xe_, &guc_ctb__->desc, 0,		\
			struct guc_ct_buffer_desc, field_, val_)

static void guc_ct_ctb_h2g_init(struct xe_device *xe, struct guc_ctb *h2g,
				struct iosys_map *map)
{
	h2g->info.size = CTB_H2G_BUFFER_SIZE / sizeof(u32);
	h2g->info.resv_space = 0;
	h2g->info.tail = 0;
	h2g->info.head = 0;
	h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
				     h2g->info.size) -
			  h2g->info.resv_space;
	h2g->info.broken = false;

	h2g->desc = *map;
	xe_map_memset(xe, &h2g->desc, 0, 0, sizeof(struct guc_ct_buffer_desc));

	h2g->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2);
}

static void guc_ct_ctb_g2h_init(struct xe_device *xe, struct guc_ctb *g2h,
				struct iosys_map *map)
{
	g2h->info.size = CTB_G2H_BUFFER_SIZE / sizeof(u32);
	g2h->info.resv_space = G2H_ROOM_BUFFER_SIZE / sizeof(u32);
	g2h->info.head = 0;
	g2h->info.tail = 0;
	g2h->info.space = CIRC_SPACE(g2h->info.tail, g2h->info.head,
				     g2h->info.size) -
			  g2h->info.resv_space;
	g2h->info.broken = false;

	g2h->desc = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE);
	xe_map_memset(xe, &g2h->desc, 0, 0, sizeof(struct guc_ct_buffer_desc));

	g2h->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2 +
					  CTB_H2G_BUFFER_SIZE);
}

static int guc_ct_ctb_h2g_register(struct xe_guc_ct *ct)
{
	struct xe_guc *guc = ct_to_guc(ct);
	u32 desc_addr, ctb_addr, size;
	int err;

	desc_addr = xe_bo_ggtt_addr(ct->bo);
	ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2;
	size = ct->ctbs.h2g.info.size * sizeof(u32);

	err = xe_guc_self_cfg64(guc,
				GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY,
				desc_addr);
	if (err)
		return err;

	err = xe_guc_self_cfg64(guc,
				GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY,
				ctb_addr);
	if (err)
		return err;

	return xe_guc_self_cfg32(guc,
				 GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY,
				 size);
}

static int guc_ct_ctb_g2h_register(struct xe_guc_ct *ct)
{
	struct xe_guc *guc = ct_to_guc(ct);
	u32 desc_addr, ctb_addr, size;
	int err;

	desc_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE;
	ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2 +
		   CTB_H2G_BUFFER_SIZE;
	size = ct->ctbs.g2h.info.size * sizeof(u32);

	err = xe_guc_self_cfg64(guc,
				GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY,
				desc_addr);
	if (err)
		return err;

	err = xe_guc_self_cfg64(guc,
				GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY,
				ctb_addr);
	if (err)
		return err;

	return xe_guc_self_cfg32(guc,
				 GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY,
				 size);
}

static int guc_ct_control_toggle(struct xe_guc_ct *ct, bool enable)
{
	u32 request[HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN] = {
		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
			   GUC_ACTION_HOST2GUC_CONTROL_CTB),
		FIELD_PREP(HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL,
			   enable ? GUC_CTB_CONTROL_ENABLE :
			   GUC_CTB_CONTROL_DISABLE),
	};
	int ret = xe_guc_mmio_send(ct_to_guc(ct), request, ARRAY_SIZE(request));

	return ret > 0 ? -EPROTO : ret;
}

static void guc_ct_change_state(struct xe_guc_ct *ct,
				enum xe_guc_ct_state state)
{
	struct xe_gt *gt = ct_to_gt(ct);
	struct g2h_fence *g2h_fence;
	unsigned long idx;

	mutex_lock(&ct->lock); /* Serialise dequeue_one_g2h() */
	spin_lock_irq(&ct->fast_lock); /* Serialise CT fast-path */

	xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 ||
		     state == XE_GUC_CT_STATE_STOPPED);

	if (ct->g2h_outstanding)
		xe_pm_runtime_put(ct_to_xe(ct));
	ct->g2h_outstanding = 0;
	ct->state = state;

	xe_gt_dbg(gt, "GuC CT communication channel %s\n",
		  state == XE_GUC_CT_STATE_STOPPED ? "stopped" :
		  str_enabled_disabled(state == XE_GUC_CT_STATE_ENABLED));

	spin_unlock_irq(&ct->fast_lock);

	/* cancel all in-flight send-recv requests */
	xa_for_each(&ct->fence_lookup, idx, g2h_fence)
		g2h_fence_cancel(g2h_fence);

	/* make sure guc_ct_send_recv() will see g2h_fence changes */
	smp_mb();
	wake_up_all(&ct->g2h_fence_wq);

	/*
	 * Lockdep doesn't like this under the fast lock and the destroy only
	 * needs to be serialized with the send path, which the ct lock provides.
	 */
	xa_destroy(&ct->fence_lookup);

	mutex_unlock(&ct->lock);
}

static bool ct_needs_safe_mode(struct xe_guc_ct *ct)
{
	return !pci_dev_msi_enabled(to_pci_dev(ct_to_xe(ct)->drm.dev));
}

static bool ct_restart_safe_mode_worker(struct xe_guc_ct *ct)
{
	if (!ct_needs_safe_mode(ct))
		return false;

	queue_delayed_work(ct->g2h_wq, &ct->safe_mode_worker, HZ / 10);
	return true;
}

static void safe_mode_worker_func(struct work_struct *w)
{
	struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, safe_mode_worker.work);

	receive_g2h(ct);

	if (!ct_restart_safe_mode_worker(ct))
		xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode canceled\n");
}

static void ct_enter_safe_mode(struct xe_guc_ct *ct)
{
	if (ct_restart_safe_mode_worker(ct))
		xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode enabled\n");
}

static void ct_exit_safe_mode(struct xe_guc_ct *ct)
{
	if (cancel_delayed_work_sync(&ct->safe_mode_worker))
		xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode disabled\n");
}

int xe_guc_ct_enable(struct xe_guc_ct *ct)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	int err;

	xe_gt_assert(gt, !xe_guc_ct_enabled(ct));

	xe_map_memset(xe, &ct->bo->vmap, 0, 0, xe_bo_size(ct->bo));
	guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap);
	guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap);

	err = guc_ct_ctb_h2g_register(ct);
	if (err)
		goto err_out;

	err = guc_ct_ctb_g2h_register(ct);
	if (err)
		goto err_out;

	err = guc_ct_control_toggle(ct, true);
	if (err)
		goto err_out;

	guc_ct_change_state(ct, XE_GUC_CT_STATE_ENABLED);

	smp_mb();
	wake_up_all(&ct->wq);

	if (ct_needs_safe_mode(ct))
		ct_enter_safe_mode(ct);

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
	/*
	 * The CT has now been reset so the dumper can be re-armed
	 * after any existing dead state has been dumped.
	 */
	spin_lock_irq(&ct->dead.lock);
	if (ct->dead.reason) {
		ct->dead.reason |= (1 << CT_DEAD_STATE_REARM);
		queue_work(system_unbound_wq, &ct->dead.worker);
	}
	spin_unlock_irq(&ct->dead.lock);
#endif

	return 0;

err_out:
	xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err));
	CT_DEAD(ct, NULL, SETUP);

	return err;
}

static void stop_g2h_handler(struct xe_guc_ct *ct)
{
	cancel_work_sync(&ct->g2h_worker);
}

/**
 * xe_guc_ct_disable - Set GuC to disabled state
 * @ct: the &xe_guc_ct
 *
 * Set GuC CT to disabled state and stop g2h handler. No outstanding g2h expected
 * in this transition.
 */
void xe_guc_ct_disable(struct xe_guc_ct *ct)
{
	guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED);
	ct_exit_safe_mode(ct);
	stop_g2h_handler(ct);
}

/**
 * xe_guc_ct_stop - Set GuC to stopped state
 * @ct: the &xe_guc_ct
 *
 * Set GuC CT to stopped state, stop g2h handler, and clear any outstanding g2h
 */
void xe_guc_ct_stop(struct xe_guc_ct *ct)
{
	if (!xe_guc_ct_initialized(ct))
		return;

	guc_ct_change_state(ct, XE_GUC_CT_STATE_STOPPED);
	stop_g2h_handler(ct);
}

static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len)
{
	struct guc_ctb *h2g = &ct->ctbs.h2g;

	lockdep_assert_held(&ct->lock);

	if (cmd_len > h2g->info.space) {
		h2g->info.head = desc_read(ct_to_xe(ct), h2g, head);

		if (h2g->info.head > h2g->info.size) {
			struct xe_device *xe = ct_to_xe(ct);
			u32 desc_status = desc_read(xe, h2g, status);

			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);

			xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u)\n",
				  h2g->info.head, h2g->info.size);
			CT_DEAD(ct, h2g, H2G_HAS_ROOM);
			return false;
		}

		h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
					     h2g->info.size) -
				  h2g->info.resv_space;
		if (cmd_len > h2g->info.space)
			return false;
	}

	return true;
}

static bool g2h_has_room(struct xe_guc_ct *ct, u32 g2h_len)
{
	if (!g2h_len)
		return true;

	lockdep_assert_held(&ct->fast_lock);

	return ct->ctbs.g2h.info.space > g2h_len;
}

static int has_room(struct xe_guc_ct *ct, u32 cmd_len, u32 g2h_len)
{
	lockdep_assert_held(&ct->lock);

	if (!g2h_has_room(ct, g2h_len) || !h2g_has_room(ct, cmd_len))
		return -EBUSY;

	return 0;
}

static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len)
{
	lockdep_assert_held(&ct->lock);
	ct->ctbs.h2g.info.space -= cmd_len;
}

static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h)
{
	xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space);
	xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) ||
		     (g2h_len && num_g2h));

	if (g2h_len) {
		lockdep_assert_held(&ct->fast_lock);

		if (!ct->g2h_outstanding)
			xe_pm_runtime_get_noresume(ct_to_xe(ct));

		ct->ctbs.g2h.info.space -= g2h_len;
		ct->g2h_outstanding += num_g2h;
	}
}

static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
{
	bool bad = false;

	lockdep_assert_held(&ct->fast_lock);

	bad = ct->ctbs.g2h.info.space + g2h_len >
	      ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space;
	bad |= !ct->g2h_outstanding;

	if (bad) {
		xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n",
			  ct->ctbs.g2h.info.space, g2h_len,
			  ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space,
			  ct->ctbs.g2h.info.space + g2h_len,
			  ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space,
			  ct->g2h_outstanding);
		CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE);
		return;
	}

	ct->ctbs.g2h.info.space += g2h_len;
	if (!--ct->g2h_outstanding)
		xe_pm_runtime_put(ct_to_xe(ct));
}

static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
{
	spin_lock_irq(&ct->fast_lock);
	__g2h_release_space(ct, g2h_len);
	spin_unlock_irq(&ct->fast_lock);
}
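
/*
 * Credit accounting sketch (illustrative, not a spec): an H2G that expects
 * G2H traffic takes 'g2h_len' dwords of credit up front via
 * __g2h_reserve_space() and the matching G2H handler returns them through
 * g2h_release_space(). Note that info.space is initialised with resv_space
 * (G2H_ROOM_BUFFER_SIZE) already subtracted, so roughly half of the G2H
 * buffer is kept out of the credit pool as headroom and g2h_has_room() only
 * ever hands out the remainder.
 */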

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action)
{
	unsigned int slot = fence % ARRAY_SIZE(ct->fast_req);
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
	unsigned long entries[SZ_32];
	unsigned int n;

	n = stack_trace_save(entries, ARRAY_SIZE(entries), 1);

	/* May be called under spinlock, so avoid sleeping */
	ct->fast_req[slot].stack = stack_depot_save(entries, n, GFP_NOWAIT);
#endif
	ct->fast_req[slot].fence = fence;
	ct->fast_req[slot].action = action;
}
#else
static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action)
{
}
#endif

/*
 * The CT protocol accepts a 16-bit fence. This field is fully owned by the
 * driver, the GuC will just copy it to the reply message. Since we need to
 * be able to distinguish between replies to REQUEST and FAST_REQUEST messages,
 * we use one bit of the seqno as an indicator for that and a rolling counter
 * for the remaining 15 bits.
 */
#define CT_SEQNO_MASK GENMASK(14, 0)
#define CT_SEQNO_UNTRACKED BIT(15)
static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
{
	u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;

	if (!is_g2h_fence)
		seqno |= CT_SEQNO_UNTRACKED;

	return seqno;
}
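
/*
 * Fence encoding example (illustrative only): with the masks above, a
 * blocking REQUEST allocated as rolling counter 5 is sent with fence 0x0005
 * and tracked in ct->fence_lookup, while a FAST_REQUEST with the same
 * counter value is sent with fence 0x8005 (CT_SEQNO_UNTRACKED set) and is
 * only remembered by fast_req_track() for debug reporting.
 */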

#define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */

static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
		     u32 ct_fence_value, bool want_response)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	struct guc_ctb *h2g = &ct->ctbs.h2g;
	u32 cmd[H2G_CT_HEADERS];
	u32 tail = h2g->info.tail;
	u32 full_len;
	struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds,
						     tail * sizeof(u32));
	u32 desc_status;

	full_len = len + GUC_CTB_HDR_LEN;

	lockdep_assert_held(&ct->lock);
	xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN);

	desc_status = desc_read(xe, h2g, status);
	if (desc_status) {
		xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status);
		goto corrupted;
	}

	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
		u32 desc_tail = desc_read(xe, h2g, tail);
		u32 desc_head = desc_read(xe, h2g, head);

		if (tail != desc_tail) {
			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH);
			xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail);
			goto corrupted;
		}

		if (tail > h2g->info.size) {
			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
			xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n",
				  tail, h2g->info.size);
			goto corrupted;
		}

		if (desc_head >= h2g->info.size) {
			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
			xe_gt_err(gt, "CT write: invalid head offset %u >= %u)\n",
				  desc_head, h2g->info.size);
			goto corrupted;
		}
	}

	/* Command will wrap, zero fill (NOPs), return and check credits again */
	if (tail + full_len > h2g->info.size) {
		xe_map_memset(xe, &map, 0, 0,
			      (h2g->info.size - tail) * sizeof(u32));
		h2g_reserve_space(ct, (h2g->info.size - tail));
		h2g->info.tail = 0;
		desc_write(xe, h2g, tail, h2g->info.tail);

		return -EAGAIN;
	}

	/*
	 * dw0: CT header (including fence)
	 * dw1: HXG header (including action code)
	 * dw2+: action data
	 */
	cmd[0] = FIELD_PREP(GUC_CTB_MSG_0_FORMAT, GUC_CTB_FORMAT_HXG) |
		FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
		FIELD_PREP(GUC_CTB_MSG_0_FENCE, ct_fence_value);
	if (want_response) {
		cmd[1] =
			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
			FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
	} else {
		fast_req_track(ct, ct_fence_value,
			       FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, action[0]));

		cmd[1] =
			FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
			FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
				   GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
	}

	/* H2G header in cmd[1] replaces action[0] so: */
	--len;
	++action;

	/* Write H2G ensuring visible before descriptor update */
	xe_map_memcpy_to(xe, &map, 0, cmd, H2G_CT_HEADERS * sizeof(u32));
	xe_map_memcpy_to(xe, &map, H2G_CT_HEADERS * sizeof(u32), action, len * sizeof(u32));
	xe_device_wmb(xe);

	/* Update local copies */
	h2g->info.tail = (tail + full_len) % h2g->info.size;
	h2g_reserve_space(ct, full_len);

	/* Update descriptor */
	desc_write(xe, h2g, tail, h2g->info.tail);

	trace_xe_guc_ctb_h2g(xe, gt->info.id, *(action - 1), full_len,
			     desc_read(xe, h2g, head), h2g->info.tail);

	return 0;

corrupted:
	CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE);
	return -EPIPE;
}

static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
				u32 len, u32 g2h_len, u32 num_g2h,
				struct g2h_fence *g2h_fence)
{
	struct xe_gt *gt __maybe_unused = ct_to_gt(ct);
	u16 seqno;
	int ret;

	xe_gt_assert(gt, xe_guc_ct_initialized(ct));
	xe_gt_assert(gt, !g2h_len || !g2h_fence);
	xe_gt_assert(gt, !num_g2h || !g2h_fence);
	xe_gt_assert(gt, !g2h_len || num_g2h);
	xe_gt_assert(gt, g2h_len || !num_g2h);
	lockdep_assert_held(&ct->lock);

	if (unlikely(ct->ctbs.h2g.info.broken)) {
		ret = -EPIPE;
		goto out;
	}

	if (ct->state == XE_GUC_CT_STATE_DISABLED) {
		ret = -ENODEV;
		goto out;
	}

	if (ct->state == XE_GUC_CT_STATE_STOPPED) {
		ret = -ECANCELED;
		goto out;
	}

	xe_gt_assert(gt, xe_guc_ct_enabled(ct));

	if (g2h_fence) {
		g2h_len = GUC_CTB_HXG_MSG_MAX_LEN;
		num_g2h = 1;

		if (g2h_fence_needs_alloc(g2h_fence)) {
			g2h_fence->seqno = next_ct_seqno(ct, true);
			ret = xa_err(xa_store(&ct->fence_lookup,
					      g2h_fence->seqno, g2h_fence,
					      GFP_ATOMIC));
			if (ret)
				goto out;
		}

		seqno = g2h_fence->seqno;
	} else {
		seqno = next_ct_seqno(ct, false);
	}

	if (g2h_len)
		spin_lock_irq(&ct->fast_lock);
retry:
	ret = has_room(ct, len + GUC_CTB_HDR_LEN, g2h_len);
	if (unlikely(ret))
		goto out_unlock;

	ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
	if (unlikely(ret)) {
		if (ret == -EAGAIN)
			goto retry;
		goto out_unlock;
	}

	__g2h_reserve_space(ct, g2h_len, num_g2h);
	xe_guc_notify(ct_to_guc(ct));
out_unlock:
	if (g2h_len)
		spin_unlock_irq(&ct->fast_lock);
out:
	return ret;
}

static void kick_reset(struct xe_guc_ct *ct)
{
	xe_gt_reset_async(ct_to_gt(ct));
}

static int dequeue_one_g2h(struct xe_guc_ct *ct);

static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
			      u32 g2h_len, u32 num_g2h,
			      struct g2h_fence *g2h_fence)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	unsigned int sleep_period_ms = 1;
	int ret;

	xe_gt_assert(gt, !g2h_len || !g2h_fence);
	lockdep_assert_held(&ct->lock);
	xe_device_assert_mem_access(ct_to_xe(ct));

try_again:
	ret = __guc_ct_send_locked(ct, action, len, g2h_len, num_g2h,
				   g2h_fence);

	/*
	 * We wait to try to restore credits for about 1 second before bailing.
	 * In the case of H2G credits we have no choice but just to wait for the
	 * GuC to consume H2Gs in the channel so we use a wait / sleep loop. In
	 * the case of G2H we process any G2H in the channel, hopefully freeing
	 * credits as we consume the G2H messages.
	 */
	if (unlikely(ret == -EBUSY &&
		     !h2g_has_room(ct, len + GUC_CTB_HDR_LEN))) {
		struct guc_ctb *h2g = &ct->ctbs.h2g;

		if (sleep_period_ms == 1024)
			goto broken;

		trace_xe_guc_ct_h2g_flow_control(xe, h2g->info.head, h2g->info.tail,
						 h2g->info.size,
						 h2g->info.space,
						 len + GUC_CTB_HDR_LEN);
		msleep(sleep_period_ms);
		sleep_period_ms <<= 1;

		goto try_again;
	} else if (unlikely(ret == -EBUSY)) {
		struct xe_device *xe = ct_to_xe(ct);
		struct guc_ctb *g2h = &ct->ctbs.g2h;

		trace_xe_guc_ct_g2h_flow_control(xe, g2h->info.head,
						 desc_read(xe, g2h, tail),
						 g2h->info.size,
						 g2h->info.space,
						 g2h_fence ?
						 GUC_CTB_HXG_MSG_MAX_LEN :
						 g2h_len);

#define g2h_avail(ct)	\
	(desc_read(ct_to_xe(ct), (&ct->ctbs.g2h), tail) != ct->ctbs.g2h.info.head)
		if (!wait_event_timeout(ct->wq, !ct->g2h_outstanding ||
					g2h_avail(ct), HZ))
			goto broken;
#undef g2h_avail

		ret = dequeue_one_g2h(ct);
		if (ret < 0) {
			if (ret != -ECANCELED)
				xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
					  ERR_PTR(ret));
			goto broken;
		}

		goto try_again;
	}

	return ret;

broken:
	xe_gt_err(gt, "No forward process on H2G, reset required\n");
	CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK);

	return -EDEADLK;
}

static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
		       u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence)
{
	int ret;

	xe_gt_assert(ct_to_gt(ct), !g2h_len || !g2h_fence);

	mutex_lock(&ct->lock);
	ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence);
	mutex_unlock(&ct->lock);

	return ret;
}

int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
		   u32 g2h_len, u32 num_g2h)
{
	int ret;

	ret = guc_ct_send(ct, action, len, g2h_len, num_g2h, NULL);
	if (ret == -EDEADLK)
		kick_reset(ct);

	return ret;
}

int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
			  u32 g2h_len, u32 num_g2h)
{
	int ret;

	ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, NULL);
	if (ret == -EDEADLK)
		kick_reset(ct);

	return ret;
}

int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, u32 len)
{
	int ret;

	lockdep_assert_held(&ct->lock);

	ret = guc_ct_send_locked(ct, action, len, 0, 0, NULL);
	if (ret == -EDEADLK)
		kick_reset(ct);

	return ret;
}
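
/*
 * Illustrative (non-authoritative) caller sketch for the non-blocking send
 * path above; real callers build the action array from the per-action
 * layouts in abi/guc_actions_abi.h and pass the real G2H credit size:
 *
 *	u32 action[] = { XE_GUC_ACTION_FOO, data0, data1 };	// hypothetical action
 *
 *	ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action),
 *			     G2H_LEN_DW_FOO, 1);		// hypothetical credit count
 */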

/*
 * Check if a GT reset is in progress or will occur and if GT reset brought the
 * CT back up. Randomly picking 5 seconds for an upper limit to do a GT reset.
 */
static bool retry_failure(struct xe_guc_ct *ct, int ret)
{
	if (!(ret == -EDEADLK || ret == -EPIPE || ret == -ENODEV))
		return false;

#define ct_alive(ct)	\
	(xe_guc_ct_enabled(ct) && !ct->ctbs.h2g.info.broken && \
	 !ct->ctbs.g2h.info.broken)
	if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5))
		return false;
#undef ct_alive

	return true;
}
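
/*
 * Illustrative (non-authoritative) sketch of the blocking path implemented
 * below and exposed via xe_guc_ct_send_recv(): the caller may provide a
 * response buffer able to hold GUC_CTB_MAX_DWORDS dwords, e.g.:
 *
 *	u32 response[GUC_CTB_MAX_DWORDS];
 *	u32 action[] = { XE_GUC_ACTION_FOO, data0 };	// hypothetical action
 *
 *	ret = xe_guc_ct_send_recv(ct, action, ARRAY_SIZE(action), response);
 *
 * On success the return value is the response length in dwords (or DATA0 of
 * the response when no buffer is passed), per the kernel-doc further down.
 */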

static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
			    u32 *response_buffer, bool no_fail)
{
	struct xe_gt *gt = ct_to_gt(ct);
	struct g2h_fence g2h_fence;
	int ret = 0;

	/*
	 * We use a fence to implement blocking sends / receiving response data.
	 * The seqno of the fence is sent in the H2G, returned in the G2H, and
	 * an xarray is used as storage media with the seqno being the key.
	 * Fields in the fence hold success, failure, retry status and the
	 * response data. Safe to allocate on the stack as the xarray is the
	 * only reference and it cannot be present after this function exits.
	 */
retry:
	g2h_fence_init(&g2h_fence, response_buffer);
retry_same_fence:
	ret = guc_ct_send(ct, action, len, 0, 0, &g2h_fence);
	if (unlikely(ret == -ENOMEM)) {
		/* Retry allocation w/ GFP_KERNEL */
		ret = xa_err(xa_store(&ct->fence_lookup, g2h_fence.seqno,
				      &g2h_fence, GFP_KERNEL));
		if (ret)
			return ret;

		goto retry_same_fence;
	} else if (unlikely(ret)) {
		if (ret == -EDEADLK)
			kick_reset(ct);

		if (no_fail && retry_failure(ct, ret))
			goto retry_same_fence;

		if (!g2h_fence_needs_alloc(&g2h_fence))
			xa_erase(&ct->fence_lookup, g2h_fence.seqno);

		return ret;
	}

	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
	if (!ret) {
		LNL_FLUSH_WORK(&ct->g2h_worker);
		if (g2h_fence.done) {
			xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
				   g2h_fence.seqno, action[0]);
			ret = 1;
		}
	}

	/*
	 * Ensure we serialize with completion side to prevent UAF with fence going out of scope on
	 * the stack, since we have no clue if it will fire after the timeout before we can erase
	 * from the xa. Also we have some dependent loads and stores below for which we need the
	 * correct ordering, and we lack the needed barriers.
	 */
	mutex_lock(&ct->lock);
	if (!ret) {
		xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s",
			  g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done));
		xa_erase(&ct->fence_lookup, g2h_fence.seqno);
		mutex_unlock(&ct->lock);
		return -ETIME;
	}

	if (g2h_fence.retry) {
		xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n",
			  action[0], g2h_fence.reason);
		mutex_unlock(&ct->lock);
		goto retry;
	}
	if (g2h_fence.fail) {
		if (g2h_fence.cancel) {
			xe_gt_dbg(gt, "H2G request %#x canceled!\n", action[0]);
			ret = -ECANCELED;
			goto unlock;
		}
		xe_gt_err(gt, "H2G request %#x failed: error %#x hint %#x\n",
			  action[0], g2h_fence.error, g2h_fence.hint);
		ret = -EIO;
	}

	if (ret > 0)
		ret = response_buffer ? g2h_fence.response_len : g2h_fence.response_data;

unlock:
	mutex_unlock(&ct->lock);

	return ret;
}

/**
 * xe_guc_ct_send_recv - Send and receive HXG to the GuC
 * @ct: the &xe_guc_ct
 * @action: the dword array with `HXG Request`_ message (can't be NULL)
 * @len: length of the `HXG Request`_ message (in dwords, can't be 0)
 * @response_buffer: placeholder for the `HXG Response`_ message (can be NULL)
 *
 * Send a `HXG Request`_ message to the GuC over CT communication channel and
 * blocks until GuC replies with a `HXG Response`_ message.
 *
 * For non-blocking communication with GuC use xe_guc_ct_send().
 *
 * Note: The size of &response_buffer must be at least GUC_CTB_MAX_DWORDS_.
 *
 * Return: response length (in dwords) if &response_buffer was not NULL, or
 *         DATA0 from `HXG Response`_ if &response_buffer was NULL, or
 *         a negative error code on failure.
 */
int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
			u32 *response_buffer)
{
	KUNIT_STATIC_STUB_REDIRECT(xe_guc_ct_send_recv, ct, action, len, response_buffer);
	return guc_ct_send_recv(ct, action, len, response_buffer, false);
}
ALLOW_ERROR_INJECTION(xe_guc_ct_send_recv, ERRNO);

int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action,
				u32 len, u32 *response_buffer)
{
	return guc_ct_send_recv(ct, action, len, response_buffer, true);
}

static u32 *msg_to_hxg(u32 *msg)
{
	return msg + GUC_CTB_MSG_MIN_LEN;
}

static u32 msg_len_to_hxg_len(u32 len)
{
	return len - GUC_CTB_MSG_MIN_LEN;
}

static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len)
{
	u32 *hxg = msg_to_hxg(msg);
	u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);

	lockdep_assert_held(&ct->lock);

	switch (action) {
	case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE:
	case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE:
	case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE:
	case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
		g2h_release_space(ct, len);
	}

	return 0;
}

static int guc_crash_process_msg(struct xe_guc_ct *ct, u32 action)
{
	struct xe_gt *gt = ct_to_gt(ct);

	if (action == XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
		xe_gt_err(gt, "GuC Crash dump notification\n");
	else if (action == XE_GUC_ACTION_NOTIFY_EXCEPTION)
		xe_gt_err(gt, "GuC Exception notification\n");
	else
		xe_gt_err(gt, "Unknown GuC crash notification: 0x%04X\n", action);

	CT_DEAD(ct, NULL, CRASH);

	kick_reset(ct);

	return 0;
}

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
static void fast_req_report(struct xe_guc_ct *ct, u16 fence)
{
	u16 fence_min = U16_MAX, fence_max = 0;
	struct xe_gt *gt = ct_to_gt(ct);
	bool found = false;
	unsigned int n;
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
	char *buf;
#endif

	lockdep_assert_held(&ct->lock);

	for (n = 0; n < ARRAY_SIZE(ct->fast_req); n++) {
		if (ct->fast_req[n].fence < fence_min)
			fence_min = ct->fast_req[n].fence;
		if (ct->fast_req[n].fence > fence_max)
			fence_max = ct->fast_req[n].fence;

		if (ct->fast_req[n].fence != fence)
			continue;
		found = true;

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
		buf = kmalloc(SZ_4K, GFP_NOWAIT);
		if (buf && stack_depot_snprint(ct->fast_req[n].stack, buf, SZ_4K, 0))
			xe_gt_err(gt, "Fence 0x%x was used by action %#04x sent at:\n%s",
				  fence, ct->fast_req[n].action, buf);
		else
			xe_gt_err(gt, "Fence 0x%x was used by action %#04x [failed to retrieve stack]\n",
				  fence, ct->fast_req[n].action);
		kfree(buf);
#else
		xe_gt_err(gt, "Fence 0x%x was used by action %#04x\n",
			  fence, ct->fast_req[n].action);
#endif
		break;
	}

	if (!found)
		xe_gt_warn(gt, "Fence 0x%x not found - tracking buffer wrapped? [range = 0x%x -> 0x%x, next = 0x%X]\n",
			   fence, fence_min, fence_max, ct->fence_seqno);
}
#else
static void fast_req_report(struct xe_guc_ct *ct, u16 fence)
{
}
#endif

static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
{
	struct xe_gt *gt = ct_to_gt(ct);
	u32 *hxg = msg_to_hxg(msg);
	u32 hxg_len = msg_len_to_hxg_len(len);
	u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
	u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]);
	struct g2h_fence *g2h_fence;

	lockdep_assert_held(&ct->lock);

	/*
	 * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup.
	 * Those messages should never fail, so if we do get an error back it
	 * means we're likely doing an illegal operation and the GuC is
	 * rejecting it. We have no way to inform the code that submitted the
	 * H2G that the message was rejected, so we need to escalate the
	 * failure to trigger a reset.
	 */
	if (fence & CT_SEQNO_UNTRACKED) {
		if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
			xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n",
				  fence,
				  FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]),
				  FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]));
		else
			xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
				  type, fence);

		fast_req_report(ct, fence);

		CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE);

		return -EPROTO;
	}

	g2h_fence = xa_erase(&ct->fence_lookup, fence);
	if (unlikely(!g2h_fence)) {
		/* Don't tear down channel, as send could've timed out */
		/* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */
		xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
		g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
		return 0;
	}

	xe_gt_assert(gt, fence == g2h_fence->seqno);

	if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) {
		g2h_fence->fail = true;
		g2h_fence->error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]);
		g2h_fence->hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]);
	} else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) {
		g2h_fence->retry = true;
		g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]);
	} else if (g2h_fence->response_buffer) {
		g2h_fence->response_len = hxg_len;
		memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32));
	} else {
		g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]);
	}

	g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);

	g2h_fence->done = true;
	smp_mb();

	wake_up_all(&ct->g2h_fence_wq);

	return 0;
}

static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
{
	struct xe_gt *gt = ct_to_gt(ct);
	u32 *hxg = msg_to_hxg(msg);
	u32 origin, type;
	int ret;

	lockdep_assert_held(&ct->lock);

	origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg[0]);
	if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
		xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n",
			  origin);
		CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN);

		return -EPROTO;
	}

	type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]);
	switch (type) {
	case GUC_HXG_TYPE_EVENT:
		ret = parse_g2h_event(ct, msg, len);
		break;
	case GUC_HXG_TYPE_RESPONSE_SUCCESS:
	case GUC_HXG_TYPE_RESPONSE_FAILURE:
	case GUC_HXG_TYPE_NO_RESPONSE_RETRY:
		ret = parse_g2h_response(ct, msg, len);
		break;
	default:
		xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n",
			  type);
		CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE);

		ret = -EOPNOTSUPP;
	}

	return ret;
}

static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
{
	struct xe_guc *guc = ct_to_guc(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	u32 hxg_len = msg_len_to_hxg_len(len);
	u32 *hxg = msg_to_hxg(msg);
	u32 action, adj_len;
	u32 *payload;
	int ret = 0;

	if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
		return 0;

	action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
	payload = hxg + GUC_HXG_EVENT_MSG_MIN_LEN;
	adj_len = hxg_len - GUC_HXG_EVENT_MSG_MIN_LEN;

	switch (action) {
	case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE:
		ret = xe_guc_sched_done_handler(guc, payload, adj_len);
		break;
	case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE:
		ret = xe_guc_deregister_done_handler(guc, payload, adj_len);
		break;
	case XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION:
		ret = xe_guc_exec_queue_reset_handler(guc, payload, adj_len);
		break;
	case XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION:
		ret = xe_guc_exec_queue_reset_failure_handler(guc, payload,
							      adj_len);
		break;
	case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE:
		/* Selftest only at the moment */
		break;
	case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION:
		ret = xe_guc_error_capture_handler(guc, payload, adj_len);
		break;
	case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE:
		/* FIXME: Handle this */
		break;
	case XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR:
		ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload,
								 adj_len);
		break;
	case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
		ret = xe_guc_pagefault_handler(guc, payload, adj_len);
		break;
	case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
		ret = xe_guc_tlb_invalidation_done_handler(guc, payload,
							   adj_len);
		break;
	case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY:
		ret = xe_guc_access_counter_notify_handler(guc, payload,
							   adj_len);
		break;
	case XE_GUC_ACTION_GUC2PF_RELAY_FROM_VF:
		ret = xe_guc_relay_process_guc2pf(&guc->relay, hxg, hxg_len);
		break;
	case XE_GUC_ACTION_GUC2VF_RELAY_FROM_PF:
		ret = xe_guc_relay_process_guc2vf(&guc->relay, hxg, hxg_len);
		break;
	case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY:
		ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len);
		break;
	case GUC_ACTION_GUC2PF_ADVERSE_EVENT:
		ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len);
		break;
	case XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
	case XE_GUC_ACTION_NOTIFY_EXCEPTION:
		ret = guc_crash_process_msg(ct, action);
		break;
	default:
		xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
	}

	if (ret) {
		xe_gt_err(gt, "G2H action %#04x failed (%pe) len %u msg %*ph\n",
			  action, ERR_PTR(ret), hxg_len, (int)sizeof(u32) * hxg_len, hxg);
		CT_DEAD(ct, NULL, PROCESS_FAILED);
	}

	return 0;
}

static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	struct guc_ctb *g2h = &ct->ctbs.g2h;
	u32 tail, head, len, desc_status;
	s32 avail;
	u32 action;
	u32 *hxg;

	xe_gt_assert(gt, xe_guc_ct_initialized(ct));
	lockdep_assert_held(&ct->fast_lock);

	if (ct->state == XE_GUC_CT_STATE_DISABLED)
		return -ENODEV;

	if (ct->state == XE_GUC_CT_STATE_STOPPED)
		return -ECANCELED;

	if (g2h->info.broken)
		return -EPIPE;

	xe_gt_assert(gt, xe_guc_ct_enabled(ct));

	desc_status = desc_read(xe, g2h, status);
	if (desc_status) {
		if (desc_status & GUC_CTB_STATUS_DISABLED) {
			/*
			 * Potentially valid if a CLIENT_RESET request resulted in
			 * contexts/engines being reset. But should never happen as
			 * no contexts should be active when CLIENT_RESET is sent.
			 */
			xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n");
			desc_status &= ~GUC_CTB_STATUS_DISABLED;
		}

		if (desc_status) {
			xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status);
			goto corrupted;
		}
	}

	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
		u32 desc_tail = desc_read(xe, g2h, tail);
		/*
		u32 desc_head = desc_read(xe, g2h, head);

		 * info.head and desc_head are updated back-to-back at the end of
		 * this function and nowhere else. Hence, they cannot be different
		 * unless two g2h_read calls are running concurrently. Which is not
		 * possible because it is guarded by ct->fast_lock. And yet, some
		 * discrete platforms are regularly hitting this error :(.
		 *
		 * desc_head rolling backwards shouldn't cause any noticeable
		 * problems - just a delay in GuC being allowed to proceed past that
		 * point in the queue. So for now, just disable the error until it
		 * can be root caused.
		 *
		if (g2h->info.head != desc_head) {
			desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH);
			xe_gt_err(gt, "CT read: head was modified %u != %u\n",
				  desc_head, g2h->info.head);
			goto corrupted;
		}
		*/

		if (g2h->info.head > g2h->info.size) {
			desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
			xe_gt_err(gt, "CT read: head out of range: %u vs %u\n",
				  g2h->info.head, g2h->info.size);
			goto corrupted;
		}

		if (desc_tail >= g2h->info.size) {
			desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
			xe_gt_err(gt, "CT read: invalid tail offset %u >= %u)\n",
				  desc_tail, g2h->info.size);
			goto corrupted;
		}
	}

	/* Calculate DW available to read */
	tail = desc_read(xe, g2h, tail);
	avail = tail - g2h->info.head;
	if (unlikely(avail == 0))
		return 0;

	if (avail < 0)
		avail += g2h->info.size;

	/* Read header */
	xe_map_memcpy_from(xe, msg, &g2h->cmds, sizeof(u32) * g2h->info.head,
			   sizeof(u32));
	len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN;
	if (len > avail) {
		xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n",
			  avail, len);
		goto corrupted;
	}

	head = (g2h->info.head + 1) % g2h->info.size;
	avail = len - 1;

	/* Read G2H message */
	if (avail + head > g2h->info.size) {
		u32 avail_til_wrap = g2h->info.size - head;

		xe_map_memcpy_from(xe, msg + 1,
				   &g2h->cmds, sizeof(u32) * head,
				   avail_til_wrap * sizeof(u32));
		xe_map_memcpy_from(xe, msg + 1 + avail_til_wrap,
				   &g2h->cmds, 0,
				   (avail - avail_til_wrap) * sizeof(u32));
	} else {
		xe_map_memcpy_from(xe, msg + 1,
				   &g2h->cmds, sizeof(u32) * head,
				   avail * sizeof(u32));
	}

	hxg = msg_to_hxg(msg);
	action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);

	if (fast_path) {
		if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
			return 0;

		switch (action) {
		case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
		case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
			break;	/* Process these in fast-path */
		default:
			return 0;
		}
	}

	/* Update local / descriptor header */
	g2h->info.head = (head + avail) % g2h->info.size;
	desc_write(xe, g2h, head, g2h->info.head);

	trace_xe_guc_ctb_g2h(xe, ct_to_gt(ct)->info.id,
			     action, len, g2h->info.head, tail);

	return len;

corrupted:
	CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ);
	return -EPROTO;
}

static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
{
	struct xe_gt *gt = ct_to_gt(ct);
	struct xe_guc *guc = ct_to_guc(ct);
	u32 hxg_len = msg_len_to_hxg_len(len);
	u32 *hxg = msg_to_hxg(msg);
	u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
	u32 *payload = hxg + GUC_HXG_MSG_MIN_LEN;
	u32 adj_len = hxg_len - GUC_HXG_MSG_MIN_LEN;
	int ret = 0;

	switch (action) {
	case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
		ret = xe_guc_pagefault_handler(guc, payload, adj_len);
		break;
	case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
		__g2h_release_space(ct, len);
		ret = xe_guc_tlb_invalidation_done_handler(guc, payload,
							   adj_len);
		break;
	default:
		xe_gt_warn(gt, "NOT_POSSIBLE");
	}

	if (ret) {
		xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
			  action, ERR_PTR(ret));
		CT_DEAD(ct, NULL, FAST_G2H);
	}
}

/**
 * xe_guc_ct_fast_path - process critical G2H in the IRQ handler
 * @ct: GuC CT object
 *
 * Anything related to page faults is critical for performance, process these
 * critical G2H in the IRQ. This is safe as these handlers either just wake up
 * waiters or queue another worker.
 */
void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
{
	struct xe_device *xe = ct_to_xe(ct);
	bool ongoing;
	int len;

	ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct));
	if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL)
		return;

	spin_lock(&ct->fast_lock);
	do {
		len = g2h_read(ct, ct->fast_msg, true);
		if (len > 0)
			g2h_fast_path(ct, ct->fast_msg, len);
	} while (len > 0);
	spin_unlock(&ct->fast_lock);

	if (ongoing)
		xe_pm_runtime_put(xe);
}

/* Returns less than zero on error, 0 on done, 1 on more available */
static int dequeue_one_g2h(struct xe_guc_ct *ct)
{
	int len;
	int ret;

	lockdep_assert_held(&ct->lock);

	spin_lock_irq(&ct->fast_lock);
	len = g2h_read(ct, ct->msg, false);
	spin_unlock_irq(&ct->fast_lock);
	if (len <= 0)
		return len;

	ret = parse_g2h_msg(ct, ct->msg, len);
	if (unlikely(ret < 0))
		return ret;

	ret = process_g2h_msg(ct, ct->msg, len);
	if (unlikely(ret < 0))
		return ret;

	return 1;
}

static void receive_g2h(struct xe_guc_ct *ct)
{
	bool ongoing;
	int ret;

	/*
	 * Normal users must always hold mem_access.ref around CT calls. However
	 * during the runtime pm callbacks we rely on CT to talk to the GuC, but
	 * at this stage we can't rely on mem_access.ref and even the
	 * callback_task will be different than current. For such cases we just
	 * need to ensure we always process the responses from any blocking
	 * ct_send requests or where we otherwise expect some response when
	 * initiated from those callbacks (which will need to wait for the below
	 * dequeue_one_g2h()). The dequeue_one_g2h() will gracefully fail if
	 * the device has suspended to the point that the CT communication has
	 * been disabled.
	 *
	 * If we are inside the runtime pm callback, we can be the only task
	 * still issuing CT requests (since that requires having the
	 * mem_access.ref). It seems like it might in theory be possible to
	 * receive unsolicited events from the GuC just as we are
	 * suspending-resuming, but those will currently anyway be lost when
	 * eventually exiting from suspend, hence no need to wake up the device
	 * here. If we ever need something stronger than get_if_ongoing() then
	 * we need to be careful with blocking the pm callbacks from getting CT
	 * responses, if the worker here is blocked on those callbacks
	 * completing, creating a deadlock.
	 */
	ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct));
	if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL)
		return;

	do {
		mutex_lock(&ct->lock);
		ret = dequeue_one_g2h(ct);
		mutex_unlock(&ct->lock);

		if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) {
			xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret);
			CT_DEAD(ct, NULL, G2H_RECV);
			kick_reset(ct);
		}
	} while (ret == 1);

	if (ongoing)
		xe_pm_runtime_put(ct_to_xe(ct));
}

static void g2h_worker_func(struct work_struct *w)
{
	struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, g2h_worker);

	receive_g2h(ct);
}

static void xe_fixup_u64_in_cmds(struct xe_device *xe, struct iosys_map *cmds,
				 u32 size, u32 idx, s64 shift)
{
	u32 hi, lo;
	u64 offset;

	lo = xe_map_rd_ring_u32(xe, cmds, idx, size);
	hi = xe_map_rd_ring_u32(xe, cmds, idx + 1, size);
	offset = make_u64(hi, lo);
	offset += shift;
	lo = lower_32_bits(offset);
	hi = upper_32_bits(offset);
	xe_map_wr_ring_u32(xe, cmds, idx, size, lo);
	xe_map_wr_ring_u32(xe, cmds, idx + 1, size, hi);
}

/*
 * Shift any GGTT addresses within a single message that was left in the CTB
 * from before post-migration recovery.
 * @ct: pointer to CT struct of the target GuC
 * @cmds: iomap buffer containing CT messages
 * @head: start of the target message within the buffer
 * @len: length of the target message
 * @size: size of the commands buffer
 * @shift: the address shift to be added to each GGTT reference
 * Return: true if the message was fixed or needed no fixups, false on failure
 */
static bool ct_fixup_ggtt_in_message(struct xe_guc_ct *ct,
				     struct iosys_map *cmds, u32 head,
				     u32 len, u32 size, s64 shift)
{
	struct xe_gt *gt = ct_to_gt(ct);
	struct xe_device *xe = ct_to_xe(ct);
	u32 msg[GUC_HXG_MSG_MIN_LEN];
	u32 action, i, n;

	xe_gt_assert(gt, len >= GUC_HXG_MSG_MIN_LEN);

	msg[0] = xe_map_rd_ring_u32(xe, cmds, head, size);
	action = FIELD_GET(GUC_HXG_REQUEST_MSG_0_ACTION, msg[0]);

	xe_gt_sriov_dbg_verbose(gt, "fixing H2G %#x\n", action);

	switch (action) {
	case XE_GUC_ACTION_REGISTER_CONTEXT:
		if (len != XE_GUC_REGISTER_CONTEXT_MSG_LEN)
			goto err_len;
		xe_fixup_u64_in_cmds(xe, cmds, size, head +
				     XE_GUC_REGISTER_CONTEXT_DATA_5_WQ_DESC_ADDR_LOWER,
				     shift);
		xe_fixup_u64_in_cmds(xe, cmds, size, head +
				     XE_GUC_REGISTER_CONTEXT_DATA_7_WQ_BUF_BASE_LOWER,
				     shift);
		xe_fixup_u64_in_cmds(xe, cmds, size, head +
				     XE_GUC_REGISTER_CONTEXT_DATA_10_HW_LRC_ADDR, shift);
		break;
	case XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC:
		if (len < XE_GUC_REGISTER_CONTEXT_MULTI_LRC_MSG_MIN_LEN)
			goto err_len;
		n = xe_map_rd_ring_u32(xe, cmds, head +
				       XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_10_NUM_CTXS, size);
		if (len != XE_GUC_REGISTER_CONTEXT_MULTI_LRC_MSG_MIN_LEN + 2 * n)
			goto err_len;
		xe_fixup_u64_in_cmds(xe, cmds, size, head +
				     XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_5_WQ_DESC_ADDR_LOWER,
				     shift);
		xe_fixup_u64_in_cmds(xe, cmds, size, head +
				     XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_7_WQ_BUF_BASE_LOWER,
				     shift);
		for (i = 0; i < n; i++)
			xe_fixup_u64_in_cmds(xe, cmds, size, head +
					     XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_11_HW_LRC_ADDR
					     + 2 * i, shift);
		break;
	default:
		break;
	}
	return true;

err_len:
	xe_gt_err(gt, "Skipped G2G %#x message fixups, unexpected length (%u)\n", action, len);
	return false;
}

/*
 * Apply fixups to the next outgoing CT message within given CTB
 * @ct: the &xe_guc_ct struct instance representing the target GuC
 * @h2g: the &guc_ctb struct instance of the target buffer
 * @shift: shift to be added to all GGTT addresses within the CTB
 * @mhead: pointer to an integer storing message start position; the
 *	position is changed to next message before this function returns
 * @avail: size of the area available for parsing, that is length
 *	of all remaining messages stored within the CTB
 * Return: size of the area available for parsing after one message
 *	has been parsed, that is length remaining from the updated mhead
 */
static int ct_fixup_ggtt_in_buffer(struct xe_guc_ct *ct, struct guc_ctb *h2g,
				   s64 shift, u32 *mhead, s32 avail)
{
	struct xe_gt *gt = ct_to_gt(ct);
	struct xe_device *xe = ct_to_xe(ct);
	u32 msg[GUC_HXG_MSG_MIN_LEN];
	u32 size = h2g->info.size;
	u32 head = *mhead;
	u32 len;

	xe_gt_assert(gt, avail >= (s32)GUC_CTB_MSG_MIN_LEN);

	/* Read header */
	msg[0] = xe_map_rd_ring_u32(xe, &h2g->cmds, head, size);
	len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN;

	if (unlikely(len > (u32)avail)) {
		xe_gt_err(gt, "H2G channel broken on read, avail=%d, len=%d, fixups skipped\n",
			  avail, len);
		return 0;
	}

	head = (head + GUC_CTB_MSG_MIN_LEN) % size;
	if (!ct_fixup_ggtt_in_message(ct, &h2g->cmds, head, msg_len_to_hxg_len(len), size, shift))
		return 0;
	*mhead = (head + msg_len_to_hxg_len(len)) % size;

	return avail - len;
}
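
/*
 * Worked example (illustrative only, hypothetical values): a GGTT reference
 * in a pending H2G is stored as a lo/hi dword pair, so for a WQ descriptor
 * address of 0x1_0000_1000 and a post-migration ggtt_shift of -0x1000,
 * xe_fixup_u64_in_cmds() reads both dwords, rebuilds the u64 with make_u64(),
 * adds the shift and writes back 0x1_0000_0000 split into the same two
 * dwords.
 */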
static struct xe_guc_ct_snapshot *guc_ct_snapshot_alloc(struct xe_guc_ct *ct, bool atomic,
							bool want_ctb)
{
	struct xe_guc_ct_snapshot *snapshot;

	snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
	if (!snapshot)
		return NULL;

	if (ct->bo && want_ctb) {
		snapshot->ctb_size = xe_bo_size(ct->bo);
		snapshot->ctb = kmalloc(snapshot->ctb_size, atomic ? GFP_ATOMIC : GFP_KERNEL);
	}

	return snapshot;
}

static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb,
				     struct guc_ctb_snapshot *snapshot)
{
	xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0,
			   sizeof(struct guc_ct_buffer_desc));
	memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info));
}

static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot,
				   struct drm_printer *p)
{
	drm_printf(p, "\tsize: %d\n", snapshot->info.size);
	drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space);
	drm_printf(p, "\thead: %d\n", snapshot->info.head);
	drm_printf(p, "\ttail: %d\n", snapshot->info.tail);
	drm_printf(p, "\tspace: %d\n", snapshot->info.space);
	drm_printf(p, "\tbroken: %d\n", snapshot->info.broken);
	drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head);
	drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail);
	drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status);
}

static struct xe_guc_ct_snapshot *guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic,
							  bool want_ctb)
{
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_guc_ct_snapshot *snapshot;

	snapshot = guc_ct_snapshot_alloc(ct, atomic, want_ctb);
	if (!snapshot) {
		xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n");
		return NULL;
	}

	if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) {
		snapshot->ct_enabled = true;
		snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding);
		guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, &snapshot->h2g);
		guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, &snapshot->g2h);
	}

	if (ct->bo && snapshot->ctb)
		xe_map_memcpy_from(xe, snapshot->ctb, &ct->bo->vmap, 0, snapshot->ctb_size);

	return snapshot;
}
/**
 * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state.
 * @ct: GuC CT object.
 *
 * The snapshot can be printed out at a later stage, e.g. during dev_coredump
 * analysis. It is safe to call this function from atomic context.
 *
 * Returns: a GuC CT snapshot object that must be freed by the caller using
 * xe_guc_ct_snapshot_free().
 */
struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct)
{
	return guc_ct_snapshot_capture(ct, true, true);
}

/**
 * xe_guc_ct_snapshot_print - Print out a given GuC CT snapshot.
 * @snapshot: GuC CT snapshot object.
 * @p: drm_printer where it will be printed out.
 *
 * This function prints out a given GuC CT snapshot object.
 */
void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
			      struct drm_printer *p)
{
	if (!snapshot)
		return;

	if (snapshot->ct_enabled) {
		drm_puts(p, "H2G CTB (all sizes in DW):\n");
		guc_ctb_snapshot_print(&snapshot->h2g, p);

		drm_puts(p, "G2H CTB (all sizes in DW):\n");
		guc_ctb_snapshot_print(&snapshot->g2h, p);
		drm_printf(p, "\tg2h outstanding: %d\n",
			   snapshot->g2h_outstanding);

		if (snapshot->ctb) {
			drm_printf(p, "[CTB].length: 0x%zx\n", snapshot->ctb_size);
			xe_print_blob_ascii85(p, "[CTB].data", '\n',
					      snapshot->ctb, 0, snapshot->ctb_size);
		}
	} else {
		drm_puts(p, "CT disabled\n");
	}
}

/**
 * xe_guc_ct_snapshot_free - Free all allocated objects for a given snapshot.
 * @snapshot: GuC CT snapshot object.
 *
 * This function frees all the memory that was allocated at capture time.
 */
void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kfree(snapshot->ctb);
	kfree(snapshot);
}

/**
 * xe_guc_ct_print - GuC CT Print.
 * @ct: GuC CT.
 * @p: drm_printer where it will be printed out.
 * @want_ctb: Should the full CTB content be dumped (vs just the headers).
 *
 * This function will quickly capture a snapshot of the CT state
 * and immediately print it out.
 */
void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb)
{
	struct xe_guc_ct_snapshot *snapshot;

	snapshot = guc_ct_snapshot_capture(ct, false, want_ctb);
	xe_guc_ct_snapshot_print(snapshot, p);
	xe_guc_ct_snapshot_free(snapshot);
}
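
/*
 * Illustrative sketch (not part of the driver): typical use of the snapshot
 * API above is capture-now/print-later, e.g. from a coredump or debugfs
 * callback. The drm_info_printer() setup below is an assumption for
 * illustration; any drm_printer works:
 *
 *	struct xe_guc_ct_snapshot *snap = xe_guc_ct_snapshot_capture(ct);
 *	...
 *	struct drm_printer p = drm_info_printer(xe->drm.dev);
 *
 *	xe_guc_ct_snapshot_print(snap, &p);
 *	xe_guc_ct_snapshot_free(snap);
 *
 * A NULL snapshot (allocation failure) is handled by the print and free
 * helpers. When capture and print happen back to back, xe_guc_ct_print()
 * above wraps the same three calls.
 */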
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
/*
 * This helper lets the driver check whether a fault injection test is
 * currently active, so that it can suppress unnecessary debug output.
 * The function normally returns zero, but the fault injection framework can
 * override that with an error code. Since faults are injected through this
 * function, the compiler must not be allowed to turn it into an inline
 * function; the 'noinline' attribute guarantees that.
 */
noinline int xe_is_injection_active(void) { return 0; }
ALLOW_ERROR_INJECTION(xe_is_injection_active, ERRNO);
#else
int xe_is_injection_active(void) { return 0; }
#endif
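
/*
 * Illustrative sketch (not part of the driver): assuming CONFIG_FAIL_FUNCTION
 * and the fault-injection debugfs are enabled, an error can typically be
 * injected into xe_is_injection_active() from user space along these lines
 * (see Documentation/fault-injection/fault-injection.rst for the
 * authoritative interface):
 *
 *	echo xe_is_injection_active > /sys/kernel/debug/fail_function/inject
 *	printf %#x -12 > /sys/kernel/debug/fail_function/xe_is_injection_active/retval
 *	echo 100 > /sys/kernel/debug/fail_function/probability
 *
 * Any non-zero return then makes ct_dead_capture() below skip the huge dump.
 */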
static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code)
{
	struct xe_guc_log_snapshot *snapshot_log;
	struct xe_guc_ct_snapshot *snapshot_ct;
	struct xe_guc *guc = ct_to_guc(ct);
	unsigned long flags;
	bool have_capture;

	if (ctb)
		ctb->info.broken = true;

	/*
	 * A huge dump gets generated when errors are injected into the GuC
	 * CT/MMIO functions, so suppress the dump while fault injection is
	 * active.
	 */
	if (xe_is_injection_active())
		return;

	/* Ignore further errors after the first dump until a reset */
	if (ct->dead.reported)
		return;

	spin_lock_irqsave(&ct->dead.lock, flags);

	/* And only capture one dump at a time */
	have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE);
	ct->dead.reason |= (1 << reason_code) |
			   (1 << CT_DEAD_STATE_CAPTURE);

	spin_unlock_irqrestore(&ct->dead.lock, flags);

	if (have_capture)
		return;

	snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true);
	snapshot_ct = xe_guc_ct_snapshot_capture(ct);

	spin_lock_irqsave(&ct->dead.lock, flags);

	if (ct->dead.snapshot_log || ct->dead.snapshot_ct) {
		xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n");
		xe_guc_log_snapshot_free(snapshot_log);
		xe_guc_ct_snapshot_free(snapshot_ct);
	} else {
		ct->dead.snapshot_log = snapshot_log;
		ct->dead.snapshot_ct = snapshot_ct;
	}

	spin_unlock_irqrestore(&ct->dead.lock, flags);

	queue_work(system_unbound_wq, &ct->dead.worker);
}

static void ct_dead_print(struct xe_dead_ct *dead)
{
	struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead);
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	static int g_count;
	struct drm_printer ip = xe_gt_info_printer(gt);
	struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count);

	if (!dead->reason) {
		xe_gt_err(gt, "CTB is dead for no reason!?\n");
		return;
	}

	/* Can't generate a genuine core dump at this point, so just do the good bits */
	drm_puts(&lp, "**** Xe Device Coredump ****\n");
	drm_printf(&lp, "Reason: CTB is dead - 0x%X\n", dead->reason);
	xe_device_snapshot_print(xe, &lp);

	drm_printf(&lp, "**** GT #%d ****\n", gt->info.id);
	drm_printf(&lp, "\tTile: %d\n", gt->tile->id);

	drm_puts(&lp, "**** GuC Log ****\n");
	xe_guc_log_snapshot_print(dead->snapshot_log, &lp);

	drm_puts(&lp, "**** GuC CT ****\n");
	xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp);

	drm_puts(&lp, "Done.\n");
}

static void ct_dead_worker_func(struct work_struct *w)
{
	struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);

	if (!ct->dead.reported) {
		ct->dead.reported = true;
		ct_dead_print(&ct->dead);
	}

	spin_lock_irq(&ct->dead.lock);

	xe_guc_log_snapshot_free(ct->dead.snapshot_log);
	ct->dead.snapshot_log = NULL;
	xe_guc_ct_snapshot_free(ct->dead.snapshot_ct);
	ct->dead.snapshot_ct = NULL;

	if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) {
		/* A reset has occurred so re-arm the error reporting */
		ct->dead.reason = 0;
		ct->dead.reported = false;
	}

	spin_unlock_irq(&ct->dead.lock);
}
#endif