1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2022 Intel Corporation 4 */ 5 6 #include "xe_guc_ct.h" 7 8 #include <linux/bitfield.h> 9 #include <linux/circ_buf.h> 10 #include <linux/delay.h> 11 #include <linux/fault-inject.h> 12 13 #include <kunit/static_stub.h> 14 15 #include <drm/drm_managed.h> 16 17 #include "abi/guc_actions_abi.h" 18 #include "abi/guc_actions_sriov_abi.h" 19 #include "abi/guc_klvs_abi.h" 20 #include "xe_bo.h" 21 #include "xe_devcoredump.h" 22 #include "xe_device.h" 23 #include "xe_gt.h" 24 #include "xe_gt_pagefault.h" 25 #include "xe_gt_printk.h" 26 #include "xe_gt_sriov_pf_control.h" 27 #include "xe_gt_sriov_pf_monitor.h" 28 #include "xe_guc.h" 29 #include "xe_guc_log.h" 30 #include "xe_guc_relay.h" 31 #include "xe_guc_submit.h" 32 #include "xe_guc_tlb_inval.h" 33 #include "xe_map.h" 34 #include "xe_pm.h" 35 #include "xe_sriov_vf.h" 36 #include "xe_trace_guc.h" 37 38 static void receive_g2h(struct xe_guc_ct *ct); 39 static void g2h_worker_func(struct work_struct *w); 40 static void safe_mode_worker_func(struct work_struct *w); 41 static void ct_exit_safe_mode(struct xe_guc_ct *ct); 42 static void guc_ct_change_state(struct xe_guc_ct *ct, 43 enum xe_guc_ct_state state); 44 45 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 46 enum { 47 /* Internal states, not error conditions */ 48 CT_DEAD_STATE_REARM, /* 0x0001 */ 49 CT_DEAD_STATE_CAPTURE, /* 0x0002 */ 50 51 /* Error conditions */ 52 CT_DEAD_SETUP, /* 0x0004 */ 53 CT_DEAD_H2G_WRITE, /* 0x0008 */ 54 CT_DEAD_H2G_HAS_ROOM, /* 0x0010 */ 55 CT_DEAD_G2H_READ, /* 0x0020 */ 56 CT_DEAD_G2H_RECV, /* 0x0040 */ 57 CT_DEAD_G2H_RELEASE, /* 0x0080 */ 58 CT_DEAD_DEADLOCK, /* 0x0100 */ 59 CT_DEAD_PROCESS_FAILED, /* 0x0200 */ 60 CT_DEAD_FAST_G2H, /* 0x0400 */ 61 CT_DEAD_PARSE_G2H_RESPONSE, /* 0x0800 */ 62 CT_DEAD_PARSE_G2H_UNKNOWN, /* 0x1000 */ 63 CT_DEAD_PARSE_G2H_ORIGIN, /* 0x2000 */ 64 CT_DEAD_PARSE_G2H_TYPE, /* 0x4000 */ 65 CT_DEAD_CRASH, /* 0x8000 */ 66 }; 67 68 static void ct_dead_worker_func(struct work_struct *w); 69 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code); 70 71 #define CT_DEAD(ct, ctb, reason_code) ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code) 72 #else 73 #define CT_DEAD(ct, ctb, reason) \ 74 do { \ 75 struct guc_ctb *_ctb = (ctb); \ 76 if (_ctb) \ 77 _ctb->info.broken = true; \ 78 } while (0) 79 #endif 80 81 /* Used when a CT send wants to block and / or receive data */ 82 struct g2h_fence { 83 u32 *response_buffer; 84 u32 seqno; 85 u32 response_data; 86 u16 response_len; 87 u16 error; 88 u16 hint; 89 u16 reason; 90 bool cancel; 91 bool retry; 92 bool fail; 93 bool done; 94 }; 95 96 static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer) 97 { 98 memset(g2h_fence, 0, sizeof(*g2h_fence)); 99 g2h_fence->response_buffer = response_buffer; 100 g2h_fence->seqno = ~0x0; 101 } 102 103 static void g2h_fence_cancel(struct g2h_fence *g2h_fence) 104 { 105 g2h_fence->cancel = true; 106 g2h_fence->fail = true; 107 g2h_fence->done = true; 108 } 109 110 static bool g2h_fence_needs_alloc(struct g2h_fence *g2h_fence) 111 { 112 return g2h_fence->seqno == ~0x0; 113 } 114 115 static struct xe_guc * 116 ct_to_guc(struct xe_guc_ct *ct) 117 { 118 return container_of(ct, struct xe_guc, ct); 119 } 120 121 static struct xe_gt * 122 ct_to_gt(struct xe_guc_ct *ct) 123 { 124 return container_of(ct, struct xe_gt, uc.guc.ct); 125 } 126 127 static struct xe_device * 128 ct_to_xe(struct xe_guc_ct *ct) 129 { 130 return gt_to_xe(ct_to_gt(ct)); 131 } 132 133 /** 134 * DOC: GuC CTB Blob 135 
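 *
 * The CT (Command Transport) channel is the shared-memory mechanism used to
 * exchange H2G (host to GuC) and G2H (GuC to host) messages with the GuC
 * firmware.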
 *
 * We allocate a single blob to hold both CTB descriptors and buffers:
 *
 * +--------+-----------------------------------------------+------+
 * | offset |                   contents                    | size |
 * +========+===============================================+======+
 * | 0x0000 | H2G CTB Descriptor (send)                     |      |
 * +--------+-----------------------------------------------+  4K  |
 * | 0x0800 | G2H CTB Descriptor (g2h)                      |      |
 * +--------+-----------------------------------------------+------+
 * | 0x1000 | H2G CT Buffer (send)                          | n*4K |
 * |        |                                               |      |
 * +--------+-----------------------------------------------+------+
 * | 0x1000 | G2H CT Buffer (g2h)                           | m*4K |
 * | + n*4K |                                               |      |
 * +--------+-----------------------------------------------+------+
 *
 * The size of each ``CT Buffer`` must be a multiple of 4K.
 * We don't expect too many messages in flight at any time, unless we are
 * using GuC submission. In that case each request requires a minimum of
 * 2 dwords, which gives us a maximum of 256 queued requests. Hopefully this
 * is enough space to avoid backpressure on the driver. We increase the size
 * of the receive buffer (relative to the send) to ensure a G2H response
 * CTB has a landing spot.
 *
 * In addition to submissions, the G2H buffer needs to be able to hold
 * enough space for recoverable page fault notifications. The number of
 * page faults is interrupt driven and can be as much as the number of
 * compute resources available. However, most of the actual work for these
 * is in a separate page fault worker thread. Therefore we only need to
 * make sure the queue has enough space to handle all of the submissions
 * and responses and an extra buffer for incoming page faults.
 */

#define CTB_DESC_SIZE		ALIGN(sizeof(struct guc_ct_buffer_desc), SZ_2K)
#define CTB_H2G_BUFFER_OFFSET	(CTB_DESC_SIZE * 2)
#define CTB_H2G_BUFFER_SIZE	(SZ_4K)
#define CTB_G2H_BUFFER_SIZE	(SZ_128K)
#define G2H_ROOM_BUFFER_SIZE	(CTB_G2H_BUFFER_SIZE / 2)

/**
 * xe_guc_ct_queue_proc_time_jiffies - Return maximum time to process a full
 * CT command queue
 * @ct: the &xe_guc_ct. Unused at this moment but will be used in the future.
 *
 * The observation is that a 4KiB buffer full of commands takes a little over a
 * second to process. Use that to calculate the maximum time to process a full
 * CT command queue.
 *
 * Return: Maximum time to process a full CT queue in jiffies.
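 *
 * For example, with the current CTB_H2G_BUFFER_SIZE of SZ_4K this evaluates
 * to (SZ_4K / SZ_4K) * HZ = HZ, i.e. roughly one second for a full queue.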
185 */ 186 long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct) 187 { 188 BUILD_BUG_ON(!IS_ALIGNED(CTB_H2G_BUFFER_SIZE, SZ_4)); 189 return (CTB_H2G_BUFFER_SIZE / SZ_4K) * HZ; 190 } 191 192 static size_t guc_ct_size(void) 193 { 194 return CTB_H2G_BUFFER_OFFSET + CTB_H2G_BUFFER_SIZE + 195 CTB_G2H_BUFFER_SIZE; 196 } 197 198 static void guc_ct_fini(struct drm_device *drm, void *arg) 199 { 200 struct xe_guc_ct *ct = arg; 201 202 ct_exit_safe_mode(ct); 203 destroy_workqueue(ct->g2h_wq); 204 xa_destroy(&ct->fence_lookup); 205 } 206 207 static void primelockdep(struct xe_guc_ct *ct) 208 { 209 if (!IS_ENABLED(CONFIG_LOCKDEP)) 210 return; 211 212 fs_reclaim_acquire(GFP_KERNEL); 213 might_lock(&ct->lock); 214 fs_reclaim_release(GFP_KERNEL); 215 } 216 217 int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct) 218 { 219 struct xe_device *xe = ct_to_xe(ct); 220 struct xe_gt *gt = ct_to_gt(ct); 221 int err; 222 223 xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE)); 224 225 ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM); 226 if (!ct->g2h_wq) 227 return -ENOMEM; 228 229 spin_lock_init(&ct->fast_lock); 230 xa_init(&ct->fence_lookup); 231 INIT_WORK(&ct->g2h_worker, g2h_worker_func); 232 INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func); 233 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 234 spin_lock_init(&ct->dead.lock); 235 INIT_WORK(&ct->dead.worker, ct_dead_worker_func); 236 #endif 237 init_waitqueue_head(&ct->wq); 238 init_waitqueue_head(&ct->g2h_fence_wq); 239 240 err = drmm_mutex_init(&xe->drm, &ct->lock); 241 if (err) 242 return err; 243 244 primelockdep(ct); 245 246 err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct); 247 if (err) 248 return err; 249 250 xe_gt_assert(gt, ct->state == XE_GUC_CT_STATE_NOT_INITIALIZED); 251 ct->state = XE_GUC_CT_STATE_DISABLED; 252 return 0; 253 } 254 ALLOW_ERROR_INJECTION(xe_guc_ct_init_noalloc, ERRNO); /* See xe_pci_probe() */ 255 256 static void guc_action_disable_ct(void *arg) 257 { 258 struct xe_guc_ct *ct = arg; 259 260 guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED); 261 } 262 263 int xe_guc_ct_init(struct xe_guc_ct *ct) 264 { 265 struct xe_device *xe = ct_to_xe(ct); 266 struct xe_gt *gt = ct_to_gt(ct); 267 struct xe_tile *tile = gt_to_tile(gt); 268 struct xe_bo *bo; 269 270 bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(), 271 XE_BO_FLAG_SYSTEM | 272 XE_BO_FLAG_GGTT | 273 XE_BO_FLAG_GGTT_INVALIDATE | 274 XE_BO_FLAG_PINNED_NORESTORE); 275 if (IS_ERR(bo)) 276 return PTR_ERR(bo); 277 278 ct->bo = bo; 279 280 return devm_add_action_or_reset(xe->drm.dev, guc_action_disable_ct, ct); 281 } 282 ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */ 283 284 /** 285 * xe_guc_ct_init_post_hwconfig - Reinitialize the GuC CTB in VRAM 286 * @ct: the &xe_guc_ct 287 * 288 * Allocate a new BO in VRAM and free the previous BO that was allocated 289 * in system memory (SMEM). Applicable only for DGFX products. 290 * 291 * Return: 0 on success, or a negative errno on failure. 
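 *
 * On platforms without dedicated VRAM (!IS_DGFX) the system-memory allocation
 * made by xe_guc_ct_init() is kept as-is and only the devm disable action is
 * re-registered.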
292 */ 293 int xe_guc_ct_init_post_hwconfig(struct xe_guc_ct *ct) 294 { 295 struct xe_device *xe = ct_to_xe(ct); 296 struct xe_gt *gt = ct_to_gt(ct); 297 struct xe_tile *tile = gt_to_tile(gt); 298 int ret; 299 300 xe_assert(xe, !xe_guc_ct_enabled(ct)); 301 302 if (IS_DGFX(xe)) { 303 ret = xe_managed_bo_reinit_in_vram(xe, tile, &ct->bo); 304 if (ret) 305 return ret; 306 } 307 308 devm_remove_action(xe->drm.dev, guc_action_disable_ct, ct); 309 return devm_add_action_or_reset(xe->drm.dev, guc_action_disable_ct, ct); 310 } 311 312 #define desc_read(xe_, guc_ctb__, field_) \ 313 xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \ 314 struct guc_ct_buffer_desc, field_) 315 316 #define desc_write(xe_, guc_ctb__, field_, val_) \ 317 xe_map_wr_field(xe_, &guc_ctb__->desc, 0, \ 318 struct guc_ct_buffer_desc, field_, val_) 319 320 static void guc_ct_ctb_h2g_init(struct xe_device *xe, struct guc_ctb *h2g, 321 struct iosys_map *map) 322 { 323 h2g->info.size = CTB_H2G_BUFFER_SIZE / sizeof(u32); 324 h2g->info.resv_space = 0; 325 h2g->info.tail = 0; 326 h2g->info.head = 0; 327 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head, 328 h2g->info.size) - 329 h2g->info.resv_space; 330 h2g->info.broken = false; 331 332 h2g->desc = *map; 333 xe_map_memset(xe, &h2g->desc, 0, 0, sizeof(struct guc_ct_buffer_desc)); 334 335 h2g->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_H2G_BUFFER_OFFSET); 336 } 337 338 static void guc_ct_ctb_g2h_init(struct xe_device *xe, struct guc_ctb *g2h, 339 struct iosys_map *map) 340 { 341 g2h->info.size = CTB_G2H_BUFFER_SIZE / sizeof(u32); 342 g2h->info.resv_space = G2H_ROOM_BUFFER_SIZE / sizeof(u32); 343 g2h->info.head = 0; 344 g2h->info.tail = 0; 345 g2h->info.space = CIRC_SPACE(g2h->info.tail, g2h->info.head, 346 g2h->info.size) - 347 g2h->info.resv_space; 348 g2h->info.broken = false; 349 350 g2h->desc = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE); 351 xe_map_memset(xe, &g2h->desc, 0, 0, sizeof(struct guc_ct_buffer_desc)); 352 353 g2h->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_H2G_BUFFER_OFFSET + 354 CTB_H2G_BUFFER_SIZE); 355 } 356 357 static int guc_ct_ctb_h2g_register(struct xe_guc_ct *ct) 358 { 359 struct xe_guc *guc = ct_to_guc(ct); 360 u32 desc_addr, ctb_addr, size; 361 int err; 362 363 desc_addr = xe_bo_ggtt_addr(ct->bo); 364 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_H2G_BUFFER_OFFSET; 365 size = ct->ctbs.h2g.info.size * sizeof(u32); 366 367 err = xe_guc_self_cfg64(guc, 368 GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY, 369 desc_addr); 370 if (err) 371 return err; 372 373 err = xe_guc_self_cfg64(guc, 374 GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY, 375 ctb_addr); 376 if (err) 377 return err; 378 379 return xe_guc_self_cfg32(guc, 380 GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY, 381 size); 382 } 383 384 static int guc_ct_ctb_g2h_register(struct xe_guc_ct *ct) 385 { 386 struct xe_guc *guc = ct_to_guc(ct); 387 u32 desc_addr, ctb_addr, size; 388 int err; 389 390 desc_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE; 391 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_H2G_BUFFER_OFFSET + 392 CTB_H2G_BUFFER_SIZE; 393 size = ct->ctbs.g2h.info.size * sizeof(u32); 394 395 err = xe_guc_self_cfg64(guc, 396 GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY, 397 desc_addr); 398 if (err) 399 return err; 400 401 err = xe_guc_self_cfg64(guc, 402 GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY, 403 ctb_addr); 404 if (err) 405 return err; 406 407 return xe_guc_self_cfg32(guc, 408 GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY, 409 size); 410 } 411 412 static int guc_ct_control_toggle(struct xe_guc_ct *ct, bool enable) 413 { 414 u32 
request[HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN] = { 415 FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) | 416 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) | 417 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, 418 GUC_ACTION_HOST2GUC_CONTROL_CTB), 419 FIELD_PREP(HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL, 420 enable ? GUC_CTB_CONTROL_ENABLE : 421 GUC_CTB_CONTROL_DISABLE), 422 }; 423 int ret = xe_guc_mmio_send(ct_to_guc(ct), request, ARRAY_SIZE(request)); 424 425 return ret > 0 ? -EPROTO : ret; 426 } 427 428 static void guc_ct_change_state(struct xe_guc_ct *ct, 429 enum xe_guc_ct_state state) 430 { 431 struct xe_gt *gt = ct_to_gt(ct); 432 struct g2h_fence *g2h_fence; 433 unsigned long idx; 434 435 mutex_lock(&ct->lock); /* Serialise dequeue_one_g2h() */ 436 spin_lock_irq(&ct->fast_lock); /* Serialise CT fast-path */ 437 438 xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 || 439 state == XE_GUC_CT_STATE_STOPPED); 440 441 if (ct->g2h_outstanding) 442 xe_pm_runtime_put(ct_to_xe(ct)); 443 ct->g2h_outstanding = 0; 444 ct->state = state; 445 446 xe_gt_dbg(gt, "GuC CT communication channel %s\n", 447 state == XE_GUC_CT_STATE_STOPPED ? "stopped" : 448 str_enabled_disabled(state == XE_GUC_CT_STATE_ENABLED)); 449 450 spin_unlock_irq(&ct->fast_lock); 451 452 /* cancel all in-flight send-recv requests */ 453 xa_for_each(&ct->fence_lookup, idx, g2h_fence) 454 g2h_fence_cancel(g2h_fence); 455 456 /* make sure guc_ct_send_recv() will see g2h_fence changes */ 457 smp_mb(); 458 wake_up_all(&ct->g2h_fence_wq); 459 460 /* 461 * Lockdep doesn't like this under the fast lock and he destroy only 462 * needs to be serialized with the send path which ct lock provides. 463 */ 464 xa_destroy(&ct->fence_lookup); 465 466 mutex_unlock(&ct->lock); 467 } 468 469 static bool ct_needs_safe_mode(struct xe_guc_ct *ct) 470 { 471 return !pci_dev_msi_enabled(to_pci_dev(ct_to_xe(ct)->drm.dev)); 472 } 473 474 static bool ct_restart_safe_mode_worker(struct xe_guc_ct *ct) 475 { 476 if (!ct_needs_safe_mode(ct)) 477 return false; 478 479 queue_delayed_work(ct->g2h_wq, &ct->safe_mode_worker, HZ / 10); 480 return true; 481 } 482 483 static void safe_mode_worker_func(struct work_struct *w) 484 { 485 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, safe_mode_worker.work); 486 487 receive_g2h(ct); 488 489 if (!ct_restart_safe_mode_worker(ct)) 490 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode canceled\n"); 491 } 492 493 static void ct_enter_safe_mode(struct xe_guc_ct *ct) 494 { 495 if (ct_restart_safe_mode_worker(ct)) 496 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode enabled\n"); 497 } 498 499 static void ct_exit_safe_mode(struct xe_guc_ct *ct) 500 { 501 if (cancel_delayed_work_sync(&ct->safe_mode_worker)) 502 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode disabled\n"); 503 } 504 505 static int __xe_guc_ct_start(struct xe_guc_ct *ct, bool needs_register) 506 { 507 struct xe_device *xe = ct_to_xe(ct); 508 struct xe_gt *gt = ct_to_gt(ct); 509 int err; 510 511 xe_gt_assert(gt, !xe_guc_ct_enabled(ct)); 512 513 if (needs_register) { 514 xe_map_memset(xe, &ct->bo->vmap, 0, 0, xe_bo_size(ct->bo)); 515 guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap); 516 guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap); 517 518 err = guc_ct_ctb_h2g_register(ct); 519 if (err) 520 goto err_out; 521 522 err = guc_ct_ctb_g2h_register(ct); 523 if (err) 524 goto err_out; 525 526 err = guc_ct_control_toggle(ct, true); 527 if (err) 528 goto err_out; 529 } else { 530 ct->ctbs.h2g.info.broken = false; 531 ct->ctbs.g2h.info.broken = false; 532 /* Skip 
everything in H2G buffer */ 533 xe_map_memset(xe, &ct->bo->vmap, CTB_H2G_BUFFER_OFFSET, 0, 534 CTB_H2G_BUFFER_SIZE); 535 } 536 537 guc_ct_change_state(ct, XE_GUC_CT_STATE_ENABLED); 538 539 smp_mb(); 540 wake_up_all(&ct->wq); 541 542 if (ct_needs_safe_mode(ct)) 543 ct_enter_safe_mode(ct); 544 545 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 546 /* 547 * The CT has now been reset so the dumper can be re-armed 548 * after any existing dead state has been dumped. 549 */ 550 spin_lock_irq(&ct->dead.lock); 551 if (ct->dead.reason) { 552 ct->dead.reason |= (1 << CT_DEAD_STATE_REARM); 553 queue_work(system_unbound_wq, &ct->dead.worker); 554 } 555 spin_unlock_irq(&ct->dead.lock); 556 #endif 557 558 return 0; 559 560 err_out: 561 xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err)); 562 CT_DEAD(ct, NULL, SETUP); 563 564 return err; 565 } 566 567 /** 568 * xe_guc_ct_restart() - Restart GuC CT 569 * @ct: the &xe_guc_ct 570 * 571 * Restart GuC CT to an empty state without issuing a CT register MMIO command. 572 * 573 * Return: 0 on success, or a negative errno on failure. 574 */ 575 int xe_guc_ct_restart(struct xe_guc_ct *ct) 576 { 577 return __xe_guc_ct_start(ct, false); 578 } 579 580 /** 581 * xe_guc_ct_enable() - Enable GuC CT 582 * @ct: the &xe_guc_ct 583 * 584 * Enable GuC CT to an empty state and issue a CT register MMIO command. 585 * 586 * Return: 0 on success, or a negative errno on failure. 587 */ 588 int xe_guc_ct_enable(struct xe_guc_ct *ct) 589 { 590 return __xe_guc_ct_start(ct, true); 591 } 592 593 static void stop_g2h_handler(struct xe_guc_ct *ct) 594 { 595 cancel_work_sync(&ct->g2h_worker); 596 } 597 598 /** 599 * xe_guc_ct_disable - Set GuC to disabled state 600 * @ct: the &xe_guc_ct 601 * 602 * Set GuC CT to disabled state and stop g2h handler. No outstanding g2h expected 603 * in this transition. 
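 *
 * This differs from xe_guc_ct_stop(), which tolerates (and releases) any
 * outstanding G2H credits; see the assert in guc_ct_change_state().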
604 */ 605 void xe_guc_ct_disable(struct xe_guc_ct *ct) 606 { 607 guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED); 608 ct_exit_safe_mode(ct); 609 stop_g2h_handler(ct); 610 } 611 612 /** 613 * xe_guc_ct_flush_and_stop - Flush and stop all processing of G2H / H2G 614 * @ct: the &xe_guc_ct 615 */ 616 void xe_guc_ct_flush_and_stop(struct xe_guc_ct *ct) 617 { 618 receive_g2h(ct); 619 xe_guc_ct_stop(ct); 620 } 621 622 /** 623 * xe_guc_ct_stop - Set GuC to stopped state 624 * @ct: the &xe_guc_ct 625 * 626 * Set GuC CT to stopped state, stop g2h handler, and clear any outstanding g2h 627 */ 628 void xe_guc_ct_stop(struct xe_guc_ct *ct) 629 { 630 if (!xe_guc_ct_initialized(ct)) 631 return; 632 633 guc_ct_change_state(ct, XE_GUC_CT_STATE_STOPPED); 634 stop_g2h_handler(ct); 635 } 636 637 static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len) 638 { 639 struct guc_ctb *h2g = &ct->ctbs.h2g; 640 641 lockdep_assert_held(&ct->lock); 642 643 if (cmd_len > h2g->info.space) { 644 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head); 645 646 if (h2g->info.head > h2g->info.size) { 647 struct xe_device *xe = ct_to_xe(ct); 648 u32 desc_status = desc_read(xe, h2g, status); 649 650 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 651 652 xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u)\n", 653 h2g->info.head, h2g->info.size); 654 CT_DEAD(ct, h2g, H2G_HAS_ROOM); 655 return false; 656 } 657 658 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head, 659 h2g->info.size) - 660 h2g->info.resv_space; 661 if (cmd_len > h2g->info.space) 662 return false; 663 } 664 665 return true; 666 } 667 668 static bool g2h_has_room(struct xe_guc_ct *ct, u32 g2h_len) 669 { 670 if (!g2h_len) 671 return true; 672 673 lockdep_assert_held(&ct->fast_lock); 674 675 return ct->ctbs.g2h.info.space > g2h_len; 676 } 677 678 static int has_room(struct xe_guc_ct *ct, u32 cmd_len, u32 g2h_len) 679 { 680 lockdep_assert_held(&ct->lock); 681 682 if (!g2h_has_room(ct, g2h_len) || !h2g_has_room(ct, cmd_len)) 683 return -EBUSY; 684 685 return 0; 686 } 687 688 static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len) 689 { 690 lockdep_assert_held(&ct->lock); 691 ct->ctbs.h2g.info.space -= cmd_len; 692 } 693 694 static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h) 695 { 696 xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space); 697 xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) || 698 (g2h_len && num_g2h)); 699 700 if (g2h_len) { 701 lockdep_assert_held(&ct->fast_lock); 702 703 if (!ct->g2h_outstanding) 704 xe_pm_runtime_get_noresume(ct_to_xe(ct)); 705 706 ct->ctbs.g2h.info.space -= g2h_len; 707 ct->g2h_outstanding += num_g2h; 708 } 709 } 710 711 static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) 712 { 713 bool bad = false; 714 715 lockdep_assert_held(&ct->fast_lock); 716 717 bad = ct->ctbs.g2h.info.space + g2h_len > 718 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space; 719 bad |= !ct->g2h_outstanding; 720 721 if (bad) { 722 xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n", 723 ct->ctbs.g2h.info.space, g2h_len, 724 ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space, 725 ct->ctbs.g2h.info.space + g2h_len, 726 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space, 727 ct->g2h_outstanding); 728 CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE); 729 return; 730 } 731 732 ct->ctbs.g2h.info.space += g2h_len; 733 if (!--ct->g2h_outstanding) 734 xe_pm_runtime_put(ct_to_xe(ct)); 735 } 736 737 static void 
g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len) 738 { 739 spin_lock_irq(&ct->fast_lock); 740 __g2h_release_space(ct, g2h_len); 741 spin_unlock_irq(&ct->fast_lock); 742 } 743 744 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 745 static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) 746 { 747 unsigned int slot = fence % ARRAY_SIZE(ct->fast_req); 748 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) 749 unsigned long entries[SZ_32]; 750 unsigned int n; 751 752 n = stack_trace_save(entries, ARRAY_SIZE(entries), 1); 753 754 /* May be called under spinlock, so avoid sleeping */ 755 ct->fast_req[slot].stack = stack_depot_save(entries, n, GFP_NOWAIT); 756 #endif 757 ct->fast_req[slot].fence = fence; 758 ct->fast_req[slot].action = action; 759 } 760 #else 761 static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action) 762 { 763 } 764 #endif 765 766 /* 767 * The CT protocol accepts a 16 bits fence. This field is fully owned by the 768 * driver, the GuC will just copy it to the reply message. Since we need to 769 * be able to distinguish between replies to REQUEST and FAST_REQUEST messages, 770 * we use one bit of the seqno as an indicator for that and a rolling counter 771 * for the remaining 15 bits. 772 */ 773 #define CT_SEQNO_MASK GENMASK(14, 0) 774 #define CT_SEQNO_UNTRACKED BIT(15) 775 static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence) 776 { 777 u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK; 778 779 if (!is_g2h_fence) 780 seqno |= CT_SEQNO_UNTRACKED; 781 782 return seqno; 783 } 784 785 #define MAKE_ACTION(type, __action) \ 786 ({ \ 787 FIELD_PREP(GUC_HXG_MSG_0_TYPE, type) | \ 788 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION | \ 789 GUC_HXG_EVENT_MSG_0_DATA0, __action); \ 790 }) 791 792 static bool vf_action_can_safely_fail(struct xe_device *xe, u32 action) 793 { 794 /* 795 * When resuming a VF, we can't reliably track whether context 796 * registration has completed in the GuC state machine. It is harmless 797 * to resend the request, as it will fail silently if GUC_HXG_TYPE_EVENT 798 * is used. Additionally, if there is an H2G protocol issue on a VF, 799 * subsequent H2G messages sent as GUC_HXG_TYPE_FAST_REQUEST will likely 800 * fail. 
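	 *
	 * Note that GUC_HXG_TYPE_EVENT messages carry no reply, so a rejected
	 * re-registration is never reported back to the driver, whereas a
	 * failed FAST_REQUEST would be escalated to a CT_DEAD in
	 * parse_g2h_response().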
801 */ 802 return IS_SRIOV_VF(xe) && xe_sriov_vf_migration_supported(xe) && 803 (action == XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC || 804 action == XE_GUC_ACTION_REGISTER_CONTEXT); 805 } 806 807 #define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */ 808 809 static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len, 810 u32 ct_fence_value, bool want_response) 811 { 812 struct xe_device *xe = ct_to_xe(ct); 813 struct xe_gt *gt = ct_to_gt(ct); 814 struct guc_ctb *h2g = &ct->ctbs.h2g; 815 u32 cmd[H2G_CT_HEADERS]; 816 u32 tail = h2g->info.tail; 817 u32 full_len; 818 struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds, 819 tail * sizeof(u32)); 820 u32 desc_status; 821 822 full_len = len + GUC_CTB_HDR_LEN; 823 824 lockdep_assert_held(&ct->lock); 825 xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN); 826 827 desc_status = desc_read(xe, h2g, status); 828 if (desc_status) { 829 xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status); 830 goto corrupted; 831 } 832 833 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) { 834 u32 desc_tail = desc_read(xe, h2g, tail); 835 u32 desc_head = desc_read(xe, h2g, head); 836 837 if (tail != desc_tail) { 838 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH); 839 xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail); 840 goto corrupted; 841 } 842 843 if (tail > h2g->info.size) { 844 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 845 xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n", 846 tail, h2g->info.size); 847 goto corrupted; 848 } 849 850 if (desc_head >= h2g->info.size) { 851 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 852 xe_gt_err(gt, "CT write: invalid head offset %u >= %u)\n", 853 desc_head, h2g->info.size); 854 goto corrupted; 855 } 856 } 857 858 /* Command will wrap, zero fill (NOPs), return and check credits again */ 859 if (tail + full_len > h2g->info.size) { 860 xe_map_memset(xe, &map, 0, 0, 861 (h2g->info.size - tail) * sizeof(u32)); 862 h2g_reserve_space(ct, (h2g->info.size - tail)); 863 h2g->info.tail = 0; 864 desc_write(xe, h2g, tail, h2g->info.tail); 865 866 return -EAGAIN; 867 } 868 869 /* 870 * dw0: CT header (including fence) 871 * dw1: HXG header (including action code) 872 * dw2+: action data 873 */ 874 cmd[0] = FIELD_PREP(GUC_CTB_MSG_0_FORMAT, GUC_CTB_FORMAT_HXG) | 875 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) | 876 FIELD_PREP(GUC_CTB_MSG_0_FENCE, ct_fence_value); 877 if (want_response) { 878 cmd[1] = MAKE_ACTION(GUC_HXG_TYPE_REQUEST, action[0]); 879 } else if (vf_action_can_safely_fail(xe, action[0])) { 880 cmd[1] = MAKE_ACTION(GUC_HXG_TYPE_EVENT, action[0]); 881 } else { 882 fast_req_track(ct, ct_fence_value, 883 FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, action[0])); 884 885 cmd[1] = MAKE_ACTION(GUC_HXG_TYPE_FAST_REQUEST, action[0]); 886 } 887 888 /* H2G header in cmd[1] replaces action[0] so: */ 889 --len; 890 ++action; 891 892 /* Write H2G ensuring visible before descriptor update */ 893 xe_map_memcpy_to(xe, &map, 0, cmd, H2G_CT_HEADERS * sizeof(u32)); 894 xe_map_memcpy_to(xe, &map, H2G_CT_HEADERS * sizeof(u32), action, len * sizeof(u32)); 895 xe_device_wmb(xe); 896 897 /* Update local copies */ 898 h2g->info.tail = (tail + full_len) % h2g->info.size; 899 h2g_reserve_space(ct, full_len); 900 901 /* Update descriptor */ 902 desc_write(xe, h2g, tail, h2g->info.tail); 903 904 trace_xe_guc_ctb_h2g(xe, gt->info.id, *(action - 1), full_len, 905 desc_read(xe, h2g, head), h2g->info.tail); 906 907 return 0; 908 909 
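	/*
	 * Any descriptor corruption detected above lands here and marks the
	 * H2G CTB as broken; no further H2G traffic is attempted until the CT
	 * is reset and re-enabled.
	 */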
corrupted: 910 CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE); 911 return -EPIPE; 912 } 913 914 static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, 915 u32 len, u32 g2h_len, u32 num_g2h, 916 struct g2h_fence *g2h_fence) 917 { 918 struct xe_gt *gt = ct_to_gt(ct); 919 u16 seqno; 920 int ret; 921 922 xe_gt_assert(gt, xe_guc_ct_initialized(ct)); 923 xe_gt_assert(gt, !g2h_len || !g2h_fence); 924 xe_gt_assert(gt, !num_g2h || !g2h_fence); 925 xe_gt_assert(gt, !g2h_len || num_g2h); 926 xe_gt_assert(gt, g2h_len || !num_g2h); 927 lockdep_assert_held(&ct->lock); 928 929 if (unlikely(ct->ctbs.h2g.info.broken)) { 930 ret = -EPIPE; 931 goto out; 932 } 933 934 if (ct->state == XE_GUC_CT_STATE_DISABLED) { 935 ret = -ENODEV; 936 goto out; 937 } 938 939 if (ct->state == XE_GUC_CT_STATE_STOPPED || xe_gt_recovery_pending(gt)) { 940 ret = -ECANCELED; 941 goto out; 942 } 943 944 xe_gt_assert(gt, xe_guc_ct_enabled(ct)); 945 946 if (g2h_fence) { 947 g2h_len = GUC_CTB_HXG_MSG_MAX_LEN; 948 num_g2h = 1; 949 950 if (g2h_fence_needs_alloc(g2h_fence)) { 951 g2h_fence->seqno = next_ct_seqno(ct, true); 952 ret = xa_err(xa_store(&ct->fence_lookup, 953 g2h_fence->seqno, g2h_fence, 954 GFP_ATOMIC)); 955 if (ret) 956 goto out; 957 } 958 959 seqno = g2h_fence->seqno; 960 } else { 961 seqno = next_ct_seqno(ct, false); 962 } 963 964 if (g2h_len) 965 spin_lock_irq(&ct->fast_lock); 966 retry: 967 ret = has_room(ct, len + GUC_CTB_HDR_LEN, g2h_len); 968 if (unlikely(ret)) 969 goto out_unlock; 970 971 ret = h2g_write(ct, action, len, seqno, !!g2h_fence); 972 if (unlikely(ret)) { 973 if (ret == -EAGAIN) 974 goto retry; 975 goto out_unlock; 976 } 977 978 __g2h_reserve_space(ct, g2h_len, num_g2h); 979 xe_guc_notify(ct_to_guc(ct)); 980 out_unlock: 981 if (g2h_len) 982 spin_unlock_irq(&ct->fast_lock); 983 out: 984 return ret; 985 } 986 987 static void kick_reset(struct xe_guc_ct *ct) 988 { 989 xe_gt_reset_async(ct_to_gt(ct)); 990 } 991 992 static int dequeue_one_g2h(struct xe_guc_ct *ct); 993 994 /* 995 * wait before retry of sending h2g message 996 * Return: true if ready for retry, false if the wait timeouted 997 */ 998 static bool guc_ct_send_wait_for_retry(struct xe_guc_ct *ct, u32 len, 999 u32 g2h_len, struct g2h_fence *g2h_fence, 1000 unsigned int *sleep_period_ms) 1001 { 1002 struct xe_device *xe = ct_to_xe(ct); 1003 1004 /* 1005 * We wait to try to restore credits for about 1 second before bailing. 1006 * In the case of H2G credits we have no choice but just to wait for the 1007 * GuC to consume H2Gs in the channel so we use a wait / sleep loop. In 1008 * the case of G2H we process any G2H in the channel, hopefully freeing 1009 * credits as we consume the G2H messages. 1010 */ 1011 if (!h2g_has_room(ct, len + GUC_CTB_HDR_LEN)) { 1012 struct guc_ctb *h2g = &ct->ctbs.h2g; 1013 1014 if (*sleep_period_ms == 1024) 1015 return false; 1016 1017 trace_xe_guc_ct_h2g_flow_control(xe, h2g->info.head, h2g->info.tail, 1018 h2g->info.size, 1019 h2g->info.space, 1020 len + GUC_CTB_HDR_LEN); 1021 msleep(*sleep_period_ms); 1022 *sleep_period_ms <<= 1; 1023 } else { 1024 struct xe_device *xe = ct_to_xe(ct); 1025 struct guc_ctb *g2h = &ct->ctbs.g2h; 1026 int ret; 1027 1028 trace_xe_guc_ct_g2h_flow_control(xe, g2h->info.head, 1029 desc_read(xe, g2h, tail), 1030 g2h->info.size, 1031 g2h->info.space, 1032 g2h_fence ? 
						 GUC_CTB_HXG_MSG_MAX_LEN :
						 g2h_len);

#define g2h_avail(ct) \
	(desc_read(ct_to_xe(ct), (&ct->ctbs.g2h), tail) != ct->ctbs.g2h.info.head)
		if (!wait_event_timeout(ct->wq, !ct->g2h_outstanding ||
					g2h_avail(ct), HZ))
			return false;
#undef g2h_avail

		ret = dequeue_one_g2h(ct);
		if (ret < 0) {
			if (ret != -ECANCELED)
				xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
					  ERR_PTR(ret));
			return false;
		}
	}
	return true;
}

static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
			      u32 g2h_len, u32 num_g2h,
			      struct g2h_fence *g2h_fence)
{
	struct xe_gt *gt = ct_to_gt(ct);
	unsigned int sleep_period_ms = 1;
	int ret;

	xe_gt_assert(gt, !g2h_len || !g2h_fence);
	lockdep_assert_held(&ct->lock);
	xe_device_assert_mem_access(ct_to_xe(ct));

try_again:
	ret = __guc_ct_send_locked(ct, action, len, g2h_len, num_g2h,
				   g2h_fence);

	if (unlikely(ret == -EBUSY)) {
		if (!guc_ct_send_wait_for_retry(ct, len, g2h_len, g2h_fence,
						&sleep_period_ms))
			goto broken;
		goto try_again;
	}

	return ret;

broken:
	xe_gt_err(gt, "No forward progress on H2G, reset required\n");
	CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK);

	return -EDEADLK;
}

static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
		       u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence)
{
	int ret;

	xe_gt_assert(ct_to_gt(ct), !g2h_len || !g2h_fence);

	mutex_lock(&ct->lock);
	ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence);
	mutex_unlock(&ct->lock);

	return ret;
}

int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
		   u32 g2h_len, u32 num_g2h)
{
	int ret;

	ret = guc_ct_send(ct, action, len, g2h_len, num_g2h, NULL);
	if (ret == -EDEADLK)
		kick_reset(ct);

	return ret;
}

int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
			  u32 g2h_len, u32 num_g2h)
{
	int ret;

	ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, NULL);
	if (ret == -EDEADLK)
		kick_reset(ct);

	return ret;
}

int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, u32 len)
{
	int ret;

	lockdep_assert_held(&ct->lock);

	ret = guc_ct_send_locked(ct, action, len, 0, 0, NULL);
	if (ret == -EDEADLK)
		kick_reset(ct);

	return ret;
}

/*
 * Check if a GT reset is in progress or will occur and if the GT reset brought
 * the CT back up. Randomly picking 5 seconds as an upper limit for the GT
 * reset to complete.
 */
static bool retry_failure(struct xe_guc_ct *ct, int ret)
{
	if (!(ret == -EDEADLK || ret == -EPIPE || ret == -ENODEV))
		return false;

#define ct_alive(ct)	\
	(xe_guc_ct_enabled(ct) && !ct->ctbs.h2g.info.broken && \
	 !ct->ctbs.g2h.info.broken)
	if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5))
		return false;
#undef ct_alive

	return true;
}

#define GUC_SEND_RETRY_LIMIT	50
#define GUC_SEND_RETRY_MSLEEP	5

static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
			    u32 *response_buffer, bool no_fail)
{
	struct xe_gt *gt = ct_to_gt(ct);
	struct g2h_fence g2h_fence;
	unsigned int retries = 0;
	int ret = 0;

	/*
	 * We use a fence to implement blocking sends / receiving response data.
	 * The seqno of the fence is sent in the H2G, returned in the G2H, and
	 * an xarray is used as storage media with the seqno being the key.
	 * Fields in the fence hold success, failure, retry status and the
	 * response data. Safe to allocate on the stack as the xarray is the
	 * only reference and it cannot be present after this function exits.
	 */
retry:
	g2h_fence_init(&g2h_fence, response_buffer);
retry_same_fence:
	ret = guc_ct_send(ct, action, len, 0, 0, &g2h_fence);
	if (unlikely(ret == -ENOMEM)) {
		/* Retry allocation w/ GFP_KERNEL */
		ret = xa_err(xa_store(&ct->fence_lookup, g2h_fence.seqno,
				      &g2h_fence, GFP_KERNEL));
		if (ret)
			return ret;

		goto retry_same_fence;
	} else if (unlikely(ret)) {
		if (ret == -EDEADLK)
			kick_reset(ct);

		if (no_fail && retry_failure(ct, ret))
			goto retry_same_fence;

		if (!g2h_fence_needs_alloc(&g2h_fence))
			xa_erase(&ct->fence_lookup, g2h_fence.seqno);

		return ret;
	}

	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
	if (!ret) {
		LNL_FLUSH_WORK(&ct->g2h_worker);
		if (g2h_fence.done) {
			xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
				   g2h_fence.seqno, action[0]);
			ret = 1;
		}
	}

	/*
	 * Ensure we serialize with the completion side to prevent UAF with the
	 * fence going out of scope on the stack, since we have no clue if it
	 * will fire after the timeout before we can erase it from the xa. Also
	 * we have some dependent loads and stores below for which we need the
	 * correct ordering, and we lack the needed barriers.
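	 * Taking ct->lock here pairs with the completion side:
	 * parse_g2h_response() writes the fence and wakes waiters while
	 * holding the same lock.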
1215 */ 1216 mutex_lock(&ct->lock); 1217 if (!ret) { 1218 xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s", 1219 g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done)); 1220 xa_erase(&ct->fence_lookup, g2h_fence.seqno); 1221 mutex_unlock(&ct->lock); 1222 return -ETIME; 1223 } 1224 1225 if (g2h_fence.retry) { 1226 xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n", 1227 action[0], g2h_fence.reason); 1228 mutex_unlock(&ct->lock); 1229 if (++retries > GUC_SEND_RETRY_LIMIT) { 1230 xe_gt_err(gt, "H2G action %#x reached retry limit=%u, aborting\n", 1231 action[0], GUC_SEND_RETRY_LIMIT); 1232 return -ELOOP; 1233 } 1234 msleep(GUC_SEND_RETRY_MSLEEP * retries); 1235 goto retry; 1236 } 1237 if (g2h_fence.fail) { 1238 if (g2h_fence.cancel) { 1239 xe_gt_dbg(gt, "H2G request %#x canceled!\n", action[0]); 1240 ret = -ECANCELED; 1241 goto unlock; 1242 } 1243 xe_gt_err(gt, "H2G request %#x failed: error %#x hint %#x\n", 1244 action[0], g2h_fence.error, g2h_fence.hint); 1245 ret = -EIO; 1246 } 1247 1248 if (ret > 0) 1249 ret = response_buffer ? g2h_fence.response_len : g2h_fence.response_data; 1250 1251 unlock: 1252 mutex_unlock(&ct->lock); 1253 1254 return ret; 1255 } 1256 1257 /** 1258 * xe_guc_ct_send_recv - Send and receive HXG to the GuC 1259 * @ct: the &xe_guc_ct 1260 * @action: the dword array with `HXG Request`_ message (can't be NULL) 1261 * @len: length of the `HXG Request`_ message (in dwords, can't be 0) 1262 * @response_buffer: placeholder for the `HXG Response`_ message (can be NULL) 1263 * 1264 * Send a `HXG Request`_ message to the GuC over CT communication channel and 1265 * blocks until GuC replies with a `HXG Response`_ message. 1266 * 1267 * For non-blocking communication with GuC use xe_guc_ct_send(). 1268 * 1269 * Note: The size of &response_buffer must be at least GUC_CTB_MAX_DWORDS_. 1270 * 1271 * Return: response length (in dwords) if &response_buffer was not NULL, or 1272 * DATA0 from `HXG Response`_ if &response_buffer was NULL, or 1273 * a negative error code on failure. 
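 *
 * The CT layer builds the CTB and HXG headers itself (see h2g_write()), so
 * action[0] only carries the action code (plus any DATA0 bits), followed by
 * the action-specific payload. A minimal, purely illustrative call, where
 * XE_GUC_ACTION_FOO and the arguments are placeholders:
 *
 *	u32 action[] = { XE_GUC_ACTION_FOO, arg0, arg1 };
 *	int ret = xe_guc_ct_send_recv(&guc->ct, action, ARRAY_SIZE(action), NULL);
 *
 * A non-negative return value is then DATA0 of the `HXG Response`_.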
1274 */ 1275 int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len, 1276 u32 *response_buffer) 1277 { 1278 KUNIT_STATIC_STUB_REDIRECT(xe_guc_ct_send_recv, ct, action, len, response_buffer); 1279 return guc_ct_send_recv(ct, action, len, response_buffer, false); 1280 } 1281 ALLOW_ERROR_INJECTION(xe_guc_ct_send_recv, ERRNO); 1282 1283 int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action, 1284 u32 len, u32 *response_buffer) 1285 { 1286 return guc_ct_send_recv(ct, action, len, response_buffer, true); 1287 } 1288 1289 static u32 *msg_to_hxg(u32 *msg) 1290 { 1291 return msg + GUC_CTB_MSG_MIN_LEN; 1292 } 1293 1294 static u32 msg_len_to_hxg_len(u32 len) 1295 { 1296 return len - GUC_CTB_MSG_MIN_LEN; 1297 } 1298 1299 static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len) 1300 { 1301 u32 *hxg = msg_to_hxg(msg); 1302 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1303 1304 lockdep_assert_held(&ct->lock); 1305 1306 switch (action) { 1307 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE: 1308 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE: 1309 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE: 1310 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1311 g2h_release_space(ct, len); 1312 } 1313 1314 return 0; 1315 } 1316 1317 static int guc_crash_process_msg(struct xe_guc_ct *ct, u32 action) 1318 { 1319 struct xe_gt *gt = ct_to_gt(ct); 1320 1321 if (action == XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED) 1322 xe_gt_err(gt, "GuC Crash dump notification\n"); 1323 else if (action == XE_GUC_ACTION_NOTIFY_EXCEPTION) 1324 xe_gt_err(gt, "GuC Exception notification\n"); 1325 else 1326 xe_gt_err(gt, "Unknown GuC crash notification: 0x%04X\n", action); 1327 1328 CT_DEAD(ct, NULL, CRASH); 1329 1330 kick_reset(ct); 1331 1332 return 0; 1333 } 1334 1335 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG) 1336 static void fast_req_report(struct xe_guc_ct *ct, u16 fence) 1337 { 1338 u16 fence_min = U16_MAX, fence_max = 0; 1339 struct xe_gt *gt = ct_to_gt(ct); 1340 bool found = false; 1341 unsigned int n; 1342 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) 1343 char *buf; 1344 #endif 1345 1346 lockdep_assert_held(&ct->lock); 1347 1348 for (n = 0; n < ARRAY_SIZE(ct->fast_req); n++) { 1349 if (ct->fast_req[n].fence < fence_min) 1350 fence_min = ct->fast_req[n].fence; 1351 if (ct->fast_req[n].fence > fence_max) 1352 fence_max = ct->fast_req[n].fence; 1353 1354 if (ct->fast_req[n].fence != fence) 1355 continue; 1356 found = true; 1357 1358 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC) 1359 buf = kmalloc(SZ_4K, GFP_NOWAIT); 1360 if (buf && stack_depot_snprint(ct->fast_req[n].stack, buf, SZ_4K, 0)) 1361 xe_gt_err(gt, "Fence 0x%x was used by action %#04x sent at:\n%s", 1362 fence, ct->fast_req[n].action, buf); 1363 else 1364 xe_gt_err(gt, "Fence 0x%x was used by action %#04x [failed to retrieve stack]\n", 1365 fence, ct->fast_req[n].action); 1366 kfree(buf); 1367 #else 1368 xe_gt_err(gt, "Fence 0x%x was used by action %#04x\n", 1369 fence, ct->fast_req[n].action); 1370 #endif 1371 break; 1372 } 1373 1374 if (!found) 1375 xe_gt_warn(gt, "Fence 0x%x not found - tracking buffer wrapped? 
[range = 0x%x -> 0x%x, next = 0x%X]\n", 1376 fence, fence_min, fence_max, ct->fence_seqno); 1377 } 1378 #else 1379 static void fast_req_report(struct xe_guc_ct *ct, u16 fence) 1380 { 1381 } 1382 #endif 1383 1384 static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len) 1385 { 1386 struct xe_gt *gt = ct_to_gt(ct); 1387 u32 *hxg = msg_to_hxg(msg); 1388 u32 hxg_len = msg_len_to_hxg_len(len); 1389 u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]); 1390 u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]); 1391 struct g2h_fence *g2h_fence; 1392 1393 lockdep_assert_held(&ct->lock); 1394 1395 /* 1396 * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup. 1397 * Those messages should never fail, so if we do get an error back it 1398 * means we're likely doing an illegal operation and the GuC is 1399 * rejecting it. We have no way to inform the code that submitted the 1400 * H2G that the message was rejected, so we need to escalate the 1401 * failure to trigger a reset. 1402 */ 1403 if (fence & CT_SEQNO_UNTRACKED) { 1404 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) 1405 xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n", 1406 fence, 1407 FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]), 1408 FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0])); 1409 else 1410 xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n", 1411 type, fence); 1412 1413 fast_req_report(ct, fence); 1414 1415 /* FIXME: W/A race in the GuC, will get in firmware soon */ 1416 if (xe_gt_recovery_pending(gt)) 1417 return 0; 1418 1419 CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE); 1420 1421 return -EPROTO; 1422 } 1423 1424 g2h_fence = xa_erase(&ct->fence_lookup, fence); 1425 if (unlikely(!g2h_fence)) { 1426 /* Don't tear down channel, as send could've timed out */ 1427 /* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */ 1428 xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence); 1429 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); 1430 return 0; 1431 } 1432 1433 xe_gt_assert(gt, fence == g2h_fence->seqno); 1434 1435 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) { 1436 g2h_fence->fail = true; 1437 g2h_fence->error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]); 1438 g2h_fence->hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]); 1439 } else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) { 1440 g2h_fence->retry = true; 1441 g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]); 1442 } else if (g2h_fence->response_buffer) { 1443 g2h_fence->response_len = hxg_len; 1444 memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32)); 1445 } else { 1446 g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]); 1447 } 1448 1449 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN); 1450 1451 g2h_fence->done = true; 1452 smp_mb(); 1453 1454 wake_up_all(&ct->g2h_fence_wq); 1455 1456 return 0; 1457 } 1458 1459 static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) 1460 { 1461 struct xe_gt *gt = ct_to_gt(ct); 1462 u32 *hxg = msg_to_hxg(msg); 1463 u32 origin, type; 1464 int ret; 1465 1466 lockdep_assert_held(&ct->lock); 1467 1468 origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg[0]); 1469 if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) { 1470 xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n", 1471 origin); 1472 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN); 1473 1474 return -EPROTO; 1475 } 1476 1477 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]); 1478 switch (type) { 1479 case GUC_HXG_TYPE_EVENT: 1480 ret = parse_g2h_event(ct, msg, len); 1481 break; 1482 case 
GUC_HXG_TYPE_RESPONSE_SUCCESS: 1483 case GUC_HXG_TYPE_RESPONSE_FAILURE: 1484 case GUC_HXG_TYPE_NO_RESPONSE_RETRY: 1485 ret = parse_g2h_response(ct, msg, len); 1486 break; 1487 default: 1488 xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n", 1489 type); 1490 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE); 1491 1492 ret = -EOPNOTSUPP; 1493 } 1494 1495 return ret; 1496 } 1497 1498 static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) 1499 { 1500 struct xe_guc *guc = ct_to_guc(ct); 1501 struct xe_gt *gt = ct_to_gt(ct); 1502 u32 hxg_len = msg_len_to_hxg_len(len); 1503 u32 *hxg = msg_to_hxg(msg); 1504 u32 action, adj_len; 1505 u32 *payload; 1506 int ret = 0; 1507 1508 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT) 1509 return 0; 1510 1511 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1512 payload = hxg + GUC_HXG_EVENT_MSG_MIN_LEN; 1513 adj_len = hxg_len - GUC_HXG_EVENT_MSG_MIN_LEN; 1514 1515 switch (action) { 1516 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE: 1517 ret = xe_guc_sched_done_handler(guc, payload, adj_len); 1518 break; 1519 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE: 1520 ret = xe_guc_deregister_done_handler(guc, payload, adj_len); 1521 break; 1522 case XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION: 1523 ret = xe_guc_exec_queue_reset_handler(guc, payload, adj_len); 1524 break; 1525 case XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION: 1526 ret = xe_guc_exec_queue_reset_failure_handler(guc, payload, 1527 adj_len); 1528 break; 1529 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE: 1530 /* Selftest only at the moment */ 1531 break; 1532 case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION: 1533 ret = xe_guc_error_capture_handler(guc, payload, adj_len); 1534 break; 1535 case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE: 1536 /* FIXME: Handle this */ 1537 break; 1538 case XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR: 1539 ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload, 1540 adj_len); 1541 break; 1542 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: 1543 ret = xe_guc_pagefault_handler(guc, payload, adj_len); 1544 break; 1545 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1546 ret = xe_guc_tlb_inval_done_handler(guc, payload, adj_len); 1547 break; 1548 case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY: 1549 ret = xe_guc_access_counter_notify_handler(guc, payload, 1550 adj_len); 1551 break; 1552 case XE_GUC_ACTION_GUC2PF_RELAY_FROM_VF: 1553 ret = xe_guc_relay_process_guc2pf(&guc->relay, hxg, hxg_len); 1554 break; 1555 case XE_GUC_ACTION_GUC2VF_RELAY_FROM_PF: 1556 ret = xe_guc_relay_process_guc2vf(&guc->relay, hxg, hxg_len); 1557 break; 1558 case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY: 1559 ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len); 1560 break; 1561 case GUC_ACTION_GUC2PF_ADVERSE_EVENT: 1562 ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len); 1563 break; 1564 case XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED: 1565 case XE_GUC_ACTION_NOTIFY_EXCEPTION: 1566 ret = guc_crash_process_msg(ct, action); 1567 break; 1568 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) 1569 case XE_GUC_ACTION_TEST_G2G_RECV: 1570 ret = xe_guc_g2g_test_notification(guc, payload, adj_len); 1571 break; 1572 #endif 1573 default: 1574 xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action); 1575 } 1576 1577 if (ret) { 1578 xe_gt_err(gt, "G2H action %#04x failed (%pe) len %u msg %*ph\n", 1579 action, ERR_PTR(ret), hxg_len, (int)sizeof(u32) * hxg_len, hxg); 1580 CT_DEAD(ct, NULL, PROCESS_FAILED); 1581 } 1582 1583 return 0; 1584 } 1585 1586 static int g2h_read(struct xe_guc_ct *ct, u32 *msg, 
bool fast_path) 1587 { 1588 struct xe_device *xe = ct_to_xe(ct); 1589 struct xe_gt *gt = ct_to_gt(ct); 1590 struct guc_ctb *g2h = &ct->ctbs.g2h; 1591 u32 tail, head, len, desc_status; 1592 s32 avail; 1593 u32 action; 1594 u32 *hxg; 1595 1596 xe_gt_assert(gt, xe_guc_ct_initialized(ct)); 1597 lockdep_assert_held(&ct->fast_lock); 1598 1599 if (ct->state == XE_GUC_CT_STATE_DISABLED) 1600 return -ENODEV; 1601 1602 if (ct->state == XE_GUC_CT_STATE_STOPPED) 1603 return -ECANCELED; 1604 1605 if (g2h->info.broken) 1606 return -EPIPE; 1607 1608 xe_gt_assert(gt, xe_guc_ct_enabled(ct)); 1609 1610 desc_status = desc_read(xe, g2h, status); 1611 if (desc_status) { 1612 if (desc_status & GUC_CTB_STATUS_DISABLED) { 1613 /* 1614 * Potentially valid if a CLIENT_RESET request resulted in 1615 * contexts/engines being reset. But should never happen as 1616 * no contexts should be active when CLIENT_RESET is sent. 1617 */ 1618 xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n"); 1619 desc_status &= ~GUC_CTB_STATUS_DISABLED; 1620 } 1621 1622 if (desc_status) { 1623 xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status); 1624 goto corrupted; 1625 } 1626 } 1627 1628 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) { 1629 u32 desc_tail = desc_read(xe, g2h, tail); 1630 /* 1631 u32 desc_head = desc_read(xe, g2h, head); 1632 1633 * info.head and desc_head are updated back-to-back at the end of 1634 * this function and nowhere else. Hence, they cannot be different 1635 * unless two g2h_read calls are running concurrently. Which is not 1636 * possible because it is guarded by ct->fast_lock. And yet, some 1637 * discrete platforms are regularly hitting this error :(. 1638 * 1639 * desc_head rolling backwards shouldn't cause any noticeable 1640 * problems - just a delay in GuC being allowed to proceed past that 1641 * point in the queue. So for now, just disable the error until it 1642 * can be root caused. 
1643 * 1644 if (g2h->info.head != desc_head) { 1645 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH); 1646 xe_gt_err(gt, "CT read: head was modified %u != %u\n", 1647 desc_head, g2h->info.head); 1648 goto corrupted; 1649 } 1650 */ 1651 1652 if (g2h->info.head > g2h->info.size) { 1653 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 1654 xe_gt_err(gt, "CT read: head out of range: %u vs %u\n", 1655 g2h->info.head, g2h->info.size); 1656 goto corrupted; 1657 } 1658 1659 if (desc_tail >= g2h->info.size) { 1660 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW); 1661 xe_gt_err(gt, "CT read: invalid tail offset %u >= %u)\n", 1662 desc_tail, g2h->info.size); 1663 goto corrupted; 1664 } 1665 } 1666 1667 /* Calculate DW available to read */ 1668 tail = desc_read(xe, g2h, tail); 1669 avail = tail - g2h->info.head; 1670 if (unlikely(avail == 0)) 1671 return 0; 1672 1673 if (avail < 0) 1674 avail += g2h->info.size; 1675 1676 /* Read header */ 1677 xe_map_memcpy_from(xe, msg, &g2h->cmds, sizeof(u32) * g2h->info.head, 1678 sizeof(u32)); 1679 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN; 1680 if (len > avail) { 1681 xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n", 1682 avail, len); 1683 goto corrupted; 1684 } 1685 1686 head = (g2h->info.head + 1) % g2h->info.size; 1687 avail = len - 1; 1688 1689 /* Read G2H message */ 1690 if (avail + head > g2h->info.size) { 1691 u32 avail_til_wrap = g2h->info.size - head; 1692 1693 xe_map_memcpy_from(xe, msg + 1, 1694 &g2h->cmds, sizeof(u32) * head, 1695 avail_til_wrap * sizeof(u32)); 1696 xe_map_memcpy_from(xe, msg + 1 + avail_til_wrap, 1697 &g2h->cmds, 0, 1698 (avail - avail_til_wrap) * sizeof(u32)); 1699 } else { 1700 xe_map_memcpy_from(xe, msg + 1, 1701 &g2h->cmds, sizeof(u32) * head, 1702 avail * sizeof(u32)); 1703 } 1704 1705 hxg = msg_to_hxg(msg); 1706 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1707 1708 if (fast_path) { 1709 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT) 1710 return 0; 1711 1712 switch (action) { 1713 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: 1714 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1715 break; /* Process these in fast-path */ 1716 default: 1717 return 0; 1718 } 1719 } 1720 1721 /* Update local / descriptor header */ 1722 g2h->info.head = (head + avail) % g2h->info.size; 1723 desc_write(xe, g2h, head, g2h->info.head); 1724 1725 trace_xe_guc_ctb_g2h(xe, ct_to_gt(ct)->info.id, 1726 action, len, g2h->info.head, tail); 1727 1728 return len; 1729 1730 corrupted: 1731 CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ); 1732 return -EPROTO; 1733 } 1734 1735 static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len) 1736 { 1737 struct xe_gt *gt = ct_to_gt(ct); 1738 struct xe_guc *guc = ct_to_guc(ct); 1739 u32 hxg_len = msg_len_to_hxg_len(len); 1740 u32 *hxg = msg_to_hxg(msg); 1741 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]); 1742 u32 *payload = hxg + GUC_HXG_MSG_MIN_LEN; 1743 u32 adj_len = hxg_len - GUC_HXG_MSG_MIN_LEN; 1744 int ret = 0; 1745 1746 switch (action) { 1747 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC: 1748 ret = xe_guc_pagefault_handler(guc, payload, adj_len); 1749 break; 1750 case XE_GUC_ACTION_TLB_INVALIDATION_DONE: 1751 __g2h_release_space(ct, len); 1752 ret = xe_guc_tlb_inval_done_handler(guc, payload, adj_len); 1753 break; 1754 default: 1755 xe_gt_warn(gt, "NOT_POSSIBLE"); 1756 } 1757 1758 if (ret) { 1759 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n", 1760 action, 
ERR_PTR(ret)); 1761 CT_DEAD(ct, NULL, FAST_G2H); 1762 } 1763 } 1764 1765 /** 1766 * xe_guc_ct_fast_path - process critical G2H in the IRQ handler 1767 * @ct: GuC CT object 1768 * 1769 * Anything related to page faults is critical for performance, process these 1770 * critical G2H in the IRQ. This is safe as these handlers either just wake up 1771 * waiters or queue another worker. 1772 */ 1773 void xe_guc_ct_fast_path(struct xe_guc_ct *ct) 1774 { 1775 struct xe_device *xe = ct_to_xe(ct); 1776 bool ongoing; 1777 int len; 1778 1779 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct)); 1780 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL) 1781 return; 1782 1783 spin_lock(&ct->fast_lock); 1784 do { 1785 len = g2h_read(ct, ct->fast_msg, true); 1786 if (len > 0) 1787 g2h_fast_path(ct, ct->fast_msg, len); 1788 } while (len > 0); 1789 spin_unlock(&ct->fast_lock); 1790 1791 if (ongoing) 1792 xe_pm_runtime_put(xe); 1793 } 1794 1795 /* Returns less than zero on error, 0 on done, 1 on more available */ 1796 static int dequeue_one_g2h(struct xe_guc_ct *ct) 1797 { 1798 int len; 1799 int ret; 1800 1801 lockdep_assert_held(&ct->lock); 1802 1803 spin_lock_irq(&ct->fast_lock); 1804 len = g2h_read(ct, ct->msg, false); 1805 spin_unlock_irq(&ct->fast_lock); 1806 if (len <= 0) 1807 return len; 1808 1809 ret = parse_g2h_msg(ct, ct->msg, len); 1810 if (unlikely(ret < 0)) 1811 return ret; 1812 1813 ret = process_g2h_msg(ct, ct->msg, len); 1814 if (unlikely(ret < 0)) 1815 return ret; 1816 1817 return 1; 1818 } 1819 1820 static void receive_g2h(struct xe_guc_ct *ct) 1821 { 1822 bool ongoing; 1823 int ret; 1824 1825 /* 1826 * Normal users must always hold mem_access.ref around CT calls. However 1827 * during the runtime pm callbacks we rely on CT to talk to the GuC, but 1828 * at this stage we can't rely on mem_access.ref and even the 1829 * callback_task will be different than current. For such cases we just 1830 * need to ensure we always process the responses from any blocking 1831 * ct_send requests or where we otherwise expect some response when 1832 * initiated from those callbacks (which will need to wait for the below 1833 * dequeue_one_g2h()). The dequeue_one_g2h() will gracefully fail if 1834 * the device has suspended to the point that the CT communication has 1835 * been disabled. 1836 * 1837 * If we are inside the runtime pm callback, we can be the only task 1838 * still issuing CT requests (since that requires having the 1839 * mem_access.ref). It seems like it might in theory be possible to 1840 * receive unsolicited events from the GuC just as we are 1841 * suspending-resuming, but those will currently anyway be lost when 1842 * eventually exiting from suspend, hence no need to wake up the device 1843 * here. If we ever need something stronger than get_if_ongoing() then 1844 * we need to be careful with blocking the pm callbacks from getting CT 1845 * responses, if the worker here is blocked on those callbacks 1846 * completing, creating a deadlock. 
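	 * The same pattern (xe_pm_runtime_get_if_active() plus the pm
	 * callback_task check) is used by xe_guc_ct_fast_path() for the G2H
	 * messages handled directly from the IRQ handler.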
1847 */ 1848 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct)); 1849 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL) 1850 return; 1851 1852 do { 1853 mutex_lock(&ct->lock); 1854 ret = dequeue_one_g2h(ct); 1855 mutex_unlock(&ct->lock); 1856 1857 if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) { 1858 xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret); 1859 CT_DEAD(ct, NULL, G2H_RECV); 1860 kick_reset(ct); 1861 } 1862 } while (ret == 1); 1863 1864 if (ongoing) 1865 xe_pm_runtime_put(ct_to_xe(ct)); 1866 } 1867 1868 static void g2h_worker_func(struct work_struct *w) 1869 { 1870 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, g2h_worker); 1871 1872 receive_g2h(ct); 1873 } 1874 1875 static struct xe_guc_ct_snapshot *guc_ct_snapshot_alloc(struct xe_guc_ct *ct, bool atomic, 1876 bool want_ctb) 1877 { 1878 struct xe_guc_ct_snapshot *snapshot; 1879 1880 snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL); 1881 if (!snapshot) 1882 return NULL; 1883 1884 if (ct->bo && want_ctb) { 1885 snapshot->ctb_size = xe_bo_size(ct->bo); 1886 snapshot->ctb = kmalloc(snapshot->ctb_size, atomic ? GFP_ATOMIC : GFP_KERNEL); 1887 } 1888 1889 return snapshot; 1890 } 1891 1892 static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb, 1893 struct guc_ctb_snapshot *snapshot) 1894 { 1895 xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0, 1896 sizeof(struct guc_ct_buffer_desc)); 1897 memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info)); 1898 } 1899 1900 static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot, 1901 struct drm_printer *p) 1902 { 1903 drm_printf(p, "\tsize: %d\n", snapshot->info.size); 1904 drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space); 1905 drm_printf(p, "\thead: %d\n", snapshot->info.head); 1906 drm_printf(p, "\ttail: %d\n", snapshot->info.tail); 1907 drm_printf(p, "\tspace: %d\n", snapshot->info.space); 1908 drm_printf(p, "\tbroken: %d\n", snapshot->info.broken); 1909 drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head); 1910 drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail); 1911 drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status); 1912 } 1913 1914 static struct xe_guc_ct_snapshot *guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic, 1915 bool want_ctb) 1916 { 1917 struct xe_device *xe = ct_to_xe(ct); 1918 struct xe_guc_ct_snapshot *snapshot; 1919 1920 snapshot = guc_ct_snapshot_alloc(ct, atomic, want_ctb); 1921 if (!snapshot) { 1922 xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n"); 1923 return NULL; 1924 } 1925 1926 if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) { 1927 snapshot->ct_enabled = true; 1928 snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding); 1929 guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, &snapshot->h2g); 1930 guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, &snapshot->g2h); 1931 } 1932 1933 if (ct->bo && snapshot->ctb) 1934 xe_map_memcpy_from(xe, snapshot->ctb, &ct->bo->vmap, 0, snapshot->ctb_size); 1935 1936 return snapshot; 1937 } 1938 1939 /** 1940 * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state. 1941 * @ct: GuC CT object. 1942 * 1943 * This can be printed out in a later stage like during dev_coredump 1944 * analysis. This is safe to be called during atomic context. 1945 * 1946 * Returns: a GuC CT snapshot object that must be freed by the caller 1947 * by using `xe_guc_ct_snapshot_free`. 
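 *
 * The usual flow (see xe_guc_ct_print() and the dead-CT handling) is to
 * capture here, print later with xe_guc_ct_snapshot_print() and then release
 * the snapshot with xe_guc_ct_snapshot_free().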
 */
struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct)
{
	return guc_ct_snapshot_capture(ct, true, true);
}

/**
 * xe_guc_ct_snapshot_print - Print out a given GuC CT snapshot.
 * @snapshot: GuC CT snapshot object.
 * @p: drm_printer where it will be printed out.
 *
 * This function prints out a given GuC CT snapshot object.
 */
void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
			      struct drm_printer *p)
{
	if (!snapshot)
		return;

	if (snapshot->ct_enabled) {
		drm_puts(p, "H2G CTB (all sizes in DW):\n");
		guc_ctb_snapshot_print(&snapshot->h2g, p);

		drm_puts(p, "G2H CTB (all sizes in DW):\n");
		guc_ctb_snapshot_print(&snapshot->g2h, p);
		drm_printf(p, "\tg2h outstanding: %d\n",
			   snapshot->g2h_outstanding);

		if (snapshot->ctb) {
			drm_printf(p, "[CTB].length: 0x%zx\n", snapshot->ctb_size);
			xe_print_blob_ascii85(p, "[CTB].data", '\n',
					      snapshot->ctb, 0, snapshot->ctb_size);
		}
	} else {
		drm_puts(p, "CT disabled\n");
	}
}

/**
 * xe_guc_ct_snapshot_free - Free all allocated objects for a given snapshot.
 * @snapshot: GuC CT snapshot object.
 *
 * This function frees all the memory that was allocated at capture time.
 */
void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kfree(snapshot->ctb);
	kfree(snapshot);
}

/**
 * xe_guc_ct_print - GuC CT Print.
 * @ct: GuC CT.
 * @p: drm_printer where it will be printed out.
 * @want_ctb: Should the full CTB content be dumped (vs just the headers)
 *
 * This function will quickly capture a snapshot of the CT state
 * and immediately print it out.
 */
void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb)
{
	struct xe_guc_ct_snapshot *snapshot;

	snapshot = guc_ct_snapshot_capture(ct, false, want_ctb);
	xe_guc_ct_snapshot_print(snapshot, p);
	xe_guc_ct_snapshot_free(snapshot);
}

#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
/*
 * This is a helper function which assists the driver in identifying if a fault
 * injection test is currently active, allowing it to reduce unnecessary debug
 * output. Typically, the function returns zero, but the fault injection
 * framework can alter this to return an error. Since faults are injected
 * through this function, it's important to ensure the compiler doesn't
 * optimize it into an inline function. To avoid such optimization, the
 * 'noinline' attribute is applied, since the compiler would otherwise
 * optimize a static function defined in a header file into an inline one.
 */
noinline int xe_is_injection_active(void) { return 0; }
ALLOW_ERROR_INJECTION(xe_is_injection_active, ERRNO);
#else
int xe_is_injection_active(void) { return 0; }
#endif

static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code)
{
	struct xe_guc_log_snapshot *snapshot_log;
	struct xe_guc_ct_snapshot *snapshot_ct;
	struct xe_guc *guc = ct_to_guc(ct);
	unsigned long flags;
	bool have_capture;

	if (ctb)
		ctb->info.broken = true;

	/*
	 * A huge dump gets generated when an error is injected into the GuC
	 * CT/MMIO functions, so suppress the dump while fault injection is
	 * active.
	 */
	if (xe_is_injection_active())
		return;

	/* Ignore further errors after the first dump until a reset */
	if (ct->dead.reported)
		return;

	spin_lock_irqsave(&ct->dead.lock, flags);

	/* And only capture one dump at a time */
	have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE);
	ct->dead.reason |= (1 << reason_code) |
			   (1 << CT_DEAD_STATE_CAPTURE);

	spin_unlock_irqrestore(&ct->dead.lock, flags);

	if (have_capture)
		return;

	snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true);
	snapshot_ct = xe_guc_ct_snapshot_capture(ct);

	spin_lock_irqsave(&ct->dead.lock, flags);

	if (ct->dead.snapshot_log || ct->dead.snapshot_ct) {
		xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n");
		xe_guc_log_snapshot_free(snapshot_log);
		xe_guc_ct_snapshot_free(snapshot_ct);
	} else {
		ct->dead.snapshot_log = snapshot_log;
		ct->dead.snapshot_ct = snapshot_ct;
	}

	spin_unlock_irqrestore(&ct->dead.lock, flags);

	queue_work(system_unbound_wq, &ct->dead.worker);
}

static void ct_dead_print(struct xe_dead_ct *dead)
{
	struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead);
	struct xe_device *xe = ct_to_xe(ct);
	struct xe_gt *gt = ct_to_gt(ct);
	static int g_count;
	struct drm_printer ip = xe_gt_info_printer(gt);
	struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count);

	if (!dead->reason) {
		xe_gt_err(gt, "CTB is dead for no reason!?\n");
		return;
	}

	/* Can't generate a genuine core dump at this point, so just do the good bits */
	drm_puts(&lp, "**** Xe Device Coredump ****\n");
	drm_printf(&lp, "Reason: CTB is dead - 0x%X\n", dead->reason);
	xe_device_snapshot_print(xe, &lp);

	drm_printf(&lp, "**** GT #%d ****\n", gt->info.id);
	drm_printf(&lp, "\tTile: %d\n", gt->tile->id);

	drm_puts(&lp, "**** GuC Log ****\n");
	xe_guc_log_snapshot_print(dead->snapshot_log, &lp);

	drm_puts(&lp, "**** GuC CT ****\n");
	xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp);

	drm_puts(&lp, "Done.\n");
}

static void ct_dead_worker_func(struct work_struct *w)
{
	struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);

	if (!ct->dead.reported) {
		ct->dead.reported = true;
		ct_dead_print(&ct->dead);
	}

	spin_lock_irq(&ct->dead.lock);

	xe_guc_log_snapshot_free(ct->dead.snapshot_log);
	ct->dead.snapshot_log = NULL;
	xe_guc_ct_snapshot_free(ct->dead.snapshot_ct);
	ct->dead.snapshot_ct = NULL;

	if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) {
		/* A reset has occurred so re-arm the error reporting */
		ct->dead.reason = 0;
		ct->dead.reported = false;
	}

	spin_unlock_irq(&ct->dead.lock);
}
#endif