// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>
#include <linux/panic.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

/* Context descriptor fields (the u64 handed to the submission backend) */
#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

/* Sizes of the fixed 4K sub-regions inside the LRC BO (see layout below) */
#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

/* Context-descriptor priority field and its encodings */
#define LRC_PRIORITY				GENMASK_ULL(10, 9)
#define LRC_PRIORITY_LOW			0
#define LRC_PRIORITY_NORMAL			1
#define LRC_PRIORITY_HIGH			2

/*
 * Layout of the LRC and associated data allocated as
 * lrc->bo:
 *
 * Region                       Size
 * +============================+=================================+ <- __xe_lrc_ring_offset()
 * | Ring                       | ring_size, see                  |
 * |                            | xe_lrc_init()                   |
 * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
 * | PPHWSP (includes SW state) | 4K                              |
 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
 * | Engine Context Image       | n * 4K, see                     |
 * |                            | xe_gt_lrc_size()                |
 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
 * | Indirect Ring State Page   | 0 or 4k, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
 * | Indirect Context Page      | 0 or 4k, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_CTX        |
 * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
 * | WA BB Per Ctx              | 4k                              |
 * +============================+=================================+ <- xe_bo_size(lrc->bo)
 */

/* Resolve the owning &xe_device through the fence context's GT */
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

/*
 * Decide whether an engine class on this GT needs an indirect (mid-restore)
 * context page: either because WA 16010904313 applies to RCS/CCS, or because
 * a user supplied a mid-context-restore batch via configfs (validation only).
 */
static bool
gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);

	if (XE_GT_WA(gt, 16010904313) &&
	    (class == XE_ENGINE_CLASS_RENDER ||
	     class == XE_ENGINE_CLASS_COMPUTE))
		return true;

	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
					       class, NULL))
		return true;

	return false;
}

/**
 * xe_gt_lrc_hang_replay_size() - Hang replay size
 * @gt: The GT
 * @class: Hardware engine class
 *
 * Determine size of GPU hang replay state for a GT and hardware engine class.
 *
 * Return: Size of GPU hang replay size
 */
size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size = 0;

	/* Engine context image */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VER(xe) >= 20)
			size += 2 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	return size;
}

/*
 * Total LRC size for a GT/engine class: context image plus the PPHWSP and,
 * where supported, the indirect ring state page. Note this does NOT include
 * the ring, indirect context page or WA BB (see the BO layout above).
 */
size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	size_t size = xe_gt_lrc_hang_replay_size(gt, class);

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size + LRC_PPHWSP_SIZE;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/* Variable-length offset: high bit set => more bytes follow */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Write register address only; value slot is left for HW */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/* Register layout shared by all Xe2+ engine classes */
#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

/*
 * Pick the context-image offset table matching the platform's graphics IP
 * version and the engine class (render / copy / everything else).
 */
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

/* Program CTX_CONTEXT_CONTROL in the context image (masked-bit register) */
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}

/*
 * When the device uses memory-based interrupts, patch the context image so the
 * context restore loads the IMR from memory and reports interrupt status/source
 * to the per-engine memirq pages. No-op on devices using register-based IRQs.
 */
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}

/* Dword index of the MI_MODE register pair within the context image */
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

/* Clear STOP_RING in the image's saved MI_MODE (masked-bit write) */
static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

/* The ring is the first region in the BO (see layout diagram above) */
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

/* PPHWSP immediately follows the ring */
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

/* Driver-defined slots inside the SW portion of the PPHWSP (byte offsets) */
#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

/* The register state (context image) follows the PPHWSP */
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

/**
 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
 * @xe: the &xe_device struct instance
 *
 * Returns: Size of the LRC registers area for current platform
 */
size_t xe_lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

/* Bytes to skip over PPHWSP + register state to reach engine-specific state */
size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

/*
 * The optional indirect ring state page sits just before the indirect context
 * page (if present) and the WA BB at the end of the BO.
 */
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
		     LRC_INDIRECT_RING_STATE_SIZE;

	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
		offset -= LRC_INDIRECT_CTX_BO_SIZE;

	return offset;
}

static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

/* The WA BB is always the last LRC_WA_BB_SIZE bytes of the BO */
static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}

/*
 * Generate __xe_lrc_<elem>_map() (an iosys_map pointing at the element) and
 * __xe_lrc_<elem>_ggtt_addr() (its GGTT address) from __xe_lrc_<elem>_offset().
 */
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
DECL_MAP_ADDR_HELPERS(engine_id)

#undef DECL_MAP_ADDR_HELPERS

/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	/* Upper dword only exists on platforms with a 64-bit timestamp */
	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

/* The LRC's GGTT address (as handed to HW) points at the PPHWSP */
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

/* Read dword @reg_nr of the indirect ring state page */
static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

/* Write dword @reg_nr of the indirect ring state page */
static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

/* Read dword @reg_nr of the context register state */
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

/* Write dword @reg_nr of the context register state */
void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

/*
 * Build a freshly-initialized ("empty") LRC data image on the heap: zeroed
 * PPHWSP followed by a context image with commands/offsets, context control,
 * memirq plumbing and STOP_RING cleared; plus the indirect ring state where
 * the GT supports it. Caller owns (and must kfree) the returned buffer.
 */
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

/**
 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
 * of given engine.
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}

/**
 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
 * for given LRC.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @regs: scratch buffer to be used as temporary storage
 */
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
					    u32 *regs)
{
	struct xe_gt *gt = hwe->gt;
	struct iosys_map map;
	size_t regs_len;

	if (!xe_device_uses_memirq(gt_to_xe(gt)))
		return;

	/* Copy regs out, patch the memirq pointers, copy back */
	map = __xe_lrc_regs_map(lrc);
	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
	set_memory_based_intr(regs, hwe);
	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
}

/* Point the context's PDP0 at the VM's page-directory descriptor */
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

/* Tear down the fence context and release the backing BO */
static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
}

/*
 * wa_bb_setup_utilization() - Write commands to wa bb to assist
 * in calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHSWP.
 */
#define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
				    struct xe_hw_engine *hwe,
				    u32 *batch,
				    size_t max_len)
{
	u32 *cmd = batch;

	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	/* Save the engine we are scheduled on into the PPHWSP */
	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	/* Mark the context as active by seeding CTX_TIMESTAMP with 1 */
	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	return cmd - batch;
}

/*
 * WA 16010904313: reload CTX_TIMESTAMP from the LRC several times (twice
 * async, once sync) on affected engine classes. Returns dwords written,
 * 0 if not applicable, or -ENOSPC if @max_len is too small.
 */
static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				  u32 *batch, size_t max_len)
{
	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 16010904313) ||
	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
	      hwe->class == XE_ENGINE_CLASS_COPY ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	return cmd - batch;
}

/*
 * Copy a user-provided post-context-restore batch from configfs into the WA
 * BB. Validation-only: taints the kernel since arbitrary commands run in the
 * context-switch path.
 */
static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
						  struct xe_hw_engine *hwe,
						  u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
						    hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

/*
 * Same as above but for the mid-context-restore batch (runs via the indirect
 * context page rather than BB_PER_CTX_PTR).
 */
static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
						 struct xe_hw_engine *hwe,
						 u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
						   hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

/*
 * WA 18022495364: invalidate the instruction state cache on RCS context
 * restore. Returns dwords written, 0 if not applicable, -ENOSPC on overflow.
 */
static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
					       struct xe_hw_engine *hwe,
					       u32 *batch, size_t max_len)
{
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 18022495364) ||
	    hwe->class != XE_ENGINE_CLASS_RENDER)
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
	*cmd++ = CS_DEBUG_MODE2(0).addr;
	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);

	return cmd - batch;
}

/* One emit callback; returns dwords written, 0 to skip, or negative errno */
struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};

struct bo_setup_state {
	/* Input: */
	struct xe_lrc *lrc;
	struct xe_hw_engine *hwe;
	size_t max_size;		/* bytes available at @offset */
	size_t reserve_dw;		/* dwords that must remain unused */
	unsigned int offset;		/* byte offset into lrc->bo */
	const struct bo_setup *funcs;
	unsigned int num_funcs;

	/* State: */
	u32 *buffer;			/* bounce buffer when BO is iomem */
	u32 *ptr;			/* current write position */
	unsigned int written;		/* dwords emitted so far */
};

/*
 * Run all @funcs, emitting either directly into the BO's CPU mapping or
 * into the bounce buffer (when the mapping is iomem; see finish_bo()).
 */
static int setup_bo(struct bo_setup_state *state)
{
	ssize_t remain;

	if (state->lrc->bo->vmap.is_iomem) {
		xe_gt_assert(state->hwe->gt, state->buffer);
		state->ptr = state->buffer;
	} else {
		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
	}

	remain = state->max_size / sizeof(u32);

	for (size_t i = 0; i < state->num_funcs; i++) {
		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
						    state->ptr, remain);

		remain -= len;

		/*
		 * Caller has asked for at least reserve_dw to remain unused.
		 */
		if (len < 0 ||
		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
			goto fail;

		state->ptr += len;
		state->written += len;
	}

	return 0;

fail:
	return -ENOSPC;
}

/* Flush the bounce buffer into the BO when the mapping is iomem */
static void finish_bo(struct bo_setup_state *state)
{
	if (!state->lrc->bo->vmap.is_iomem)
		return;

	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
			 state->offset, state->buffer,
			 state->written * sizeof(u32));
}

/**
 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @scratch: preallocated scratch buffer for temporary storage
 * Return: 0 on success, negative error code on failure
 */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		/* Keep one dword free for the MI_BATCH_BUFFER_END below */
		.reserve_dw = 1,
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	/* Terminate the batch; reserve_dw above guaranteed room for this */
	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	/*
	 * NOTE(review): the "+ 1" appears to set a flag bit in the low bits
	 * of BB_PER_CTX_PTR (presumably "BB valid/enable") -- confirm
	 * against the register layout in xe_lrc_layout.h / the PRM.
	 */
	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}

/*
 * Set up the per-context WA batch buffer, allocating a temporary bounce
 * buffer when the LRC BO is iomem (not directly CPU-writable).
 */
static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	u32 *buf = NULL;
	int ret;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);

	kfree(buf);

	return ret;
}

/*
 * Populate the indirect context page and enable it in the LRC, when
 * XE_LRC_FLAG_INDIRECT_CTX is set.  Render/compute engines additionally get
 * the timestamp workaround; all classes get the configfs mid-ctx-restore
 * batch (if configured).
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 * (16 dwords == 64 bytes, hence the 0xf mask.)
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}

/* Map a multi-queue priority onto the value encoded in the LRC descriptor. */
static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}

/**
 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
 * @lrc: Logical Ring Context
 * @priority: Multi queue priority of the exec queue
 *
 * Convert @priority to LRC multi queue priority and update the @lrc descriptor
 */
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	lrc->desc &= ~LRC_PRIORITY;
	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}

/*
 * Initialize a freshly allocated LRC: size and allocate its BO (ring +
 * context image + WA BB, plus the optional indirect context page), prime the
 * PPHWSP / context image, program the ring registers and context descriptor,
 * then set up the WA and indirect batch buffers.
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, void *replay_state, u32 ring_size,
		       u16 msix_vec,
		       u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags =
XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT | 1466 XE_BO_FLAG_GGTT_INVALIDATE; 1467 1468 if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */ 1469 bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM; 1470 1471 lrc->bo = xe_bo_create_pin_map_novm(xe, tile, 1472 bo_size, 1473 ttm_bo_type_kernel, 1474 bo_flags, false); 1475 if (IS_ERR(lrc->bo)) 1476 return PTR_ERR(lrc->bo); 1477 1478 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt, 1479 hwe->fence_irq, hwe->name); 1480 1481 /* 1482 * Init Per-Process of HW status Page, LRC / context state to known 1483 * values. If there's already a primed default_lrc, just copy it, otherwise 1484 * it's the early submission to record the lrc: build a new empty one from 1485 * scratch. 1486 */ 1487 map = __xe_lrc_pphwsp_map(lrc); 1488 if (gt->default_lrc[hwe->class] || replay_state) { 1489 xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */ 1490 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE, 1491 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE, 1492 lrc_size - LRC_PPHWSP_SIZE); 1493 if (replay_state) 1494 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE, 1495 replay_state, lrc->replay_size); 1496 } else { 1497 void *init_data = empty_lrc_data(hwe); 1498 1499 if (!init_data) { 1500 err = -ENOMEM; 1501 goto err_lrc_finish; 1502 } 1503 1504 xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size); 1505 kfree(init_data); 1506 } 1507 1508 if (vm) { 1509 xe_lrc_set_ppgtt(lrc, vm); 1510 1511 if (vm->xef) 1512 xe_drm_client_add_bo(vm->xef->client, lrc->bo); 1513 } 1514 1515 if (xe_device_has_msix(xe)) { 1516 xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR, 1517 xe_memirq_status_ptr(&tile->memirq, hwe)); 1518 xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR, 1519 xe_memirq_source_ptr(&tile->memirq, hwe)); 1520 xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec); 1521 } 1522 1523 if (xe_gt_has_indirect_ring_state(gt)) { 1524 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE, 1525 
__xe_lrc_indirect_ring_ggtt_addr(lrc)); 1526 1527 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START, 1528 __xe_lrc_ring_ggtt_addr(lrc)); 1529 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0); 1530 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0); 1531 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail); 1532 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL, 1533 RING_CTL_SIZE(lrc->ring.size) | RING_VALID); 1534 } else { 1535 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc)); 1536 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0); 1537 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail); 1538 xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL, 1539 RING_CTL_SIZE(lrc->ring.size) | RING_VALID); 1540 } 1541 1542 if (init_flags & XE_LRC_CREATE_RUNALONE) 1543 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL, 1544 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1545 _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE)); 1546 1547 if (init_flags & XE_LRC_CREATE_PXP) 1548 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL, 1549 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1550 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE)); 1551 1552 lrc->ctx_timestamp = 0; 1553 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0); 1554 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 1555 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0); 1556 1557 if (xe->info.has_asid && vm) 1558 xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid); 1559 1560 lrc->desc = LRC_VALID; 1561 lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT); 1562 /* TODO: Priority */ 1563 1564 /* While this appears to have something about privileged batches or 1565 * some such, it really just means PPGTT mode. 
1566 */ 1567 if (vm) 1568 lrc->desc |= LRC_PRIVILEGE; 1569 1570 if (GRAPHICS_VERx100(xe) < 1250) { 1571 lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance); 1572 lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class); 1573 } 1574 1575 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1576 xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable)); 1577 1578 map = __xe_lrc_seqno_map(lrc); 1579 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1580 1581 map = __xe_lrc_start_seqno_map(lrc); 1582 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1583 1584 err = setup_wa_bb(lrc, hwe); 1585 if (err) 1586 goto err_lrc_finish; 1587 1588 err = setup_indirect_ctx(lrc, hwe); 1589 if (err) 1590 goto err_lrc_finish; 1591 1592 return 0; 1593 1594 err_lrc_finish: 1595 xe_lrc_finish(lrc); 1596 return err; 1597 } 1598 1599 /** 1600 * xe_lrc_create - Create a LRC 1601 * @hwe: Hardware Engine 1602 * @vm: The VM (address space) 1603 * @replay_state: GPU hang replay state 1604 * @ring_size: LRC ring size 1605 * @msix_vec: MSI-X interrupt vector (for platforms that support it) 1606 * @flags: LRC initialization flags 1607 * 1608 * Allocate and initialize the Logical Ring Context (LRC). 1609 * 1610 * Return pointer to created LRC upon success and an error pointer 1611 * upon failure. 1612 */ 1613 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, 1614 void *replay_state, u32 ring_size, u16 msix_vec, u32 flags) 1615 { 1616 struct xe_lrc *lrc; 1617 int err; 1618 1619 lrc = kzalloc(sizeof(*lrc), GFP_KERNEL); 1620 if (!lrc) 1621 return ERR_PTR(-ENOMEM); 1622 1623 err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags); 1624 if (err) { 1625 kfree(lrc); 1626 return ERR_PTR(err); 1627 } 1628 1629 return lrc; 1630 } 1631 1632 /** 1633 * xe_lrc_destroy - Destroy the LRC 1634 * @ref: reference to LRC 1635 * 1636 * Called when ref == 0, release resources held by the Logical Ring Context 1637 * (LRC) and free the LRC memory. 
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

/**
 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
 * @lrc: the &xe_lrc struct instance
 */
void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	}
}

/* Write the ring tail, honouring indirect ring state when present. */
void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

/* Read the ring tail, masked to the tail-address bits. */
u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

/* Read the ring start address programmed into the context state. */
static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

/* Write the ring head, honouring indirect ring state when present. */
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

/* Read the ring head, masked to the head-address bits. */
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

/*
 * Bytes available for writing between tail and head, computed modulo the
 * ring size (the mask arithmetic assumes ring.size is a power of two).
 */
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

/*
 * Copy @size bytes at the current software tail and advance it (modulo ring
 * size).  The copy itself does not wrap; callers split at the wrap point.
 */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

/*
 * Copy commands into the ring, splitting the copy at the ring wrap point.
 * @size must be dword-aligned; an odd number of dwords is padded with one
 * MI_NOOP so the tail stays 8-byte aligned.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;	/* bytes until ring end */
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

/* Combine the descriptor flags with the context's GGTT address. */
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

/* GGTT address of the seqno slot in the PPHWSP. */
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

/* Read the current seqno from the LRC's seqno slot. */
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

/* Read the start seqno from the LRC's start-seqno slot. */
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

/* GGTT address of the start-seqno slot. */
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

/* GGTT address of the parallel-submission scratch area. */
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

/* CPU mapping of the parallel-submission scratch area. */
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: context id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

/* Decode an instruction header into its total length in dwords. */
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

/*
 * Pretty-print one MI_* command at @dw and return the number of dwords
 * consumed (clamped to @remaining_dw when the decoded length overruns).
 */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/* Collapse a run of consecutive NOOPs into one output line */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		/* Payload is (offset, value) pairs */
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}

/*
 * Pretty-print one GFXPIPE command at @dw and return the number of dwords
 * consumed (clamped to @remaining_dw when the decoded length overruns).
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
1942 */ 1943 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1944 numdw = remaining_dw; 1945 1946 switch (*dw & GFXPIPE_MATCH_MASK) { 1947 #define MATCH(cmd) \ 1948 case cmd: \ 1949 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ 1950 return numdw 1951 #define MATCH3D(cmd) \ 1952 case CMD_##cmd: \ 1953 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ 1954 return numdw 1955 1956 MATCH(STATE_BASE_ADDRESS); 1957 MATCH(STATE_SIP); 1958 MATCH(GPGPU_CSR_BASE_ADDRESS); 1959 MATCH(STATE_COMPUTE_MODE); 1960 MATCH3D(3DSTATE_BTD); 1961 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS); 1962 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS); 1963 1964 MATCH3D(3DSTATE_VF_STATISTICS); 1965 1966 MATCH(PIPELINE_SELECT); 1967 1968 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST); 1969 MATCH3D(3DSTATE_CLEAR_PARAMS); 1970 MATCH3D(3DSTATE_DEPTH_BUFFER); 1971 MATCH3D(3DSTATE_STENCIL_BUFFER); 1972 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER); 1973 MATCH3D(3DSTATE_VERTEX_BUFFERS); 1974 MATCH3D(3DSTATE_VERTEX_ELEMENTS); 1975 MATCH3D(3DSTATE_INDEX_BUFFER); 1976 MATCH3D(3DSTATE_VF); 1977 MATCH3D(3DSTATE_MULTISAMPLE); 1978 MATCH3D(3DSTATE_CC_STATE_POINTERS); 1979 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS); 1980 MATCH3D(3DSTATE_VS); 1981 MATCH3D(3DSTATE_GS); 1982 MATCH3D(3DSTATE_CLIP); 1983 MATCH3D(3DSTATE_SF); 1984 MATCH3D(3DSTATE_WM); 1985 MATCH3D(3DSTATE_CONSTANT_VS); 1986 MATCH3D(3DSTATE_CONSTANT_GS); 1987 MATCH3D(3DSTATE_CONSTANT_PS); 1988 MATCH3D(3DSTATE_SAMPLE_MASK); 1989 MATCH3D(3DSTATE_CONSTANT_HS); 1990 MATCH3D(3DSTATE_CONSTANT_DS); 1991 MATCH3D(3DSTATE_HS); 1992 MATCH3D(3DSTATE_TE); 1993 MATCH3D(3DSTATE_DS); 1994 MATCH3D(3DSTATE_STREAMOUT); 1995 MATCH3D(3DSTATE_SBE); 1996 MATCH3D(3DSTATE_PS); 1997 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP); 1998 MATCH3D(3DSTATE_CPS_POINTERS); 1999 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC); 2000 MATCH3D(3DSTATE_BLEND_STATE_POINTERS); 2001 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS); 2002 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS); 2003 
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS); 2004 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS); 2005 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS); 2006 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS); 2007 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS); 2008 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS); 2009 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS); 2010 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS); 2011 MATCH3D(3DSTATE_VF_INSTANCING); 2012 MATCH3D(3DSTATE_VF_SGVS); 2013 MATCH3D(3DSTATE_VF_TOPOLOGY); 2014 MATCH3D(3DSTATE_WM_CHROMAKEY); 2015 MATCH3D(3DSTATE_PS_BLEND); 2016 MATCH3D(3DSTATE_WM_DEPTH_STENCIL); 2017 MATCH3D(3DSTATE_PS_EXTRA); 2018 MATCH3D(3DSTATE_RASTER); 2019 MATCH3D(3DSTATE_SBE_SWIZ); 2020 MATCH3D(3DSTATE_WM_HZ_OP); 2021 MATCH3D(3DSTATE_VF_COMPONENT_PACKING); 2022 MATCH3D(3DSTATE_VF_SGVS_2); 2023 MATCH3D(3DSTATE_VFG); 2024 MATCH3D(3DSTATE_URB_ALLOC_VS); 2025 MATCH3D(3DSTATE_URB_ALLOC_HS); 2026 MATCH3D(3DSTATE_URB_ALLOC_DS); 2027 MATCH3D(3DSTATE_URB_ALLOC_GS); 2028 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0); 2029 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1); 2030 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2); 2031 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3); 2032 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION); 2033 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO); 2034 MATCH3D(3DSTATE_AMFS); 2035 MATCH3D(3DSTATE_DEPTH_BOUNDS); 2036 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS); 2037 MATCH3D(3DSTATE_CONSTANT_TS_POINTER); 2038 MATCH3D(3DSTATE_MESH_CONTROL); 2039 MATCH3D(3DSTATE_MESH_DISTRIB); 2040 MATCH3D(3DSTATE_TASK_REDISTRIB); 2041 MATCH3D(3DSTATE_MESH_SHADER); 2042 MATCH3D(3DSTATE_MESH_SHADER_DATA); 2043 MATCH3D(3DSTATE_TASK_CONTROL); 2044 MATCH3D(3DSTATE_TASK_SHADER); 2045 MATCH3D(3DSTATE_TASK_SHADER_DATA); 2046 MATCH3D(3DSTATE_URB_ALLOC_MESH); 2047 MATCH3D(3DSTATE_URB_ALLOC_TASK); 2048 MATCH3D(3DSTATE_CLIP_MESH); 2049 MATCH3D(3DSTATE_SBE_MESH); 2050 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER); 2051 MATCH3D(3DSTATE_COARSE_PIXEL); 2052 2053 MATCH3D(3DSTATE_DRAWING_RECTANGLE); 2054 MATCH3D(3DSTATE_CHROMA_KEY); 2055 
MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET); 2056 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN); 2057 MATCH3D(3DSTATE_LINE_STIPPLE); 2058 MATCH3D(3DSTATE_AA_LINE_PARAMETERS); 2059 MATCH3D(3DSTATE_MONOFILTER_SIZE); 2060 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS); 2061 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS); 2062 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS); 2063 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS); 2064 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS); 2065 MATCH3D(3DSTATE_SO_DECL_LIST); 2066 MATCH3D(3DSTATE_SO_BUFFER); 2067 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC); 2068 MATCH3D(3DSTATE_SAMPLE_PATTERN); 2069 MATCH3D(3DSTATE_3D_MODE); 2070 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE); 2071 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS); 2072 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO); 2073 2074 default: 2075 drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n", 2076 *dw, pipeline, opcode, subopcode, numdw); 2077 return numdw; 2078 } 2079 } 2080 2081 static int dump_gfx_state_command(struct drm_printer *p, 2082 struct xe_gt *gt, 2083 u32 *dw, 2084 int remaining_dw) 2085 { 2086 u32 numdw = instr_dw(*dw); 2087 u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw); 2088 2089 /* 2090 * Make sure we haven't mis-parsed a number of dwords that exceeds the 2091 * remaining size of the LRC. 
2092 */ 2093 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 2094 numdw = remaining_dw; 2095 2096 switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) { 2097 MATCH(STATE_WRITE_INLINE); 2098 2099 default: 2100 drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n", 2101 *dw, opcode, numdw); 2102 return numdw; 2103 } 2104 } 2105 2106 void xe_lrc_dump_default(struct drm_printer *p, 2107 struct xe_gt *gt, 2108 enum xe_engine_class hwe_class) 2109 { 2110 u32 *dw; 2111 int remaining_dw, num_dw; 2112 2113 if (!gt->default_lrc[hwe_class]) { 2114 drm_printf(p, "No default LRC for class %d\n", hwe_class); 2115 return; 2116 } 2117 2118 /* 2119 * Skip the beginning of the LRC since it contains the per-process 2120 * hardware status page. 2121 */ 2122 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE; 2123 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4; 2124 2125 while (remaining_dw > 0) { 2126 if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) { 2127 num_dw = dump_mi_command(p, gt, dw, remaining_dw); 2128 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) { 2129 num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw); 2130 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) { 2131 num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw); 2132 } else { 2133 num_dw = min(instr_dw(*dw), remaining_dw); 2134 drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n", 2135 *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw), 2136 num_dw); 2137 } 2138 2139 dw += num_dw; 2140 remaining_dw -= num_dw; 2141 } 2142 } 2143 2144 struct instr_state { 2145 u32 instr; 2146 u16 num_dw; 2147 }; 2148 2149 static const struct instr_state xe_hpg_svg_state[] = { 2150 { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 }, 2151 { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 }, 2152 { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 }, 2153 { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 }, 2154 { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, 
.num_dw = 69 }, 2155 { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 }, 2156 { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 }, 2157 { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 }, 2158 { .instr = CMD_3DSTATE_VS, .num_dw = 9 }, 2159 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 }, 2160 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 }, 2161 { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 }, 2162 { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 }, 2163 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 }, 2164 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 }, 2165 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 }, 2166 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 }, 2167 { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 }, 2168 { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 }, 2169 { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 }, 2170 { .instr = CMD_3DSTATE_SF, .num_dw = 4 }, 2171 { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 }, 2172 { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 }, 2173 { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 }, 2174 { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 }, 2175 { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 }, 2176 { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 }, 2177 { .instr = CMD_3DSTATE_HS, .num_dw = 9 }, 2178 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 }, 2179 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 }, 2180 { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 }, 2181 { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 }, 2182 { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 }, 2183 { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 }, 2184 { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 }, 2185 { .instr = CMD_3DSTATE_TE, .num_dw = 5 }, 2186 { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 }, 2187 { .instr = CMD_3DSTATE_DS, .num_dw = 11 }, 2188 { .instr = 
CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

/**
 * xe_lrc_emit_hwe_state_instructions() - Emit non-register engine state
 * @q: exec queue whose engine's state table should be emitted
 * @cs: command-stream buffer pointer to write into
 *
 * Emits the SVG (3DSTATE_*) GFXPIPE instruction headers required by
 * Wa_14019789679 for render engines.  Only the instruction header dword is
 * written for each entry; the remaining operand dwords are skipped over and
 * left with whatever the buffer already contains (zeros/NOOPs in a fresh
 * default LRC — see the workaround comment below).
 *
 * Return: pointer just past the last emitted instruction, or @cs unchanged
 * when the platform/engine needs no such state.
 */
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers.  Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited.  However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		/* Table entries must be GFXPIPE commands with a sane length */
		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		/* Header carries the (length - 2) in its low bits for multi-dw commands */
		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		cs += num_dw;
	}

	return cs;
}

/**
 * xe_lrc_snapshot_capture() - Capture a lightweight snapshot of LRC state
 * @lrc: Pointer to the lrc.
 *
 * Records the register-visible LRC state (descriptors, head/tail, seqnos,
 * timestamps) and takes a reference on the backing BO.  Allocates with
 * GFP_NOWAIT, so no sleeping allocation happens here; the bulk copy of the
 * BO contents is deferred to xe_lrc_snapshot_capture_delayed().
 *
 * Return: new snapshot on success, NULL on allocation failure.
 */
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	/* BO contents are filled in later by the delayed capture */
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

/**
 * xe_lrc_snapshot_capture_delayed() - Copy the LRC BO contents into a snapshot
 * @snapshot: snapshot from xe_lrc_snapshot_capture(), may be NULL
 *
 * Performs the sleeping part of the capture: kvmalloc's a buffer, vmaps the
 * BO, copies @snapshot->lrc_size bytes starting at @snapshot->lrc_offset,
 * then drops the BO reference taken at capture time.  On mapping or
 * allocation failure the snapshot is left without data
 * (->lrc_snapshot == NULL) but is otherwise still usable.
 */
void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	/* Take over the BO reference; it is dropped on all paths below */
	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		/* vmap failed: discard the buffer so printing skips the dump */
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}

/**
 * xe_lrc_snapshot_print() - Print a LRC snapshot to a drm_printer
 * @snapshot: snapshot to print, may be NULL
 * @p: destination printer
 *
 * Prints the captured register state, then — if the delayed capture
 * succeeded — dumps the PPHWSP and context image as ascii85-encoded data.
 */
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	/* No BO contents captured (delayed capture skipped or failed) */
	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset);
	drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size);

	drm_puts(p, "\t[HWCTX].data: ");
	/* i deliberately continues from the PPHWSP loop above */
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

/**
 * xe_lrc_snapshot_free() - Free a LRC snapshot and its captured data
 * @snapshot: snapshot to free, may be NULL
 *
 * Releases the captured data buffer, any still-held BO reference (present
 * when the delayed capture never ran), and the snapshot itself.
 */
void
xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) 2375 { 2376 if (!snapshot) 2377 return; 2378 2379 kvfree(snapshot->lrc_snapshot); 2380 if (snapshot->lrc_bo) 2381 xe_bo_put(snapshot->lrc_bo); 2382 2383 kfree(snapshot); 2384 } 2385 2386 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) 2387 { 2388 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); 2389 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); 2390 struct xe_hw_engine *hwe; 2391 u64 val; 2392 2393 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); 2394 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), 2395 "Unexpected engine class:instance %d:%d for context utilization\n", 2396 class, instance)) 2397 return -1; 2398 2399 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 2400 val = xe_mmio_read64_2x32(&hwe->gt->mmio, 2401 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2402 else 2403 val = xe_mmio_read32(&hwe->gt->mmio, 2404 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2405 2406 *reg_ctx_ts = val; 2407 2408 return 0; 2409 } 2410 2411 /** 2412 * xe_lrc_timestamp() - Current ctx timestamp 2413 * @lrc: Pointer to the lrc. 2414 * 2415 * Return latest ctx timestamp. With support for active contexts, the 2416 * calculation may bb slightly racy, so follow a read-again logic to ensure that 2417 * the context is still active before returning the right timestamp. 
2418 * 2419 * Returns: New ctx timestamp value 2420 */ 2421 u64 xe_lrc_timestamp(struct xe_lrc *lrc) 2422 { 2423 u64 lrc_ts, reg_ts, new_ts; 2424 u32 engine_id; 2425 2426 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2427 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 2428 if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 2429 new_ts = lrc_ts; 2430 goto done; 2431 } 2432 2433 if (lrc_ts == CONTEXT_ACTIVE) { 2434 engine_id = xe_lrc_engine_id(lrc); 2435 if (!get_ctx_timestamp(lrc, engine_id, ®_ts)) 2436 new_ts = reg_ts; 2437 2438 /* read lrc again to ensure context is still active */ 2439 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2440 } 2441 2442 /* 2443 * If context switched out, just use the lrc_ts. Note that this needs to 2444 * be a separate if condition. 2445 */ 2446 if (lrc_ts != CONTEXT_ACTIVE) 2447 new_ts = lrc_ts; 2448 2449 done: 2450 return new_ts; 2451 } 2452 2453 /** 2454 * xe_lrc_update_timestamp() - Update ctx timestamp 2455 * @lrc: Pointer to the lrc. 2456 * @old_ts: Old timestamp value 2457 * 2458 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 2459 * update saved value. 2460 * 2461 * Returns: New ctx timestamp value 2462 */ 2463 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 2464 { 2465 *old_ts = lrc->ctx_timestamp; 2466 lrc->ctx_timestamp = xe_lrc_timestamp(lrc); 2467 2468 trace_xe_lrc_update_timestamp(lrc, *old_ts); 2469 2470 return lrc->ctx_timestamp; 2471 } 2472 2473 /** 2474 * xe_lrc_ring_is_idle() - LRC is idle 2475 * @lrc: Pointer to the lrc. 2476 * 2477 * Compare LRC ring head and tail to determine if idle. 2478 * 2479 * Return: True is ring is idle, False otherwise 2480 */ 2481 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc) 2482 { 2483 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc); 2484 } 2485