// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>
#include <linux/panic.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_CTX_BO_SIZE		SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

#define LRC_PRIORITY				GENMASK_ULL(10, 9)
#define LRC_PRIORITY_LOW			0
#define LRC_PRIORITY_NORMAL			1
#define LRC_PRIORITY_HIGH			2

/*
 * Layout of the LRC and associated data allocated as
 * lrc->bo:
 *
 * Region                       Size
 * +============================+=================================+ <- __xe_lrc_ring_offset()
 * | Ring                       | ring_size, see                  |
 * |                            | xe_lrc_init()                   |
 * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
 * | PPHWSP (includes SW state) | 4K                              |
 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
 * | Engine Context Image       | n * 4K, see                     |
 * |                            | xe_gt_lrc_size()                |
 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
 * | Indirect Ring State Page   | 0 or 4K, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
 * | Indirect Context Page      | 0 or 4K, see                    |
 * |                            | XE_LRC_FLAG_INDIRECT_CTX        |
 * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
 * | WA BB Per Ctx              | 4K                              |
 * +============================+=================================+ <- xe_bo_size(lrc->bo)
 */
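/*
 * Worked example of the layout above (illustrative numbers only, not taken
 * from real hardware): assume a 16K ring, a 13-page engine context image
 * and a platform with both optional pages present. The offset helpers
 * defined later in this file then resolve to:
 *
 *	__xe_lrc_ring_offset()          = 0
 *	__xe_lrc_pphwsp_offset()        = 16K (= ring_size)
 *	__xe_lrc_regs_offset()          = 16K + 4K
 *	__xe_lrc_wa_bb_offset()         = bo_size - 4K
 *	__xe_lrc_indirect_ctx_offset()  = bo_size - 4K - 4K
 *	__xe_lrc_indirect_ring_offset() = bo_size - 4K - 4K - 4K
 *
 * Note the regions at the end are computed backwards from xe_bo_size() so
 * that optional pages can be dropped without shifting the regions before
 * them.
 */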
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

static bool
gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);

	if (XE_GT_WA(gt, 16010904313) &&
	    (class == XE_ENGINE_CLASS_RENDER ||
	     class == XE_ENGINE_CLASS_COMPUTE))
		return true;

	if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
					       class, NULL))
		return true;

	return false;
}

/**
 * xe_gt_lrc_hang_replay_size() - Hang replay size
 * @gt: The GT
 * @class: Hardware engine class
 *
 * Determine size of GPU hang replay state for a GT and hardware engine class.
 *
 * Return: Size of the GPU hang replay state in bytes
 */
size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size = 0;

	/* Engine context image */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VERx100(xe) >= 3510)
			size += 7 * SZ_4K;
		else if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VERx100(xe) >= 3510)
			size += 5 * SZ_4K;
		else if (GRAPHICS_VER(xe) >= 20)
			size += 2 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	return size;
}

size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	size_t size = xe_gt_lrc_hang_replay_size(gt, class);

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size + LRC_PPHWSP_SIZE;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
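/*
 * A minimal worked decode, to make the encoding above concrete (values
 * derived purely from the macro definitions below):
 *
 *	NOP(1)         = 0x81	-> skip one dword in @regs
 *	LRI(2, POSTED) = 0x42	-> emit MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2)
 *				   with MI_LRI_FORCE_POSTED set
 *	REG(0x034)     = 0x0d	-> regs[0] = base + (0x0d << 2) = base + 0x034
 *	REG16(0x2b4)   = 0x81, 0x2d
 *				-> offset = (0x01 << 7) | 0x2d = 0xad, so
 *				   regs[0] = base + (0xad << 2) = base + 0x2b4
 *
 * Each register consumes two dwords in @regs (offset plus value slot); only
 * the offset is written here, the value is filled out later.
 */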
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}
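/*
 * Context control is a masked register: the upper 16 bits select which of
 * the lower 16 bits a write actually touches. Conceptually (FOO being any
 * bit in the low 16 bits):
 *
 *	_MASKED_BIT_ENABLE(FOO)  == (FOO << 16) | FOO	-> set FOO
 *	_MASKED_BIT_DISABLE(FOO) == (FOO << 16)		-> clear FOO
 *
 * This is why set_context_control() above can OR several enables together
 * without first reading the register back.
 */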
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

#define LRC_CTX_JOB_TIMESTAMP_OFFSET 512
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

#define LRC_SEQNO_OFFSET 0
#define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8)

u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

/**
 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
 * @xe: the &xe_device struct instance
 *
 * Returns: Size of the LRC registers area for current platform
 */
size_t xe_lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_SEQNO_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	return LRC_START_SEQNO_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}
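/*
 * Sketch of the driver-defined portion of the PPHWSP implied by the offsets
 * above (offsets relative to xe_lrc_pphwsp_offset(); the placement is this
 * driver's convention, not hardware-mandated):
 *
 *	+0x200 (512)	ctx job timestamp	(LRC_CTX_JOB_TIMESTAMP_OFFSET)
 *	+0x400 (1024)	engine id		(LRC_ENGINE_ID_PPHWSP_OFFSET)
 *	+0x800 (2048)	parallel scratch	(LRC_PARALLEL_PPHWSP_OFFSET)
 *
 * The seqno and start seqno live at offsets 0 and 8 of the separate
 * lrc->seqno_bo rather than in the PPHWSP; see the DECL_MAP_ADDR_HELPERS
 * users below.
 */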
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
		     LRC_INDIRECT_RING_STATE_SIZE;

	if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
		offset -= LRC_INDIRECT_CTX_BO_SIZE;

	return offset;
}

static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
}

static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}

#define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
	struct iosys_map map = bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	struct xe_bo *bo = (bo_expr); \
\
	return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo)
DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(regs, lrc->bo)
DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo)
DECL_MAP_ADDR_HELPERS(parallel, lrc->bo)
DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo)
DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo)

#undef DECL_MAP_ADDR_HELPERS

/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}
/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx job timestamp value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

/**
 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC
 * of given engine.
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}
/**
 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
 * for given LRC.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @regs: scratch buffer to be used as temporary storage
 */
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
					    u32 *regs)
{
	struct xe_gt *gt = hwe->gt;
	struct iosys_map map;
	size_t regs_len;

	if (!xe_device_uses_memirq(gt_to_xe(gt)))
		return;

	map = __xe_lrc_regs_map(lrc);
	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
	set_memory_based_intr(regs, hwe);
	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
	xe_bo_unpin_map_no_vm(lrc->seqno_bo);
}

/*
 * setup_utilization_wa() - Write commands to wa bb to assist
 * in calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
 */
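/*
 * A sketch of the query side implied by the comment above (illustrative
 * pseudo-flow, not a function in this file): the reader combines the saved
 * CTX_TIMESTAMP with the live MMIO value only while the sentinel says the
 * context is still on the hardware.
 *
 *	u64 ts = xe_lrc_ctx_timestamp(lrc);
 *
 *	if (ts == CONTEXT_ACTIVE) {
 *		// Context is (or was very recently) running: sample the
 *		// engine's CTX_TIMESTAMP MMIO, using the engine id that the
 *		// WA BB stored in the PPHWSP to pick the right instance.
 *	} else {
 *		// Context switched out: 'ts' already holds the accumulated
 *		// run ticks written back by the context save.
 *	}
 */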
#define CONTEXT_ACTIVE 1ULL
static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
				    struct xe_hw_engine *hwe,
				    u32 *batch,
				    size_t max_len)
{
	u32 *cmd = batch;

	if (IS_SRIOV_VF(gt_to_xe(lrc->gt)))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	return cmd - batch;
}

static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				  u32 *batch, size_t max_len)
{
	const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 16010904313) ||
	    !(hwe->class == XE_ENGINE_CLASS_RENDER ||
	      hwe->class == XE_ENGINE_CLASS_COMPUTE ||
	      hwe->class == XE_ENGINE_CLASS_COPY ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
	      hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
		 MI_LRM_ASYNC;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	*cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
	*cmd++ = RING_CTX_TIMESTAMP(0).addr;
	*cmd++ = ts_addr;
	*cmd++ = 0;

	return cmd - batch;
}

static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
						  struct xe_hw_engine *hwe,
						  u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
						    hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches.
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}
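/*
 * Example of the kind of user batch the configfs hooks accept: a tester
 * could stage a few dwords that the engine executes as part of context
 * restore, e.g. a single-register MI_LOAD_REGISTER_IMM:
 *
 *	u32 batch[] = {
 *		MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1),
 *		0x2248,		// hypothetical engine-relative register offset
 *		0xdeadbeef,	// value to load
 *	};
 *
 * The register offset and value above are placeholders, not a
 * recommendation, and the exact configfs interface is defined in
 * xe_configfs.c. Anything injected this way runs with full engine
 * privileges, which is why the kernel is tainted.
 */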
static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
						 struct xe_hw_engine *hwe,
						 u32 *batch, size_t max_len)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);
	const u32 *user_batch;
	u32 *cmd = batch;
	u32 count;

	count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
						   hwe->class, &user_batch);
	if (!count)
		return 0;

	if (count > max_len)
		return -ENOSPC;

	/*
	 * This should be used only for tests and validation. Taint the kernel
	 * as anything could be submitted directly in context switches.
	 */
	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);

	memcpy(cmd, user_batch, count * sizeof(u32));
	cmd += count;

	return cmd - batch;
}

static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
					       struct xe_hw_engine *hwe,
					       u32 *batch, size_t max_len)
{
	u32 *cmd = batch;

	if (!XE_GT_WA(lrc->gt, 18022495364) ||
	    hwe->class != XE_ENGINE_CLASS_RENDER)
		return 0;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
		return -ENOSPC;

	*cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
	*cmd++ = CS_DEBUG_MODE2(0).addr;
	*cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);

	return cmd - batch;
}

struct bo_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};

struct bo_setup_state {
	/* Input: */
	struct xe_lrc *lrc;
	struct xe_hw_engine *hwe;
	size_t max_size;
	size_t reserve_dw;
	unsigned int offset;
	const struct bo_setup *funcs;
	unsigned int num_funcs;

	/* State: */
	u32 *buffer;
	u32 *ptr;
	unsigned int written;
};

static int setup_bo(struct bo_setup_state *state)
{
	ssize_t remain;

	if (state->lrc->bo->vmap.is_iomem) {
		xe_gt_assert(state->hwe->gt, state->buffer);
		state->ptr = state->buffer;
	} else {
		state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
	}

	remain = state->max_size / sizeof(u32);

	for (size_t i = 0; i < state->num_funcs; i++) {
		ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
						    state->ptr, remain);

		remain -= len;

		/*
		 * Caller has asked for at least reserve_dw to remain unused.
		 */
		if (len < 0 ||
		    xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
			goto fail;

		state->ptr += len;
		state->written += len;
	}

	return 0;

fail:
	return -ENOSPC;
}

static void finish_bo(struct bo_setup_state *state)
{
	if (!state->lrc->bo->vmap.is_iomem)
		return;

	xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
			 state->offset, state->buffer,
			 state->written * sizeof(u32));
}
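/*
 * Adding a new emitter to this machinery only requires following the
 * bo_setup contract: return the number of dwords written, 0 to skip, or a
 * negative errno. A hypothetical example (the name and NOP payload are
 * illustrative, not an existing workaround):
 *
 *	static ssize_t setup_example_wa(struct xe_lrc *lrc,
 *					struct xe_hw_engine *hwe,
 *					u32 *batch, size_t max_len)
 *	{
 *		u32 *cmd = batch;
 *
 *		if (xe_gt_WARN_ON(lrc->gt, max_len < 1))
 *			return -ENOSPC;
 *
 *		*cmd++ = MI_NOOP;
 *
 *		return cmd - batch;
 *	}
 *
 * It would then be added to the funcs[] table of the relevant caller below;
 * setup_bo() handles bounds checking, the iomem bounce buffer and the final
 * copy via finish_bo().
 */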
/**
 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @scratch: preallocated scratch buffer for temporary storage
 *
 * Return: 0 on success, negative error code on failure
 */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		.reserve_dw = 1,
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}

static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	u32 *buf = NULL;
	int ret;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);

	kfree(buf);

	return ret;
}
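/*
 * Note on the "+ 1" in xe_lrc_setup_wa_bb_with_scratch() above:
 * BB_PER_CTX_PTR takes an aligned GGTT address in its upper bits, with
 * bit 0 acting as the "valid" flag that tells the CS there is a
 * per-context batch to run. Since the WA BB offset is page aligned,
 * adding 1 sets only that flag:
 *
 *	addr = xe_bo_ggtt_addr(lrc->bo) + state.offset;	// 4K aligned
 *	reg  = addr | 1;				// mark valid
 */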
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough.
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}

static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}

/**
 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
 * @lrc: Logical Ring Context
 * @priority: Multi queue priority of the exec queue
 *
 * Convert @priority to LRC multi queue priority and update the @lrc descriptor.
 */
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	lrc->desc &= ~LRC_PRIORITY;
	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}

static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
			   void *replay_state, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	int err;

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If a primed default_lrc already exists, just copy it;
	 * otherwise this is the early submission that records the LRC, so
	 * build a new empty one from scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc->size - LRC_PPHWSP_SIZE);
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data)
			return -ENOMEM;

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
		kfree(init_data);
	}

	if (vm)
		xe_lrc_set_ppgtt(lrc, vm);

	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);

		/* Match head and tail pointers */
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));

		/* Match head and tail pointers */
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/*
	 * While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		return err;

	err = setup_indirect_ctx(lrc, hwe);

	return err;
}
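/*
 * Worked example of the descriptor assembled above (field values from the
 * LRC_* definitions at the top of this file; the GGTT address is made up):
 *
 *	LRC_VALID (bit 0)			= 1
 *	LRC_ADDRESSING_MODE (bits 4:3)		= 3	-> legacy 64b PPGTT
 *	LRC_PRIVILEGE (bit 8)			= 1 when a VM is set
 *
 * so for a PPGTT context the low dword starts out as 0x119, and
 * xe_lrc_descriptor() later ORs in xe_lrc_ggtt_addr(lrc), e.g.:
 *
 *	0x119ULL | 0x00182000 = 0x00182119
 *
 * On pre-12.50 platforms the engine class and instance are additionally
 * packed into bits 63:61 and 53:48.
 */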
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_bo *bo;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX)	/* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags, false);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	lrc->bo = bo;

	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE |
				       XE_BO_FLAG_SYSTEM, false);
	if (IS_ERR(bo)) {
		err = PTR_ERR(bo);
		goto err_lrc_finish;
	}
	lrc->seqno_bo = bo;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
	if (err)
		goto err_lrc_finish;

	if (vm && vm->xef)
		xe_drm_client_add_bo(vm->xef->client, lrc->bo);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
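/*
 * Worked sizing example for the allocation above (numbers illustrative):
 * with a 16K ring and an engine that also needs the indirect context page,
 *
 *	bo_size = 16K (ring)
 *	        + lrc_size (PPHWSP + context image [+ indirect ring state])
 *	        + 4K (LRC_WA_BB_SIZE)
 *	        + 4K (LRC_INDIRECT_CTX_BO_SIZE, only when
 *	              XE_LRC_FLAG_INDIRECT_CTX is set)
 *
 * which matches the layout diagram at the top of this file: the tail
 * regions are located by subtracting fixed sizes from xe_bo_size(lrc->bo).
 */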
/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @replay_state: GPU hang replay state
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return: Pointer to created LRC upon success and an error pointer
 * upon failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     void *replay_state, u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc_obj(*lrc);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

/**
 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
 * @lrc: the &xe_lrc struct instance
 */
void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	}
}

void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}
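/*
 * The ring-space expression above relies on size being a power of two.
 * Worked example with size = 16K (illustrative values only):
 *
 *	head == tail (empty ring):
 *		((head - head - 1) & 0x3fff) + 1 = 0x3fff + 1 = 16K
 *	tail 4 bytes ahead of head:
 *		((head - head - 4 - 1) & 0x3fff) + 1 = 0x3ffb + 1 = 16K - 4
 *
 * The "- 1 ... + 1" dance makes the wrap-around arithmetic come out right
 * at the empty boundary, where a plain (head - tail) & (size - 1) would
 * report 0 instead of a completely free ring.
 */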
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}
/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: engine id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}
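/*
 * Worked example of the header decode done by instr_dw() and
 * dump_mi_command(): for the dword 0x11000001, an MI_LOAD_REGISTER_IMM for
 * one register (value picked for illustration):
 *
 *	bits 31:29 = 0x0  -> XE_INSTR_MI command type
 *	bits 28:23 = 0x22 -> MI_LOAD_REGISTER_IMM opcode
 *	bits  7:0  = 0x01 -> length field, so 0x01 + 2 = 3 dwords total
 *
 * i.e. one header dword followed by one offset/value pair, which is exactly
 * what the "(numdw - 1) / 2" register count in dump_mi_command() computes.
 */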
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}
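/*
 * Sketch of how a caller could drive the dump above, e.g. from a debugfs
 * show handler (hypothetical wiring; the real consumers live elsewhere in
 * the driver):
 *
 *	struct drm_printer p = drm_seq_file_printer(m);
 *
 *	xe_lrc_dump_default(&p, gt, XE_ENGINE_CLASS_RENDER);
 */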
/*
 * Lookup the value of a register within the offset/value pairs of an
 * MI_LOAD_REGISTER_IMM instruction.
 *
 * Return -ENOENT if the register is not present in the MI_LRI instruction.
 */
static int lookup_reg_in_mi_lri(u32 offset, u32 *value,
				const u32 *dword_pair, int num_regs)
{
	for (int i = 0; i < num_regs; i++) {
		if (dword_pair[2 * i] == offset) {
			*value = dword_pair[2 * i + 1];
			return 0;
		}
	}

	return -ENOENT;
}

/*
 * Lookup the value of a register in a specific engine type's default LRC.
 *
 * Return -EINVAL if the default LRC doesn't exist, or -ENOENT if the register
 * cannot be found in the default LRC.
 */
int xe_lrc_lookup_default_reg_value(struct xe_gt *gt,
				    enum xe_engine_class hwe_class,
				    u32 offset,
				    u32 *value)
{
	u32 *dw;
	int remaining_dw, ret;

	if (!gt->default_lrc[hwe_class])
		return -EINVAL;

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		u32 num_dw = instr_dw(*dw);

		if (num_dw > remaining_dw)
			num_dw = remaining_dw;

		switch (*dw & XE_INSTR_CMD_TYPE) {
		case XE_INSTR_MI:
			switch (*dw & MI_OPCODE) {
			case MI_BATCH_BUFFER_END:
				/* End of LRC; register not found */
				return -ENOENT;

			case MI_NOOP:
			case MI_TOPOLOGY_FILTER:
				/*
				 * MI_NOOP and MI_TOPOLOGY_FILTER don't have
				 * a length field and are always 1-dword
				 * instructions.
				 */
				remaining_dw--;
				dw++;
				break;

			case MI_LOAD_REGISTER_IMM:
				ret = lookup_reg_in_mi_lri(offset, value,
							   dw + 1, (num_dw - 1) / 2);
				if (ret == 0)
					return 0;

				fallthrough;

			default:
				/*
				 * Jump to next instruction based on length
				 * field.
				 */
				remaining_dw -= num_dw;
				dw += num_dw;
				break;
			}
			break;

		default:
			/* Jump to next instruction based on length field. */
			remaining_dw -= num_dw;
			dw += num_dw;
		}
	}

	return -ENOENT;
}
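/*
 * Example use of the lookup above (hypothetical register offset; callers
 * pass the offset exactly as it appears in the default LRC's MI_LRI
 * packets):
 *
 *	u32 val;
 *
 *	if (!xe_lrc_lookup_default_reg_value(gt, XE_ENGINE_CLASS_RENDER,
 *					     0x2244, &val))
 *		xe_gt_dbg(gt, "default value: %#010x\n", val);
 */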
struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
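/*
 * Each table entry above becomes one instruction header in the LRC with a
 * zeroed payload: xe_lrc_emit_hwe_state_instructions() below writes only the
 * header dword (with the length field set to num_dw - 2 for multi-dword
 * instructions) and then skips num_dw dwords, leaving the body as zeros.
 * For example, the { CMD_3DSTATE_VF_SGVS, .num_dw = 2 } entry turns into the
 * CMD_3DSTATE_VF_SGVS header with length 0, followed by one zero dword.
 */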
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers. Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		cs += num_dw;
	}

	return cs;
}
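/*
 * Callers treat the returned pointer as the new command-stream write
 * position, e.g. (sketch):
 *
 *	cs = xe_lrc_emit_hwe_state_instructions(q, cs);
 *
 * so anything emitted afterwards lands beyond the zero-filled SVG block.
 */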
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->replay_offset = 0;
	snapshot->replay_size = lrc->replay_size;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}
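/*
 * Typical snapshot lifecycle (a sketch; in practice the devcoredump code
 * drives these steps):
 *
 *	snapshot = xe_lrc_snapshot_capture(lrc);	// atomic-safe, GFP_NOWAIT
 *	xe_lrc_snapshot_capture_delayed(snapshot);	// worker context, may sleep
 *	xe_lrc_snapshot_print(snapshot, &p);
 *	xe_lrc_snapshot_free(snapshot);
 */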
"\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset); 2503 drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size); 2504 2505 drm_puts(p, "\t[HWCTX].data: "); 2506 for (; i < snapshot->lrc_size; i += sizeof(u32)) { 2507 u32 *val = snapshot->lrc_snapshot + i; 2508 char dumped[ASCII85_BUFSZ]; 2509 2510 drm_puts(p, ascii85_encode(*val, dumped)); 2511 } 2512 drm_puts(p, "\n"); 2513 } 2514 2515 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) 2516 { 2517 if (!snapshot) 2518 return; 2519 2520 kvfree(snapshot->lrc_snapshot); 2521 if (snapshot->lrc_bo) 2522 xe_bo_put(snapshot->lrc_bo); 2523 2524 kfree(snapshot); 2525 } 2526 2527 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) 2528 { 2529 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); 2530 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); 2531 struct xe_hw_engine *hwe; 2532 u64 val; 2533 2534 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); 2535 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), 2536 "Unexpected engine class:instance %d:%d for context utilization\n", 2537 class, instance)) 2538 return -1; 2539 2540 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 2541 val = xe_mmio_read64_2x32(&hwe->gt->mmio, 2542 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2543 else 2544 val = xe_mmio_read32(&hwe->gt->mmio, 2545 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2546 2547 *reg_ctx_ts = val; 2548 2549 return 0; 2550 } 2551 2552 /** 2553 * xe_lrc_timestamp() - Current ctx timestamp 2554 * @lrc: Pointer to the lrc. 2555 * 2556 * Return latest ctx timestamp. With support for active contexts, the 2557 * calculation may bb slightly racy, so follow a read-again logic to ensure that 2558 * the context is still active before returning the right timestamp. 2559 * 2560 * Returns: New ctx timestamp value 2561 */ 2562 u64 xe_lrc_timestamp(struct xe_lrc *lrc) 2563 { 2564 u64 lrc_ts, reg_ts, new_ts; 2565 u32 engine_id; 2566 2567 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2568 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 2569 if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 2570 new_ts = lrc_ts; 2571 goto done; 2572 } 2573 2574 if (lrc_ts == CONTEXT_ACTIVE) { 2575 engine_id = xe_lrc_engine_id(lrc); 2576 if (!get_ctx_timestamp(lrc, engine_id, ®_ts)) 2577 new_ts = reg_ts; 2578 2579 /* read lrc again to ensure context is still active */ 2580 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2581 } 2582 2583 /* 2584 * If context switched out, just use the lrc_ts. Note that this needs to 2585 * be a separate if condition. 2586 */ 2587 if (lrc_ts != CONTEXT_ACTIVE) 2588 new_ts = lrc_ts; 2589 2590 done: 2591 return new_ts; 2592 } 2593 2594 /** 2595 * xe_lrc_update_timestamp() - Update ctx timestamp 2596 * @lrc: Pointer to the lrc. 2597 * @old_ts: Old timestamp value 2598 * 2599 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 2600 * update saved value. 2601 * 2602 * Returns: New ctx timestamp value 2603 */ 2604 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 2605 { 2606 *old_ts = lrc->ctx_timestamp; 2607 lrc->ctx_timestamp = xe_lrc_timestamp(lrc); 2608 2609 trace_xe_lrc_update_timestamp(lrc, *old_ts); 2610 2611 return lrc->ctx_timestamp; 2612 } 2613 2614 /** 2615 * xe_lrc_ring_is_idle() - LRC is idle 2616 * @lrc: Pointer to the lrc. 2617 * 2618 * Compare LRC ring head and tail to determine if idle. 
/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}