1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2021 Intel Corporation 4 */ 5 6 #include "xe_lrc.h" 7 8 #include <generated/xe_wa_oob.h> 9 10 #include <linux/ascii85.h> 11 #include <linux/panic.h> 12 13 #include "instructions/xe_mi_commands.h" 14 #include "instructions/xe_gfxpipe_commands.h" 15 #include "instructions/xe_gfx_state_commands.h" 16 #include "regs/xe_engine_regs.h" 17 #include "regs/xe_gt_regs.h" 18 #include "regs/xe_lrc_layout.h" 19 #include "xe_bb.h" 20 #include "xe_bo.h" 21 #include "xe_configfs.h" 22 #include "xe_device.h" 23 #include "xe_drm_client.h" 24 #include "xe_exec_queue_types.h" 25 #include "xe_gt.h" 26 #include "xe_gt_printk.h" 27 #include "xe_hw_fence.h" 28 #include "xe_map.h" 29 #include "xe_memirq.h" 30 #include "xe_mmio.h" 31 #include "xe_ring_ops.h" 32 #include "xe_sriov.h" 33 #include "xe_trace_lrc.h" 34 #include "xe_vm.h" 35 #include "xe_wa.h" 36 37 #define LRC_VALID BIT_ULL(0) 38 #define LRC_PRIVILEGE BIT_ULL(8) 39 #define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3) 40 #define LRC_LEGACY_64B_CONTEXT 3 41 42 #define LRC_ENGINE_CLASS GENMASK_ULL(63, 61) 43 #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48) 44 45 #define LRC_PPHWSP_SIZE SZ_4K 46 #define LRC_INDIRECT_CTX_BO_SIZE SZ_4K 47 #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K 48 49 #define LRC_PRIORITY GENMASK_ULL(10, 9) 50 #define LRC_PRIORITY_LOW 0 51 #define LRC_PRIORITY_NORMAL 1 52 #define LRC_PRIORITY_HIGH 2 53 54 /* 55 * Layout of the LRC and associated data allocated as 56 * lrc->bo: 57 * 58 * Region Size 59 * +============================+=================================+ <- __xe_lrc_ring_offset() 60 * | Ring | ring_size, see | 61 * | | xe_lrc_init() | 62 * +============================+=================================+ <- __xe_lrc_pphwsp_offset() 63 * | PPHWSP (includes SW state) | 4K | 64 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset() 65 * | Engine Context Image | n * 4K, see | 66 * | | xe_gt_lrc_size() | 67 * 
+----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset() 68 * | Indirect Ring State Page | 0 or 4k, see | 69 * | | XE_LRC_FLAG_INDIRECT_RING_STATE | 70 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset() 71 * | Indirect Context Page | 0 or 4k, see | 72 * | | XE_LRC_FLAG_INDIRECT_CTX | 73 * +============================+=================================+ <- __xe_lrc_wa_bb_offset() 74 * | WA BB Per Ctx | 4k | 75 * +============================+=================================+ <- xe_bo_size(lrc->bo) 76 */ 77 78 static struct xe_device * 79 lrc_to_xe(struct xe_lrc *lrc) 80 { 81 return gt_to_xe(lrc->fence_ctx.gt); 82 } 83 84 static bool 85 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class) 86 { 87 struct xe_device *xe = gt_to_xe(gt); 88 89 if (XE_GT_WA(gt, 16010904313) && 90 (class == XE_ENGINE_CLASS_RENDER || 91 class == XE_ENGINE_CLASS_COMPUTE)) 92 return true; 93 94 if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev), 95 class, NULL)) 96 return true; 97 98 if (gt->ring_ops[class]->emit_aux_table_inv) 99 return true; 100 101 return false; 102 } 103 104 /** 105 * xe_gt_lrc_hang_replay_size() - Hang replay size 106 * @gt: The GT 107 * @class: Hardware engine class 108 * 109 * Determine size of GPU hang replay state for a GT and hardware engine class. 
110 * 111 * Return: Size of GPU hang replay size 112 */ 113 size_t xe_gt_lrc_hang_replay_size(struct xe_gt *gt, enum xe_engine_class class) 114 { 115 struct xe_device *xe = gt_to_xe(gt); 116 size_t size = 0; 117 118 /* Engine context image */ 119 switch (class) { 120 case XE_ENGINE_CLASS_RENDER: 121 if (GRAPHICS_VERx100(xe) >= 3510) 122 size += 7 * SZ_4K; 123 else if (GRAPHICS_VER(xe) >= 20) 124 size += 3 * SZ_4K; 125 else 126 size += 13 * SZ_4K; 127 break; 128 case XE_ENGINE_CLASS_COMPUTE: 129 if (GRAPHICS_VERx100(xe) >= 3510) 130 size += 5 * SZ_4K; 131 else if (GRAPHICS_VER(xe) >= 20) 132 size += 2 * SZ_4K; 133 else 134 size += 13 * SZ_4K; 135 break; 136 default: 137 WARN(1, "Unknown engine class: %d", class); 138 fallthrough; 139 case XE_ENGINE_CLASS_COPY: 140 case XE_ENGINE_CLASS_VIDEO_DECODE: 141 case XE_ENGINE_CLASS_VIDEO_ENHANCE: 142 case XE_ENGINE_CLASS_OTHER: 143 size += 1 * SZ_4K; 144 } 145 146 return size; 147 } 148 149 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class) 150 { 151 size_t size = xe_gt_lrc_hang_replay_size(gt, class); 152 153 /* Add indirect ring state page */ 154 if (xe_gt_has_indirect_ring_state(gt)) 155 size += LRC_INDIRECT_RING_STATE_SIZE; 156 157 return size + LRC_PPHWSP_SIZE; 158 } 159 160 /* 161 * The per-platform tables are u8-encoded in @data. Decode @data and set the 162 * addresses' offset and commands in @regs. The following encoding is used 163 * for each byte. There are 2 steps: decoding commands and decoding addresses. 164 * 165 * Commands: 166 * [7]: create NOPs - number of NOPs are set in lower bits 167 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set 168 * MI_LRI_FORCE_POSTED 169 * [5:0]: Number of NOPs or registers to set values to in case of 170 * MI_LOAD_REGISTER_IMM 171 * 172 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" 173 * number of registers. 
They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 7 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	/* A zero byte terminates the table */
	while (*data) {
		u8 count, flags;

		/* NOP(n): leave the next n dwords of @regs untouched */
		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI(count, flags): command header dword */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		/* The do/while below would wrap around on a zero count */
		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/* Accumulate 7 bits per byte while bit 7 flags more */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/*
			 * Only the address dword of each (address, value) pair
			 * is written, the value dword is skipped.
			 */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	/* Terminate the decoded image */
	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

/*
 * Table prefix shared by all Xe2 tables below. The bracketed numbers are the
 * dword index in the decoded register image.
 */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x3b] */
	LRI(1, 0),              /* [0x41] */
	REG(0x0c8),             /* [0x42] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),                 /* [0x00] */
	LRI(5, POSTED),         /* [0x01] */
	REG(0x034),             /* [0x02] RING_BUFFER_HEAD */
	REG(0x030),             /* [0x04] RING_BUFFER_TAIL */
	REG(0x038),             /* [0x06] RING_BUFFER_START */
	REG(0x048),             /* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),                 /* [0x0c] */
	LRI(9, POSTED),         /* [0x11] */
	REG(0x168),             /* [0x12] BB_ADDR_UDW */
	REG(0x140),             /* [0x14] BB_ADDR */
	REG(0x110),             /* [0x16] BB_STATE */
	REG16(0x588),           /* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1a] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1c] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x1e] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),           /* [0x22] BB_STACK_WRITE_PORT */

	NOP(12),                /* [0x24] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

/*
 * Pick the encoded register offset table matching the graphics version of
 * @xe and the engine @class. Render and copy engines have dedicated tables;
 * every other class shares the "xcs" ones.
 */
static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void
set_context_control(u32 *regs, struct xe_hw_engine *hwe) 648 { 649 regs[CTX_CONTEXT_CONTROL] = REG_MASKED_FIELD_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | 650 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 651 652 if (xe_gt_has_indirect_ring_state(hwe->gt)) 653 regs[CTX_CONTEXT_CONTROL] |= 654 REG_MASKED_FIELD_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE); 655 } 656 657 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe) 658 { 659 struct xe_memirq *memirq = >_to_tile(hwe->gt)->memirq; 660 struct xe_device *xe = gt_to_xe(hwe->gt); 661 u8 num_regs; 662 663 if (!xe_device_uses_memirq(xe)) 664 return; 665 666 regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM | 667 MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT; 668 regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr; 669 regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq); 670 671 num_regs = xe_device_has_msix(xe) ? 3 : 2; 672 regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) | 673 MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED; 674 regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr; 675 regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe); 676 regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr; 677 regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe); 678 679 if (xe_device_has_msix(xe)) { 680 regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr; 681 /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */ 682 } 683 } 684 685 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc) 686 { 687 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE; 688 } 689 690 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc) 691 { 692 return 0; 693 } 694 695 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc) 696 { 697 return lrc->ring.size; 698 } 699 700 /* Make the magic macros work */ 701 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset 702 #define __xe_lrc_regs_offset xe_lrc_regs_offset 703 704 #define LRC_CTX_JOB_TIMESTAMP_OFFSET 512 705 #define 
LRC_ENGINE_ID_PPHWSP_OFFSET 1024 706 #define LRC_PARALLEL_PPHWSP_OFFSET 2048 707 708 #define LRC_SEQNO_OFFSET 0 709 #define LRC_START_SEQNO_OFFSET (LRC_SEQNO_OFFSET + 8) 710 711 u32 xe_lrc_regs_offset(struct xe_lrc *lrc) 712 { 713 return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE; 714 } 715 716 /** 717 * xe_lrc_reg_size() - Get size of the LRC registers area within queues 718 * @xe: the &xe_device struct instance 719 * 720 * Returns: Size of the LRC registers area for current platform 721 */ 722 size_t xe_lrc_reg_size(struct xe_device *xe) 723 { 724 if (GRAPHICS_VERx100(xe) >= 1250) 725 return 96 * sizeof(u32); 726 else 727 return 80 * sizeof(u32); 728 } 729 730 size_t xe_lrc_skip_size(struct xe_device *xe) 731 { 732 return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe); 733 } 734 735 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc) 736 { 737 return LRC_SEQNO_OFFSET; 738 } 739 740 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc) 741 { 742 return LRC_START_SEQNO_OFFSET; 743 } 744 745 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc) 746 { 747 /* This is stored in the driver-defined portion of PPHWSP */ 748 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET; 749 } 750 751 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc) 752 { 753 /* The parallel is stored in the driver-defined portion of PPHWSP */ 754 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; 755 } 756 757 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc) 758 { 759 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET; 760 } 761 762 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc) 763 { 764 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32); 765 } 766 767 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc) 768 { 769 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32); 770 } 771 772 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc) 773 { 774 u32 
offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - 775 LRC_INDIRECT_RING_STATE_SIZE; 776 777 if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX) 778 offset -= LRC_INDIRECT_CTX_BO_SIZE; 779 780 return offset; 781 } 782 783 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc) 784 { 785 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE; 786 } 787 788 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc) 789 { 790 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE; 791 } 792 793 #define DECL_MAP_ADDR_HELPERS(elem, bo_expr) \ 794 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \ 795 { \ 796 struct xe_bo *bo = (bo_expr); \ 797 struct iosys_map map = bo->vmap; \ 798 \ 799 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \ 800 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \ 801 return map; \ 802 } \ 803 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \ 804 { \ 805 struct xe_bo *bo = (bo_expr); \ 806 \ 807 return xe_bo_ggtt_addr(bo) + __xe_lrc_##elem##_offset(lrc); \ 808 } \ 809 810 DECL_MAP_ADDR_HELPERS(ring, lrc->bo) 811 DECL_MAP_ADDR_HELPERS(pphwsp, lrc->bo) 812 DECL_MAP_ADDR_HELPERS(seqno, lrc->seqno_bo) 813 DECL_MAP_ADDR_HELPERS(regs, lrc->bo) 814 DECL_MAP_ADDR_HELPERS(start_seqno, lrc->seqno_bo) 815 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp, lrc->bo) 816 DECL_MAP_ADDR_HELPERS(ctx_timestamp, lrc->bo) 817 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw, lrc->bo) 818 DECL_MAP_ADDR_HELPERS(parallel, lrc->bo) 819 DECL_MAP_ADDR_HELPERS(indirect_ring, lrc->bo) 820 DECL_MAP_ADDR_HELPERS(engine_id, lrc->bo) 821 822 #undef DECL_MAP_ADDR_HELPERS 823 824 /** 825 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address 826 * @lrc: Pointer to the lrc. 
827 * 828 * Returns: ctx timestamp GGTT address 829 */ 830 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc) 831 { 832 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 833 } 834 835 /** 836 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address 837 * @lrc: Pointer to the lrc. 838 * 839 * Returns: ctx timestamp udw GGTT address 840 */ 841 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc) 842 { 843 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 844 } 845 846 /** 847 * xe_lrc_ctx_timestamp() - Read ctx timestamp value 848 * @lrc: Pointer to the lrc. 849 * 850 * Returns: ctx timestamp value 851 */ 852 static u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) 853 { 854 struct xe_device *xe = lrc_to_xe(lrc); 855 struct iosys_map map; 856 u32 ldw, udw = 0; 857 858 map = __xe_lrc_ctx_timestamp_map(lrc); 859 ldw = xe_map_read32(xe, &map); 860 861 if (xe->info.has_64bit_timestamp) { 862 map = __xe_lrc_ctx_timestamp_udw_map(lrc); 863 udw = xe_map_read32(xe, &map); 864 } 865 866 return (u64)udw << 32 | ldw; 867 } 868 869 /** 870 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address 871 * @lrc: Pointer to the lrc. 872 * 873 * Returns: ctx timestamp job GGTT address 874 */ 875 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc) 876 { 877 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc); 878 } 879 880 /** 881 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value 882 * @lrc: Pointer to the lrc. 
883 * 884 * Returns: ctx timestamp job value 885 */ 886 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc) 887 { 888 struct xe_device *xe = lrc_to_xe(lrc); 889 struct iosys_map map; 890 891 map = __xe_lrc_ctx_job_timestamp_map(lrc); 892 return xe_map_read32(xe, &map); 893 } 894 895 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc) 896 { 897 return __xe_lrc_pphwsp_ggtt_addr(lrc); 898 } 899 900 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc) 901 { 902 if (!xe_lrc_has_indirect_ring_state(lrc)) 903 return 0; 904 905 return __xe_lrc_indirect_ring_ggtt_addr(lrc); 906 } 907 908 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr) 909 { 910 struct xe_device *xe = lrc_to_xe(lrc); 911 struct iosys_map map; 912 913 map = __xe_lrc_indirect_ring_map(lrc); 914 iosys_map_incr(&map, reg_nr * sizeof(u32)); 915 return xe_map_read32(xe, &map); 916 } 917 918 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc, 919 int reg_nr, u32 val) 920 { 921 struct xe_device *xe = lrc_to_xe(lrc); 922 struct iosys_map map; 923 924 map = __xe_lrc_indirect_ring_map(lrc); 925 iosys_map_incr(&map, reg_nr * sizeof(u32)); 926 xe_map_write32(xe, &map, val); 927 } 928 929 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr) 930 { 931 struct xe_device *xe = lrc_to_xe(lrc); 932 struct iosys_map map; 933 934 map = __xe_lrc_regs_map(lrc); 935 iosys_map_incr(&map, reg_nr * sizeof(u32)); 936 return xe_map_read32(xe, &map); 937 } 938 939 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val) 940 { 941 struct xe_device *xe = lrc_to_xe(lrc); 942 struct iosys_map map; 943 944 map = __xe_lrc_regs_map(lrc); 945 iosys_map_incr(&map, reg_nr * sizeof(u32)); 946 xe_map_write32(xe, &map, val); 947 } 948 949 static void *empty_lrc_data(struct xe_hw_engine *hwe) 950 { 951 struct xe_gt *gt = hwe->gt; 952 void *data; 953 u32 *regs; 954 955 data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL); 956 if (!data) 957 return NULL; 958 959 /* 1st page: Per-Process of HW status Page */ 960 
regs = data + LRC_PPHWSP_SIZE; 961 set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe); 962 set_context_control(regs, hwe); 963 set_memory_based_intr(regs, hwe); 964 if (xe_gt_has_indirect_ring_state(gt)) { 965 regs = data + xe_gt_lrc_size(gt, hwe->class) - 966 LRC_INDIRECT_RING_STATE_SIZE; 967 set_offsets(regs, xe2_indirect_ring_state_offsets, hwe); 968 } 969 970 return data; 971 } 972 973 /** 974 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC 975 * of given engine. 976 * @hwe: the &xe_hw_engine struct instance 977 */ 978 void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe) 979 { 980 struct xe_gt *gt = hwe->gt; 981 u32 *regs; 982 983 if (!gt->default_lrc[hwe->class]) 984 return; 985 986 regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE; 987 set_memory_based_intr(regs, hwe); 988 } 989 990 /** 991 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data 992 * for given LRC. 
993 * @lrc: the &xe_lrc struct instance 994 * @hwe: the &xe_hw_engine struct instance 995 * @regs: scratch buffer to be used as temporary storage 996 */ 997 void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 998 u32 *regs) 999 { 1000 struct xe_gt *gt = hwe->gt; 1001 struct iosys_map map; 1002 size_t regs_len; 1003 1004 if (!xe_device_uses_memirq(gt_to_xe(gt))) 1005 return; 1006 1007 map = __xe_lrc_regs_map(lrc); 1008 regs_len = xe_lrc_reg_size(gt_to_xe(gt)); 1009 xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len); 1010 set_memory_based_intr(regs, hwe); 1011 xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len); 1012 } 1013 1014 static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) 1015 { 1016 u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt)); 1017 1018 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc)); 1019 xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc)); 1020 } 1021 1022 static void xe_lrc_finish(struct xe_lrc *lrc) 1023 { 1024 xe_hw_fence_ctx_finish(&lrc->fence_ctx); 1025 xe_bo_unpin_map_no_vm(lrc->bo); 1026 xe_bo_unpin_map_no_vm(lrc->seqno_bo); 1027 } 1028 1029 /* 1030 * wa_bb_setup_utilization() - Write commands to wa bb to assist 1031 * in calculating active context run ticks. 1032 * 1033 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the 1034 * context, but only gets updated when the context switches out. In order to 1035 * check how long a context has been active before it switches out, two things 1036 * are required: 1037 * 1038 * (1) Determine if the context is running: 1039 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in 1040 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is 1041 * initialized. During a query, we just check for this value to determine if the 1042 * context is active. 
If the context switched out, it would overwrite this 1043 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as 1044 * the last part of context restore, so reusing this LRC location will not 1045 * clobber anything. 1046 * 1047 * (2) Calculate the time that the context has been active for: 1048 * The CTX_TIMESTAMP ticks only when the context is active. If a context is 1049 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization. 1050 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific 1051 * engine instance. Since we do not know which instance the context is running 1052 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and 1053 * store it in the PPHSWP. 1054 */ 1055 #define CONTEXT_ACTIVE 1ULL 1056 static ssize_t setup_utilization_wa(struct xe_lrc *lrc, 1057 struct xe_hw_engine *hwe, 1058 u32 *batch, 1059 size_t max_len) 1060 { 1061 u32 *cmd = batch; 1062 1063 if (IS_SRIOV_VF(gt_to_xe(lrc->gt))) 1064 return 0; 1065 1066 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1067 return -ENOSPC; 1068 1069 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; 1070 *cmd++ = ENGINE_ID(0).addr; 1071 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc); 1072 *cmd++ = 0; 1073 1074 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1075 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1076 *cmd++ = 0; 1077 *cmd++ = lower_32_bits(CONTEXT_ACTIVE); 1078 1079 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) { 1080 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1081 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 1082 *cmd++ = 0; 1083 *cmd++ = upper_32_bits(CONTEXT_ACTIVE); 1084 } 1085 1086 return cmd - batch; 1087 } 1088 1089 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1090 u32 *batch, size_t max_len) 1091 { 1092 const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1093 u32 *cmd = batch; 1094 1095 if (!XE_GT_WA(lrc->gt, 
16010904313) || 1096 !(hwe->class == XE_ENGINE_CLASS_RENDER || 1097 hwe->class == XE_ENGINE_CLASS_COMPUTE || 1098 hwe->class == XE_ENGINE_CLASS_COPY || 1099 hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE || 1100 hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE)) 1101 return 0; 1102 1103 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1104 return -ENOSPC; 1105 1106 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO | 1107 MI_LRM_ASYNC; 1108 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1109 *cmd++ = ts_addr; 1110 *cmd++ = 0; 1111 1112 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO | 1113 MI_LRM_ASYNC; 1114 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1115 *cmd++ = ts_addr; 1116 *cmd++ = 0; 1117 1118 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO; 1119 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1120 *cmd++ = ts_addr; 1121 *cmd++ = 0; 1122 1123 return cmd - batch; 1124 } 1125 1126 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc, 1127 struct xe_hw_engine *hwe, 1128 u32 *batch, size_t max_len) 1129 { 1130 struct xe_device *xe = gt_to_xe(lrc->gt); 1131 const u32 *user_batch; 1132 u32 *cmd = batch; 1133 u32 count; 1134 1135 count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev), 1136 hwe->class, &user_batch); 1137 if (!count) 1138 return 0; 1139 1140 if (count > max_len) 1141 return -ENOSPC; 1142 1143 /* 1144 * This should be used only for tests and validation. 
Taint the kernel 1145 * as anything could be submitted directly in context switches 1146 */ 1147 add_taint(TAINT_TEST, LOCKDEP_STILL_OK); 1148 1149 memcpy(cmd, user_batch, count * sizeof(u32)); 1150 cmd += count; 1151 1152 return cmd - batch; 1153 } 1154 1155 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc, 1156 struct xe_hw_engine *hwe, 1157 u32 *batch, size_t max_len) 1158 { 1159 struct xe_device *xe = gt_to_xe(lrc->gt); 1160 const u32 *user_batch; 1161 u32 *cmd = batch; 1162 u32 count; 1163 1164 count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev), 1165 hwe->class, &user_batch); 1166 if (!count) 1167 return 0; 1168 1169 if (count > max_len) 1170 return -ENOSPC; 1171 1172 /* 1173 * This should be used only for tests and validation. Taint the kernel 1174 * as anything could be submitted directly in context switches 1175 */ 1176 add_taint(TAINT_TEST, LOCKDEP_STILL_OK); 1177 1178 memcpy(cmd, user_batch, count * sizeof(u32)); 1179 cmd += count; 1180 1181 return cmd - batch; 1182 } 1183 1184 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc, 1185 struct xe_hw_engine *hwe, 1186 u32 *batch, size_t max_len) 1187 { 1188 u32 *cmd = batch; 1189 1190 if (!XE_GT_WA(lrc->gt, 18022495364) || 1191 hwe->class != XE_ENGINE_CLASS_RENDER) 1192 return 0; 1193 1194 if (xe_gt_WARN_ON(lrc->gt, max_len < 3)) 1195 return -ENOSPC; 1196 1197 *cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_LRM_CS_MMIO | MI_LRI_NUM_REGS(1); 1198 *cmd++ = CS_DEBUG_MODE2(0).addr; 1199 *cmd++ = REG_MASKED_FIELD_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE); 1200 1201 return cmd - batch; 1202 } 1203 1204 static ssize_t setup_invalidate_auxccs_wa(struct xe_lrc *lrc, 1205 struct xe_hw_engine *hwe, 1206 u32 *batch, size_t max_len) 1207 { 1208 struct xe_gt *gt = lrc->gt; 1209 u32 *(*emit)(struct xe_gt *gt, u32 *cmd) = 1210 gt->ring_ops[hwe->class]->emit_aux_table_inv; 1211 1212 if (!emit) 1213 return 0; 1214 1215 if (xe_gt_WARN_ON(gt, max_len < 8)) 1216 return -ENOSPC; 1217 1218 
return emit(gt, batch) - batch; 1219 } 1220 1221 struct bo_setup { 1222 ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1223 u32 *batch, size_t max_size); 1224 }; 1225 1226 struct bo_setup_state { 1227 /* Input: */ 1228 struct xe_lrc *lrc; 1229 struct xe_hw_engine *hwe; 1230 size_t max_size; 1231 size_t reserve_dw; 1232 unsigned int offset; 1233 const struct bo_setup *funcs; 1234 unsigned int num_funcs; 1235 1236 /* State: */ 1237 u32 *buffer; 1238 u32 *ptr; 1239 unsigned int written; 1240 }; 1241 1242 static int setup_bo(struct bo_setup_state *state) 1243 { 1244 ssize_t remain; 1245 1246 if (state->lrc->bo->vmap.is_iomem) { 1247 xe_gt_assert(state->hwe->gt, state->buffer); 1248 state->ptr = state->buffer; 1249 } else { 1250 state->ptr = state->lrc->bo->vmap.vaddr + state->offset; 1251 } 1252 1253 remain = state->max_size / sizeof(u32); 1254 1255 for (size_t i = 0; i < state->num_funcs; i++) { 1256 ssize_t len = state->funcs[i].setup(state->lrc, state->hwe, 1257 state->ptr, remain); 1258 1259 remain -= len; 1260 1261 /* 1262 * Caller has asked for at least reserve_dw to remain unused. 1263 */ 1264 if (len < 0 || 1265 xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw)) 1266 goto fail; 1267 1268 state->ptr += len; 1269 state->written += len; 1270 } 1271 1272 return 0; 1273 1274 fail: 1275 return -ENOSPC; 1276 } 1277 1278 static void finish_bo(struct bo_setup_state *state) 1279 { 1280 if (!state->lrc->bo->vmap.is_iomem) 1281 return; 1282 1283 xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap, 1284 state->offset, state->buffer, 1285 state->written * sizeof(u32)); 1286 } 1287 1288 /** 1289 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks. 
* @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @scratch: preallocated scratch buffer for temporary storage
 * Return: 0 on success, negative error code on failure
 */
int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
{
	/* Callbacks run in this order; each appends to the WA batch buffer */
	static const struct bo_setup funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_state_cache_wa },
		{ .setup = setup_utilization_wa },
		{ .setup = setup_configfs_post_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = LRC_WA_BB_SIZE,
		.buffer = scratch,
		/* Keep one dword free for the MI_BATCH_BUFFER_END below */
		.reserve_dw = 1,
		.offset = __xe_lrc_wa_bb_offset(lrc),
		.funcs = funcs,
		.num_funcs = ARRAY_SIZE(funcs),
	};
	int ret;

	ret = setup_bo(&state);
	if (ret)
		return ret;

	*state.ptr++ = MI_BATCH_BUFFER_END;
	state.written++;

	finish_bo(&state);

	/*
	 * NOTE(review): the "+ 1" presumably sets a valid/enable bit in the
	 * per-context BB pointer - confirm against the register definition.
	 */
	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);

	return 0;
}

/*
 * Set up the per-context WA batch buffer, allocating a temporary staging
 * buffer when the LRC BO is mapped as iomem.
 *
 * Returns 0 on success, -ENOMEM if the staging buffer cannot be allocated, or
 * the error from xe_lrc_setup_wa_bb_with_scratch().
 */
static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	u32 *buf = NULL;
	int ret;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);

	/* buf is NULL for non-iomem mappings */
	kfree(buf);

	return ret;
}

/*
 * Fill the indirect context page and point CTX_CS_INDIRECT_CTX at it.
 *
 * Does nothing (returns 0) when the LRC was created without
 * XE_LRC_FLAG_INDIRECT_CTX. Render and compute engines additionally run
 * setup_timestamp_wa(); all classes run the aux table invalidation and the
 * configfs mid-context-restore batch.
 *
 * Returns 0 on success, -ENOMEM if the staging buffer cannot be allocated, or
 * the error from setup_bo().
 */
static int
setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	static const struct bo_setup rcs_funcs[] = {
		{ .setup = setup_timestamp_wa },
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	static const struct bo_setup xcs_funcs[] = {
		{ .setup = setup_invalidate_auxccs_wa },
		{ .setup = setup_configfs_mid_ctx_restore_bb },
	};
	struct bo_setup_state state = {
		.lrc = lrc,
		.hwe = hwe,
		.max_size = (63 * 64) /* max 63 cachelines */,
		.buffer = NULL,
		.offset = __xe_lrc_indirect_ctx_offset(lrc),
	};
	int ret;

	if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
		return 0;

	if (hwe->class == XE_ENGINE_CLASS_RENDER ||
	    hwe->class == XE_ENGINE_CLASS_COMPUTE) {
		state.funcs = rcs_funcs;
		state.num_funcs = ARRAY_SIZE(rcs_funcs);
	} else {
		state.funcs = xcs_funcs;
		state.num_funcs = ARRAY_SIZE(xcs_funcs);
	}

	if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
		return 0;

	/* iomem mappings are staged in a temporary buffer, see setup_bo() */
	if (lrc->bo->vmap.is_iomem) {
		state.buffer = kmalloc(state.max_size, GFP_KERNEL);
		if (!state.buffer)
			return -ENOMEM;
	}

	ret = setup_bo(&state);
	if (ret) {
		kfree(state.buffer);
		return ret;
	}

	/*
	 * Align to 64B cacheline so there's no garbage at the end for CS to
	 * execute: size for indirect ctx must be a multiple of 64.
	 */
	/* written counts dwords: 16 dwords == 64 bytes, hence the 0xf mask */
	while (state.written & 0xf) {
		*state.ptr++ = MI_NOOP;
		state.written++;
	}

	finish_bo(&state);
	kfree(state.buffer);

	/*
	 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
	 * varies per engine class, but the default is good enough
	 */
	xe_lrc_write_ctx_reg(lrc,
			     CTX_CS_INDIRECT_CTX,
			     (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
			     /* Size in CLs. */
			     (state.written * sizeof(u32) / 64));

	return 0;
}

/*
 * Convert a multi queue priority to the value stored in the LRC descriptor.
 * Asserts that @priority lies within the LOW..HIGH range.
 */
static u8 xe_multi_queue_prio_to_lrc(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	struct xe_device *xe = gt_to_xe(lrc->gt);

	xe_assert(xe, (priority >= XE_MULTI_QUEUE_PRIORITY_LOW &&
		       priority <= XE_MULTI_QUEUE_PRIORITY_HIGH));

	/* xe_multi_queue_priority is directly mapped to LRC priority values */
	return priority;
}

/**
 * xe_lrc_set_multi_queue_priority() - Set multi queue priority in LRC
 * @lrc: Logical Ring Context
 * @priority: Multi queue priority of the exec queue
 *
 * Convert @priority to LRC multi queue priority and update the @lrc descriptor
 */
void xe_lrc_set_multi_queue_priority(struct xe_lrc *lrc, enum xe_multi_queue_priority priority)
{
	/* Clear the old priority field before inserting the new value */
	lrc->desc &= ~LRC_PRIORITY;
	lrc->desc |= FIELD_PREP(LRC_PRIORITY, xe_multi_queue_prio_to_lrc(lrc, priority));
}

/*
 * Populate the context image, ring state, descriptor, seqno slots and the
 * workaround batches of an LRC whose BOs have already been allocated.
 *
 * Returns 0 on success, -ENOMEM if the empty LRC template cannot be built, or
 * the error from setup_wa_bb() / setup_indirect_ctx().
 */
static int xe_lrc_ctx_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
			   void *replay_state, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	u32 arb_enable;
	u32 state_cache_perf_fix[3];
	int err;

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values. If there's already a primed default_lrc, just copy it, otherwise
	 * it's the early submission to record the lrc: build a new empty one from
	 * scratch.
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (gt->default_lrc[hwe->class] || replay_state) {
		/*
		 * NOTE(review): this branch is also taken when only
		 * replay_state is set; the copy below then reads from
		 * default_lrc even if it is NULL - confirm callers guarantee
		 * a default LRC exists in that case.
		 */
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc->size - LRC_PPHWSP_SIZE);
		/* Replay state overwrites the start of the engine context image */
		if (replay_state)
			xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
					 replay_state, lrc->replay_size);
	} else {
		void *init_data = empty_lrc_data(hwe);

		if (!init_data) {
			return -ENOMEM;
		}

		xe_map_memcpy_to(xe, &map, 0, init_data, lrc->size);
		kfree(init_data);
	}

	if (vm)
		xe_lrc_set_ppgtt(lrc, vm);

	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		/* The vector is placed in both the upper and lower 16 bits */
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);

		/* Match head and tail pointers */
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));

		/* Match head and tail pointers */
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);

		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	/* Read-modify-write so bits already present in the image are kept */
	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     REG_MASKED_FIELD_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Engine class/instance are only encoded in the descriptor before 12.50 */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	/* First command in the ring: turn arbitration on */
	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	if (init_flags & XE_LRC_DISABLE_STATE_CACHE_PERF_FIX) {
		state_cache_perf_fix[0] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
		state_cache_perf_fix[1] = COMMON_SLICE_CHICKEN3.addr;
		state_cache_perf_fix[2] = REG_MASKED_FIELD_ENABLE(DISABLE_STATE_CACHE_PERF_FIX);
		xe_lrc_write_ring(lrc, state_cache_perf_fix, sizeof(state_cache_perf_fix));
	}

	/* Seed both seqno slots with the value just before the next seqno */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		return err;

	err = setup_indirect_ctx(lrc, hwe);

	return err;
}

/*
 * Initialize the software state of @lrc, allocate its backing BOs and
 * populate them through xe_lrc_ctx_init().
 *
 * Returns 0 on success or a negative error code; on failure after the main BO
 * was created, everything is released through xe_lrc_finish().
 */
static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_vm *vm,
		       void *replay_state, u32 ring_size, u16 msix_vec, u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	/* Ring + context image + WA BB; the indirect ctx page is added below */
	u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_bo *bo;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->replay_size = xe_gt_lrc_hang_replay_size(gt, hwe->class);
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
		lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
		bo_size += LRC_INDIRECT_CTX_BO_SIZE;
	}

	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;

	if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;

	bo = xe_bo_create_pin_map_novm(xe, tile, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags, false);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	lrc->bo = bo;

	/* Separate one-page system memory BO for lrc->seqno_bo */
	bo = xe_bo_create_pin_map_novm(xe, tile, PAGE_SIZE,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE |
				       XE_BO_FLAG_SYSTEM, false);
	if (IS_ERR(bo)) {
		err = PTR_ERR(bo);
		goto err_lrc_finish;
	}
	lrc->seqno_bo = bo;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	err = xe_lrc_ctx_init(lrc, hwe, vm, replay_state, msix_vec, init_flags);
	if (err)
		goto err_lrc_finish;

	if (vm && vm->xef)
		xe_drm_client_add_bo(vm->xef->client, lrc->bo);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @replay_state: GPU hang replay state
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return pointer to created LRC upon success and an error pointer
 * upon failure.
1659 */ 1660 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, 1661 void *replay_state, u32 ring_size, u16 msix_vec, u32 flags) 1662 { 1663 struct xe_lrc *lrc; 1664 int err; 1665 1666 lrc = kzalloc_obj(*lrc); 1667 if (!lrc) 1668 return ERR_PTR(-ENOMEM); 1669 1670 err = xe_lrc_init(lrc, hwe, vm, replay_state, ring_size, msix_vec, flags); 1671 if (err) { 1672 kfree(lrc); 1673 return ERR_PTR(err); 1674 } 1675 1676 return lrc; 1677 } 1678 1679 /** 1680 * xe_lrc_destroy - Destroy the LRC 1681 * @ref: reference to LRC 1682 * 1683 * Called when ref == 0, release resources held by the Logical Ring Context 1684 * (LRC) and free the LRC memory. 1685 */ 1686 void xe_lrc_destroy(struct kref *ref) 1687 { 1688 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount); 1689 1690 xe_lrc_finish(lrc); 1691 kfree(lrc); 1692 } 1693 1694 /** 1695 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC. 1696 * @lrc: the &xe_lrc struct instance 1697 */ 1698 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc) 1699 { 1700 if (xe_lrc_has_indirect_ring_state(lrc)) { 1701 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE, 1702 __xe_lrc_indirect_ring_ggtt_addr(lrc)); 1703 1704 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START, 1705 __xe_lrc_ring_ggtt_addr(lrc)); 1706 } else { 1707 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc)); 1708 } 1709 } 1710 1711 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail) 1712 { 1713 if (xe_lrc_has_indirect_ring_state(lrc)) 1714 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail); 1715 else 1716 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail); 1717 } 1718 1719 u32 xe_lrc_ring_tail(struct xe_lrc *lrc) 1720 { 1721 if (xe_lrc_has_indirect_ring_state(lrc)) 1722 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR; 1723 else 1724 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR; 1725 } 1726 1727 static u32 
xe_lrc_ring_start(struct xe_lrc *lrc) 1728 { 1729 if (xe_lrc_has_indirect_ring_state(lrc)) 1730 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START); 1731 else 1732 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START); 1733 } 1734 1735 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head) 1736 { 1737 if (xe_lrc_has_indirect_ring_state(lrc)) 1738 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head); 1739 else 1740 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head); 1741 } 1742 1743 u32 xe_lrc_ring_head(struct xe_lrc *lrc) 1744 { 1745 if (xe_lrc_has_indirect_ring_state(lrc)) 1746 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR; 1747 else 1748 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR; 1749 } 1750 1751 u32 xe_lrc_ring_space(struct xe_lrc *lrc) 1752 { 1753 const u32 head = xe_lrc_ring_head(lrc); 1754 const u32 tail = lrc->ring.tail; 1755 const u32 size = lrc->ring.size; 1756 1757 return ((head - tail - 1) & (size - 1)) + 1; 1758 } 1759 1760 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring, 1761 const void *data, size_t size) 1762 { 1763 struct xe_device *xe = lrc_to_xe(lrc); 1764 1765 iosys_map_incr(&ring, lrc->ring.tail); 1766 xe_map_memcpy_to(xe, &ring, 0, data, size); 1767 lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1); 1768 } 1769 1770 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size) 1771 { 1772 struct xe_device *xe = lrc_to_xe(lrc); 1773 struct iosys_map ring; 1774 u32 rhs; 1775 size_t aligned_size; 1776 1777 xe_assert(xe, IS_ALIGNED(size, 4)); 1778 aligned_size = ALIGN(size, 8); 1779 1780 ring = __xe_lrc_ring_map(lrc); 1781 1782 xe_assert(xe, lrc->ring.tail < lrc->ring.size); 1783 rhs = lrc->ring.size - lrc->ring.tail; 1784 if (size > rhs) { 1785 __xe_lrc_write_ring(lrc, ring, data, rhs); 1786 __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs); 1787 } else { 1788 __xe_lrc_write_ring(lrc, ring, data, size); 1789 } 1790 1791 if 
(aligned_size > size) { 1792 u32 noop = MI_NOOP; 1793 1794 __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop)); 1795 } 1796 } 1797 1798 u64 xe_lrc_descriptor(struct xe_lrc *lrc) 1799 { 1800 return lrc->desc | xe_lrc_ggtt_addr(lrc); 1801 } 1802 1803 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc) 1804 { 1805 return __xe_lrc_seqno_ggtt_addr(lrc); 1806 } 1807 1808 /** 1809 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence. 1810 * 1811 * Allocate but don't initialize an lrc seqno fence. 1812 * 1813 * Return: Pointer to the allocated fence or 1814 * negative error pointer on error. 1815 */ 1816 struct dma_fence *xe_lrc_alloc_seqno_fence(void) 1817 { 1818 return xe_hw_fence_alloc(); 1819 } 1820 1821 /** 1822 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence. 1823 * @fence: Pointer to the fence to free. 1824 * 1825 * Frees an lrc seqno fence that hasn't yet been 1826 * initialized. 1827 */ 1828 void xe_lrc_free_seqno_fence(struct dma_fence *fence) 1829 { 1830 xe_hw_fence_free(fence); 1831 } 1832 1833 /** 1834 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence. 1835 * @lrc: Pointer to the lrc. 1836 * @fence: Pointer to the fence to initialize. 1837 * 1838 * Initializes a pre-allocated lrc seqno fence. 1839 * After initialization, the fence is subject to normal 1840 * dma-fence refcounting. 
1841 */ 1842 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence) 1843 { 1844 xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc)); 1845 } 1846 1847 s32 xe_lrc_seqno(struct xe_lrc *lrc) 1848 { 1849 struct iosys_map map = __xe_lrc_seqno_map(lrc); 1850 1851 return xe_map_read32(lrc_to_xe(lrc), &map); 1852 } 1853 1854 s32 xe_lrc_start_seqno(struct xe_lrc *lrc) 1855 { 1856 struct iosys_map map = __xe_lrc_start_seqno_map(lrc); 1857 1858 return xe_map_read32(lrc_to_xe(lrc), &map); 1859 } 1860 1861 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc) 1862 { 1863 return __xe_lrc_start_seqno_ggtt_addr(lrc); 1864 } 1865 1866 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc) 1867 { 1868 return __xe_lrc_parallel_ggtt_addr(lrc); 1869 } 1870 1871 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc) 1872 { 1873 return __xe_lrc_parallel_map(lrc); 1874 } 1875 1876 /** 1877 * xe_lrc_engine_id() - Read engine id value 1878 * @lrc: Pointer to the lrc. 1879 * 1880 * Returns: context id value 1881 */ 1882 static u32 xe_lrc_engine_id(struct xe_lrc *lrc) 1883 { 1884 struct xe_device *xe = lrc_to_xe(lrc); 1885 struct iosys_map map; 1886 1887 map = __xe_lrc_engine_id_map(lrc); 1888 return xe_map_read32(xe, &map); 1889 } 1890 1891 static int instr_dw(u32 cmd_header) 1892 { 1893 /* GFXPIPE "SINGLE_DW" opcodes are a single dword */ 1894 if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) == 1895 GFXPIPE_SINGLE_DW_CMD(0, 0)) 1896 return 1; 1897 1898 /* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */ 1899 if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST) 1900 return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2; 1901 1902 /* Most instructions have the # of dwords (minus 2) in 7:0 */ 1903 return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2; 1904 } 1905 1906 static int dump_mi_command(struct drm_printer *p, 1907 struct xe_gt *gt, 1908 u32 *start, 1909 u32 *dw, 1910 int remaining_dw) 1911 { 1912 u32 
inst_header = *dw; 1913 u32 numdw = instr_dw(inst_header); 1914 u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header); 1915 int num_noop; 1916 1917 /* First check for commands that don't have/use a '# DW' field */ 1918 switch (inst_header & MI_OPCODE) { 1919 case MI_NOOP: 1920 num_noop = 1; 1921 while (num_noop < remaining_dw && 1922 (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP) 1923 num_noop++; 1924 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_NOOP (%d dwords)\n", 1925 dw - num_noop - start, inst_header, num_noop); 1926 return num_noop; 1927 1928 case MI_TOPOLOGY_FILTER: 1929 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_TOPOLOGY_FILTER\n", 1930 dw - start, inst_header); 1931 return 1; 1932 1933 case MI_BATCH_BUFFER_END: 1934 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_BATCH_BUFFER_END\n", 1935 dw - start, inst_header); 1936 /* Return 'remaining_dw' to consume the rest of the LRC */ 1937 return remaining_dw; 1938 } 1939 1940 /* 1941 * Any remaining commands include a # of dwords. We should make sure 1942 * it doesn't exceed the remaining size of the LRC. 1943 */ 1944 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1945 numdw = remaining_dw; 1946 1947 switch (inst_header & MI_OPCODE) { 1948 case MI_LOAD_REGISTER_IMM: 1949 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_IMM: %d regs\n", 1950 dw - start, inst_header, (numdw - 1) / 2); 1951 for (int i = 1; i < numdw; i += 2) 1952 drm_printf(p, "LRC[%#5tx] = - %#6x = %#010x\n", 1953 &dw[i] - start, dw[i], dw[i + 1]); 1954 return numdw; 1955 1956 case MI_LOAD_REGISTER_MEM & MI_OPCODE: 1957 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_LOAD_REGISTER_MEM: %s%s\n", 1958 dw - start, inst_header, 1959 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "", 1960 dw[0] & MI_LRM_USE_GGTT ? 
"USE_GGTT " : ""); 1961 if (numdw == 4) 1962 drm_printf(p, "LRC[%#5tx] = - %#6x = %#010llx\n", 1963 dw - start, 1964 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2]))); 1965 else 1966 drm_printf(p, "LRC[%#5tx] = - %*ph (%s)\n", 1967 dw - start, (int)sizeof(u32) * (numdw - 1), 1968 dw + 1, numdw < 4 ? "truncated" : "malformed"); 1969 return numdw; 1970 1971 case MI_FORCE_WAKEUP: 1972 drm_printf(p, "LRC[%#5tx] = [%#010x] MI_FORCE_WAKEUP\n", 1973 dw - start, inst_header); 1974 return numdw; 1975 1976 default: 1977 drm_printf(p, "LRC[%#5tx] = [%#010x] unknown MI opcode %#x, likely %d dwords\n", 1978 dw - start, inst_header, opcode, numdw); 1979 return numdw; 1980 } 1981 } 1982 1983 static int dump_gfxpipe_command(struct drm_printer *p, 1984 struct xe_gt *gt, 1985 u32 *start, 1986 u32 *dw, 1987 int remaining_dw) 1988 { 1989 u32 numdw = instr_dw(*dw); 1990 u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw); 1991 u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw); 1992 u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw); 1993 1994 /* 1995 * Make sure we haven't mis-parsed a number of dwords that exceeds the 1996 * remaining size of the LRC. 
1997 */ 1998 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1999 numdw = remaining_dw; 2000 2001 switch (*dw & GFXPIPE_MATCH_MASK) { 2002 #define MATCH(cmd) \ 2003 case cmd: \ 2004 drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \ 2005 dw - start, *dw, numdw); \ 2006 return numdw 2007 #define MATCH3D(cmd) \ 2008 case CMD_##cmd: \ 2009 drm_printf(p, "LRC[%#5tx] = [%#010x] " #cmd " (%d dwords)\n", \ 2010 dw - start, *dw, numdw); \ 2011 return numdw 2012 2013 MATCH(STATE_BASE_ADDRESS); 2014 MATCH(STATE_SIP); 2015 MATCH(GPGPU_CSR_BASE_ADDRESS); 2016 MATCH(STATE_COMPUTE_MODE); 2017 MATCH3D(3DSTATE_BTD); 2018 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS); 2019 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS); 2020 2021 MATCH3D(3DSTATE_VF_STATISTICS); 2022 2023 MATCH(PIPELINE_SELECT); 2024 2025 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST); 2026 MATCH3D(3DSTATE_CUSTOM_SAMPLE_PATTERN); 2027 MATCH3D(3DSTATE_CLEAR_PARAMS); 2028 MATCH3D(3DSTATE_DEPTH_BUFFER); 2029 MATCH3D(3DSTATE_STENCIL_BUFFER); 2030 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER); 2031 MATCH3D(3DSTATE_VERTEX_BUFFERS); 2032 MATCH3D(3DSTATE_VERTEX_ELEMENTS); 2033 MATCH3D(3DSTATE_INDEX_BUFFER); 2034 MATCH3D(3DSTATE_VF); 2035 MATCH3D(3DSTATE_MULTISAMPLE); 2036 MATCH3D(3DSTATE_CC_STATE_POINTERS); 2037 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS); 2038 MATCH3D(3DSTATE_VS); 2039 MATCH3D(3DSTATE_GS); 2040 MATCH3D(3DSTATE_CLIP); 2041 MATCH3D(3DSTATE_SF); 2042 MATCH3D(3DSTATE_WM); 2043 MATCH3D(3DSTATE_CONSTANT_VS); 2044 MATCH3D(3DSTATE_CONSTANT_GS); 2045 MATCH3D(3DSTATE_CONSTANT_PS); 2046 MATCH3D(3DSTATE_SAMPLE_MASK); 2047 MATCH3D(3DSTATE_CONSTANT_HS); 2048 MATCH3D(3DSTATE_CONSTANT_DS); 2049 MATCH3D(3DSTATE_HS); 2050 MATCH3D(3DSTATE_TE); 2051 MATCH3D(3DSTATE_DS); 2052 MATCH3D(3DSTATE_STREAMOUT); 2053 MATCH3D(3DSTATE_SBE); 2054 MATCH3D(3DSTATE_PS); 2055 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP); 2056 MATCH3D(3DSTATE_CPS_POINTERS); 2057 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC); 2058 MATCH3D(3DSTATE_BLEND_STATE_POINTERS); 2059 
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS); 2060 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS); 2061 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS); 2062 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS); 2063 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS); 2064 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS); 2065 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS); 2066 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS); 2067 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS); 2068 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS); 2069 MATCH3D(3DSTATE_VF_INSTANCING); 2070 MATCH3D(3DSTATE_VF_SGVS); 2071 MATCH3D(3DSTATE_VF_TOPOLOGY); 2072 MATCH3D(3DSTATE_WM_CHROMAKEY); 2073 MATCH3D(3DSTATE_PS_BLEND); 2074 MATCH3D(3DSTATE_WM_DEPTH_STENCIL); 2075 MATCH3D(3DSTATE_PS_EXTRA); 2076 MATCH3D(3DSTATE_RASTER); 2077 MATCH3D(3DSTATE_SBE_SWIZ); 2078 MATCH3D(3DSTATE_WM_HZ_OP); 2079 MATCH3D(3DSTATE_VF_COMPONENT_PACKING); 2080 MATCH3D(3DSTATE_VF_SGVS_2); 2081 MATCH3D(3DSTATE_VFG); 2082 MATCH3D(3DSTATE_URB_ALLOC_VS); 2083 MATCH3D(3DSTATE_URB_ALLOC_HS); 2084 MATCH3D(3DSTATE_URB_ALLOC_DS); 2085 MATCH3D(3DSTATE_URB_ALLOC_GS); 2086 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0); 2087 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1); 2088 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2); 2089 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3); 2090 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION); 2091 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO); 2092 MATCH3D(3DSTATE_AMFS); 2093 MATCH3D(3DSTATE_DEPTH_BOUNDS); 2094 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS); 2095 MATCH3D(3DSTATE_CONSTANT_TS_POINTER); 2096 MATCH3D(3DSTATE_MESH_CONTROL); 2097 MATCH3D(3DSTATE_MESH_DISTRIB); 2098 MATCH3D(3DSTATE_TASK_REDISTRIB); 2099 MATCH3D(3DSTATE_MESH_SHADER); 2100 MATCH3D(3DSTATE_MESH_SHADER_DATA); 2101 MATCH3D(3DSTATE_TASK_CONTROL); 2102 MATCH3D(3DSTATE_TASK_SHADER); 2103 MATCH3D(3DSTATE_TASK_SHADER_DATA); 2104 MATCH3D(3DSTATE_URB_ALLOC_MESH); 2105 MATCH3D(3DSTATE_URB_ALLOC_TASK); 2106 MATCH3D(3DSTATE_CLIP_MESH); 2107 MATCH3D(3DSTATE_SBE_MESH); 2108 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER); 2109 MATCH3D(3DSTATE_COARSE_PIXEL); 2110 
MATCH3D(3DSTATE_MESH_SHADER_DATA_EXT); 2111 MATCH3D(3DSTATE_TASK_SHADER_DATA_EXT); 2112 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC_2); 2113 MATCH3D(3DSTATE_CC_STATE_POINTERS_2); 2114 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS_2); 2115 MATCH3D(3DSTATE_BLEND_STATE_POINTERS_2); 2116 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP_2); 2117 2118 MATCH3D(3DSTATE_DRAWING_RECTANGLE); 2119 MATCH3D(3DSTATE_URB_MEMORY); 2120 MATCH3D(3DSTATE_CHROMA_KEY); 2121 MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET); 2122 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN); 2123 MATCH3D(3DSTATE_LINE_STIPPLE); 2124 MATCH3D(3DSTATE_AA_LINE_PARAMETERS); 2125 MATCH3D(3DSTATE_MONOFILTER_SIZE); 2126 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS); 2127 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS); 2128 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS); 2129 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS); 2130 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS); 2131 MATCH3D(3DSTATE_SO_DECL_LIST); 2132 MATCH3D(3DSTATE_SO_BUFFER); 2133 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC); 2134 MATCH3D(3DSTATE_SAMPLE_PATTERN); 2135 MATCH3D(3DSTATE_3D_MODE); 2136 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE); 2137 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS); 2138 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO); 2139 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTER_2); 2140 2141 default: 2142 drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n", 2143 dw - start, *dw, pipeline, opcode, subopcode, numdw); 2144 return numdw; 2145 } 2146 } 2147 2148 static int dump_gfx_state_command(struct drm_printer *p, 2149 struct xe_gt *gt, 2150 u32 *start, 2151 u32 *dw, 2152 int remaining_dw) 2153 { 2154 u32 numdw = instr_dw(*dw); 2155 u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw); 2156 2157 /* 2158 * Make sure we haven't mis-parsed a number of dwords that exceeds the 2159 * remaining size of the LRC. 
2160 */ 2161 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 2162 numdw = remaining_dw; 2163 2164 switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) { 2165 MATCH(STATE_WRITE_INLINE); 2166 2167 default: 2168 drm_printf(p, "LRC[%#5tx] = [%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n", 2169 dw - start, *dw, opcode, numdw); 2170 return numdw; 2171 } 2172 } 2173 2174 void xe_lrc_dump_default(struct drm_printer *p, 2175 struct xe_gt *gt, 2176 enum xe_engine_class hwe_class) 2177 { 2178 u32 *dw, *start; 2179 int remaining_dw, num_dw; 2180 2181 if (!gt->default_lrc[hwe_class]) { 2182 drm_printf(p, "No default LRC for class %d\n", hwe_class); 2183 return; 2184 } 2185 2186 /* 2187 * Skip the beginning of the LRC since it contains the per-process 2188 * hardware status page. 2189 */ 2190 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE; 2191 start = dw; 2192 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4; 2193 2194 while (remaining_dw > 0) { 2195 if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) { 2196 num_dw = dump_mi_command(p, gt, start, dw, remaining_dw); 2197 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) { 2198 num_dw = dump_gfxpipe_command(p, gt, start, dw, remaining_dw); 2199 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) { 2200 num_dw = dump_gfx_state_command(p, gt, start, dw, remaining_dw); 2201 } else { 2202 num_dw = min(instr_dw(*dw), remaining_dw); 2203 drm_printf(p, "LRC[%#5tx] = [%#10x] Unknown instruction of type %#x, likely %d dwords\n", 2204 dw - start, 2205 *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw), 2206 num_dw); 2207 } 2208 2209 dw += num_dw; 2210 remaining_dw -= num_dw; 2211 } 2212 } 2213 2214 /* 2215 * Lookup the value of a register within the offset/value pairs of an 2216 * MI_LOAD_REGISTER_IMM instruction. 2217 * 2218 * Return -ENOENT if the register is not present in the MI_LRI instruction. 
2219 */ 2220 static int lookup_reg_in_mi_lri(u32 offset, u32 *value, 2221 const u32 *dword_pair, int num_regs) 2222 { 2223 for (int i = 0; i < num_regs; i++) { 2224 if (dword_pair[2 * i] == offset) { 2225 *value = dword_pair[2 * i + 1]; 2226 return 0; 2227 } 2228 } 2229 2230 return -ENOENT; 2231 } 2232 2233 /* 2234 * Lookup the value of a register in a specific engine type's default LRC. 2235 * 2236 * Return -EINVAL if the default LRC doesn't exist, or ENOENT if the register 2237 * cannot be found in the default LRC. 2238 */ 2239 int xe_lrc_lookup_default_reg_value(struct xe_gt *gt, 2240 enum xe_engine_class hwe_class, 2241 u32 offset, 2242 u32 *value) 2243 { 2244 u32 *dw; 2245 int remaining_dw, ret; 2246 2247 if (!gt->default_lrc[hwe_class]) 2248 return -EINVAL; 2249 2250 /* 2251 * Skip the beginning of the LRC since it contains the per-process 2252 * hardware status page. 2253 */ 2254 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE; 2255 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4; 2256 2257 while (remaining_dw > 0) { 2258 u32 num_dw = instr_dw(*dw); 2259 2260 if (num_dw > remaining_dw) 2261 num_dw = remaining_dw; 2262 2263 switch (*dw & XE_INSTR_CMD_TYPE) { 2264 case XE_INSTR_MI: 2265 switch (*dw & MI_OPCODE) { 2266 case MI_BATCH_BUFFER_END: 2267 /* End of LRC; register not found */ 2268 return -ENOENT; 2269 2270 case MI_NOOP: 2271 case MI_TOPOLOGY_FILTER: 2272 /* 2273 * MI_NOOP and MI_TOPOLOGY_FILTER don't have 2274 * a length field and are always 1-dword 2275 * instructions. 2276 */ 2277 remaining_dw--; 2278 dw++; 2279 break; 2280 2281 case MI_LOAD_REGISTER_IMM: 2282 ret = lookup_reg_in_mi_lri(offset, value, 2283 dw + 1, (num_dw - 1) / 2); 2284 if (ret == 0) 2285 return 0; 2286 2287 fallthrough; 2288 2289 default: 2290 /* 2291 * Jump to next instruction based on length 2292 * field. 
2293 */ 2294 remaining_dw -= num_dw; 2295 dw += num_dw; 2296 break; 2297 } 2298 break; 2299 2300 default: 2301 /* Jump to next instruction based on length field. */ 2302 remaining_dw -= num_dw; 2303 dw += num_dw; 2304 } 2305 } 2306 2307 return -ENOENT; 2308 } 2309 2310 struct instr_state { 2311 u32 instr; 2312 u16 num_dw; 2313 }; 2314 2315 static const struct instr_state xe_hpg_svg_state[] = { 2316 { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 }, 2317 { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 }, 2318 { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 }, 2319 { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 }, 2320 { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 }, 2321 { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 }, 2322 { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 }, 2323 { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 }, 2324 { .instr = CMD_3DSTATE_VS, .num_dw = 9 }, 2325 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 }, 2326 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 }, 2327 { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 }, 2328 { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 }, 2329 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 }, 2330 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 }, 2331 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 }, 2332 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 }, 2333 { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 }, 2334 { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 }, 2335 { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 }, 2336 { .instr = CMD_3DSTATE_SF, .num_dw = 4 }, 2337 { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 }, 2338 { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 }, 2339 { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 }, 2340 { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 }, 2341 { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 }, 2342 { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 }, 2343 { 
.instr = CMD_3DSTATE_HS, .num_dw = 9 }, 2344 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 }, 2345 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 }, 2346 { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 }, 2347 { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 }, 2348 { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 }, 2349 { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 }, 2350 { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 }, 2351 { .instr = CMD_3DSTATE_TE, .num_dw = 5 }, 2352 { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 }, 2353 { .instr = CMD_3DSTATE_DS, .num_dw = 11 }, 2354 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 }, 2355 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 }, 2356 { .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 }, 2357 { .instr = CMD_3DSTATE_GS, .num_dw = 10 }, 2358 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 }, 2359 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 }, 2360 { .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 }, 2361 { .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 }, 2362 { .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 }, 2363 { .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 }, 2364 { .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 }, 2365 { .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 }, 2366 }; 2367 2368 u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs) 2369 { 2370 struct xe_gt *gt = q->hwe->gt; 2371 struct xe_device *xe = gt_to_xe(gt); 2372 const struct instr_state *state_table = NULL; 2373 int state_table_size = 0; 2374 2375 /* 2376 * Wa_14019789679 2377 * 2378 * If the driver doesn't explicitly emit the SVG instructions while 2379 * setting up the default LRC, the context switch will write 0's 2380 * (noops) into the LRC memory rather than the expected instruction 2381 * headers. 
Application contexts start out as a copy of the default 2382 * LRC, and if they also do not emit specific settings for some SVG 2383 * state, then on context restore they'll unintentionally inherit 2384 * whatever state setting the previous context had programmed into the 2385 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will 2386 * prevent the hardware from resetting that state back to any specific 2387 * value). 2388 * 2389 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL 2390 * since that's a specific state setting that can easily cause GPU 2391 * hangs if unintentionally inherited. However to be safe we'll 2392 * continue to emit all of the SVG state since it's best not to leak 2393 * any of the state between contexts, even if that leakage is harmless. 2394 */ 2395 if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) { 2396 state_table = xe_hpg_svg_state; 2397 state_table_size = ARRAY_SIZE(xe_hpg_svg_state); 2398 } 2399 2400 if (!state_table) { 2401 xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n", 2402 GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100); 2403 return cs; 2404 } 2405 2406 for (int i = 0; i < state_table_size; i++) { 2407 u32 instr = state_table[i].instr; 2408 u16 num_dw = state_table[i].num_dw; 2409 bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW); 2410 2411 xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE); 2412 xe_gt_assert(gt, num_dw != 0); 2413 xe_gt_assert(gt, is_single_dw ^ (num_dw > 1)); 2414 2415 /* 2416 * Xe2's SVG context is the same as the one on DG2 / MTL 2417 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has 2418 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined). 2419 * Just make the replacement here rather than defining a 2420 * whole separate table for the single trivial change. 
2421 */ 2422 if (GRAPHICS_VER(xe) >= 20 && 2423 instr == CMD_3DSTATE_DRAWING_RECTANGLE) 2424 instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST; 2425 2426 *cs = instr; 2427 if (!is_single_dw) 2428 *cs |= (num_dw - 2); 2429 2430 cs += num_dw; 2431 } 2432 2433 return cs; 2434 } 2435 2436 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) 2437 { 2438 struct xe_lrc_snapshot *snapshot = kmalloc_obj(*snapshot, GFP_NOWAIT); 2439 2440 if (!snapshot) 2441 return NULL; 2442 2443 snapshot->context_desc = xe_lrc_ggtt_addr(lrc); 2444 snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc); 2445 snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc); 2446 snapshot->head = xe_lrc_ring_head(lrc); 2447 snapshot->tail.internal = lrc->ring.tail; 2448 snapshot->tail.memory = xe_lrc_ring_tail(lrc); 2449 snapshot->start = xe_lrc_ring_start(lrc); 2450 snapshot->start_seqno = xe_lrc_start_seqno(lrc); 2451 snapshot->seqno = xe_lrc_seqno(lrc); 2452 snapshot->lrc_bo = xe_bo_get(lrc->bo); 2453 snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc); 2454 snapshot->lrc_size = lrc->size; 2455 snapshot->replay_offset = 0; 2456 snapshot->replay_size = lrc->replay_size; 2457 snapshot->lrc_snapshot = NULL; 2458 snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc)); 2459 snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc); 2460 return snapshot; 2461 } 2462 2463 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) 2464 { 2465 struct xe_bo *bo; 2466 struct iosys_map src; 2467 2468 if (!snapshot) 2469 return; 2470 2471 bo = snapshot->lrc_bo; 2472 snapshot->lrc_bo = NULL; 2473 2474 snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL); 2475 if (!snapshot->lrc_snapshot) 2476 goto put_bo; 2477 2478 xe_bo_lock(bo, false); 2479 if (!ttm_bo_vmap(&bo->ttm, &src)) { 2480 xe_map_memcpy_from(xe_bo_device(bo), 2481 snapshot->lrc_snapshot, &src, snapshot->lrc_offset, 2482 snapshot->lrc_size); 2483 ttm_bo_vunmap(&bo->ttm, &src); 2484 } else { 2485 
kvfree(snapshot->lrc_snapshot); 2486 snapshot->lrc_snapshot = NULL; 2487 } 2488 xe_bo_unlock(bo); 2489 put_bo: 2490 xe_bo_put(bo); 2491 } 2492 2493 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p) 2494 { 2495 unsigned long i; 2496 2497 if (!snapshot) 2498 return; 2499 2500 drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc); 2501 drm_printf(p, "\tHW Ring address: 0x%08x\n", 2502 snapshot->ring_addr); 2503 drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n", 2504 snapshot->indirect_context_desc); 2505 drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head); 2506 drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n", 2507 snapshot->tail.internal, snapshot->tail.memory); 2508 drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start); 2509 drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno); 2510 drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno); 2511 drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp); 2512 drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp); 2513 2514 if (!snapshot->lrc_snapshot) 2515 return; 2516 2517 drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE); 2518 drm_puts(p, "\t[HWSP].data: "); 2519 for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) { 2520 u32 *val = snapshot->lrc_snapshot + i; 2521 char dumped[ASCII85_BUFSZ]; 2522 2523 drm_puts(p, ascii85_encode(*val, dumped)); 2524 } 2525 2526 drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE); 2527 drm_printf(p, "\n\t[HWCTX].replay_offset: 0x%lx\n", snapshot->replay_offset); 2528 drm_printf(p, "\n\t[HWCTX].replay_length: 0x%lx\n", snapshot->replay_size); 2529 2530 drm_puts(p, "\t[HWCTX].data: "); 2531 for (; i < snapshot->lrc_size; i += sizeof(u32)) { 2532 u32 *val = snapshot->lrc_snapshot + i; 2533 char dumped[ASCII85_BUFSZ]; 2534 2535 drm_puts(p, ascii85_encode(*val, dumped)); 2536 } 2537 drm_puts(p, "\n"); 2538 } 2539 2540 void 
xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) 2541 { 2542 if (!snapshot) 2543 return; 2544 2545 kvfree(snapshot->lrc_snapshot); 2546 if (snapshot->lrc_bo) 2547 xe_bo_put(snapshot->lrc_bo); 2548 2549 kfree(snapshot); 2550 } 2551 2552 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) 2553 { 2554 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); 2555 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); 2556 struct xe_hw_engine *hwe; 2557 u64 val; 2558 2559 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); 2560 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), 2561 "Unexpected engine class:instance %d:%d for context utilization\n", 2562 class, instance)) 2563 return -1; 2564 2565 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 2566 val = xe_mmio_read64_2x32(&hwe->gt->mmio, 2567 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2568 else 2569 val = xe_mmio_read32(&hwe->gt->mmio, 2570 RING_CTX_TIMESTAMP(hwe->mmio_base)); 2571 2572 *reg_ctx_ts = val; 2573 2574 return 0; 2575 } 2576 2577 /** 2578 * xe_lrc_timestamp() - Current ctx timestamp 2579 * @lrc: Pointer to the lrc. 2580 * 2581 * Return latest ctx timestamp. With support for active contexts, the 2582 * calculation may be slightly racy, so follow a read-again logic to ensure that 2583 * the context is still active before returning the right timestamp. 
2584 * 2585 * Returns: New ctx timestamp value 2586 */ 2587 u64 xe_lrc_timestamp(struct xe_lrc *lrc) 2588 { 2589 u64 lrc_ts, reg_ts, new_ts = lrc->ctx_timestamp; 2590 u32 engine_id; 2591 2592 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2593 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 2594 if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 2595 new_ts = lrc_ts; 2596 goto done; 2597 } 2598 2599 if (lrc_ts == CONTEXT_ACTIVE) { 2600 engine_id = xe_lrc_engine_id(lrc); 2601 if (!get_ctx_timestamp(lrc, engine_id, ®_ts)) 2602 new_ts = reg_ts; 2603 2604 /* read lrc again to ensure context is still active */ 2605 lrc_ts = xe_lrc_ctx_timestamp(lrc); 2606 } 2607 2608 /* 2609 * If context switched out, just use the lrc_ts. Note that this needs to 2610 * be a separate if condition. 2611 */ 2612 if (lrc_ts != CONTEXT_ACTIVE) 2613 new_ts = lrc_ts; 2614 2615 done: 2616 return new_ts; 2617 } 2618 2619 /** 2620 * xe_lrc_update_timestamp() - Update ctx timestamp 2621 * @lrc: Pointer to the lrc. 2622 * @old_ts: Old timestamp value 2623 * 2624 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 2625 * update saved value. 2626 * 2627 * Returns: New ctx timestamp value 2628 */ 2629 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 2630 { 2631 *old_ts = lrc->ctx_timestamp; 2632 lrc->ctx_timestamp = xe_lrc_timestamp(lrc); 2633 2634 trace_xe_lrc_update_timestamp(lrc, *old_ts); 2635 2636 return lrc->ctx_timestamp; 2637 } 2638 2639 /** 2640 * xe_lrc_ring_is_idle() - LRC is idle 2641 * @lrc: Pointer to the lrc. 2642 * 2643 * Compare LRC ring head and tail to determine if idle. 2644 * 2645 * Return: True is ring is idle, False otherwise 2646 */ 2647 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc) 2648 { 2649 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc); 2650 } 2651