1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2021 Intel Corporation 4 */ 5 6 #include "xe_lrc.h" 7 8 #include <generated/xe_wa_oob.h> 9 10 #include <linux/ascii85.h> 11 #include <linux/panic.h> 12 13 #include "instructions/xe_mi_commands.h" 14 #include "instructions/xe_gfxpipe_commands.h" 15 #include "instructions/xe_gfx_state_commands.h" 16 #include "regs/xe_engine_regs.h" 17 #include "regs/xe_lrc_layout.h" 18 #include "xe_bb.h" 19 #include "xe_bo.h" 20 #include "xe_configfs.h" 21 #include "xe_device.h" 22 #include "xe_drm_client.h" 23 #include "xe_exec_queue_types.h" 24 #include "xe_gt.h" 25 #include "xe_gt_printk.h" 26 #include "xe_hw_fence.h" 27 #include "xe_map.h" 28 #include "xe_memirq.h" 29 #include "xe_mmio.h" 30 #include "xe_sriov.h" 31 #include "xe_trace_lrc.h" 32 #include "xe_vm.h" 33 #include "xe_wa.h" 34 35 #define LRC_VALID BIT_ULL(0) 36 #define LRC_PRIVILEGE BIT_ULL(8) 37 #define LRC_ADDRESSING_MODE GENMASK_ULL(4, 3) 38 #define LRC_LEGACY_64B_CONTEXT 3 39 40 #define LRC_ENGINE_CLASS GENMASK_ULL(63, 61) 41 #define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48) 42 43 #define LRC_PPHWSP_SIZE SZ_4K 44 #define LRC_INDIRECT_CTX_BO_SIZE SZ_4K 45 #define LRC_INDIRECT_RING_STATE_SIZE SZ_4K 46 47 /* 48 * Layout of the LRC and associated data allocated as 49 * lrc->bo: 50 * 51 * Region Size 52 * +============================+=================================+ <- __xe_lrc_ring_offset() 53 * | Ring | ring_size, see | 54 * | | xe_lrc_init() | 55 * +============================+=================================+ <- __xe_lrc_pphwsp_offset() 56 * | PPHWSP (includes SW state) | 4K | 57 * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset() 58 * | Engine Context Image | n * 4K, see | 59 * | | xe_gt_lrc_size() | 60 * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset() 61 * | Indirect Ring State Page | 0 or 4k, see | 62 * | | XE_LRC_FLAG_INDIRECT_RING_STATE | 63 * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset() 64 * | Indirect Context Page | 0 or 4k, see | 65 * | | XE_LRC_FLAG_INDIRECT_CTX | 66 * +============================+=================================+ <- __xe_lrc_wa_bb_offset() 67 * | WA BB Per Ctx | 4k | 68 * +============================+=================================+ <- xe_bo_size(lrc->bo) 69 */ 70 71 static struct xe_device * 72 lrc_to_xe(struct xe_lrc *lrc) 73 { 74 return gt_to_xe(lrc->fence_ctx.gt); 75 } 76 77 static bool 78 gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class) 79 { 80 struct xe_device *xe = gt_to_xe(gt); 81 82 if (XE_GT_WA(gt, 16010904313) && 83 (class == XE_ENGINE_CLASS_RENDER || 84 class == XE_ENGINE_CLASS_COMPUTE)) 85 return true; 86 87 if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev), 88 class, NULL)) 89 return true; 90 91 return false; 92 } 93 94 size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class) 95 { 96 struct xe_device *xe = gt_to_xe(gt); 97 size_t size; 98 99 /* Per-process HW status page (PPHWSP) */ 100 size = LRC_PPHWSP_SIZE; 101 102 /* Engine context image */ 103 switch (class) { 104 case XE_ENGINE_CLASS_RENDER: 105 if (GRAPHICS_VER(xe) >= 20) 106 size += 3 * SZ_4K; 107 else 108 size += 13 * SZ_4K; 109 break; 110 case XE_ENGINE_CLASS_COMPUTE: 111 if (GRAPHICS_VER(xe) >= 20) 112 size += 2 * SZ_4K; 113 else 114 size += 13 * SZ_4K; 115 break; 116 default: 117 WARN(1, "Unknown engine class: %d", class); 118 fallthrough; 119 case XE_ENGINE_CLASS_COPY: 120 case 
XE_ENGINE_CLASS_VIDEO_DECODE: 121 case XE_ENGINE_CLASS_VIDEO_ENHANCE: 122 case XE_ENGINE_CLASS_OTHER: 123 size += 1 * SZ_4K; 124 } 125 126 /* Add indirect ring state page */ 127 if (xe_gt_has_indirect_ring_state(gt)) 128 size += LRC_INDIRECT_RING_STATE_SIZE; 129 130 return size; 131 } 132 133 /* 134 * The per-platform tables are u8-encoded in @data. Decode @data and set the 135 * addresses' offset and commands in @regs. The following encoding is used 136 * for each byte. There are 2 steps: decoding commands and decoding addresses. 137 * 138 * Commands: 139 * [7]: create NOPs - number of NOPs are set in lower bits 140 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set 141 * MI_LRI_FORCE_POSTED 142 * [5:0]: Number of NOPs or registers to set values to in case of 143 * MI_LOAD_REGISTER_IMM 144 * 145 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" 146 * number of registers. They are set by using the REG/REG16 macros: the former 147 * is used for offsets smaller than 0x200 while the latter is for values bigger 148 * than that. Those macros already set all the bits documented below correctly: 149 * 150 * [7]: When a register offset needs more than 6 bits, use additional bytes, to 151 * follow, for the lower bits 152 * [6:0]: Register offset, without considering the engine base. 153 * 154 * This function only tweaks the commands and register offsets. Values are not 155 * filled out. 156 */ 157 static void set_offsets(u32 *regs, 158 const u8 *data, 159 const struct xe_hw_engine *hwe) 160 #define NOP(x) (BIT(7) | (x)) 161 #define LRI(count, flags) ((flags) << 6 | (count) | \ 162 BUILD_BUG_ON_ZERO(count >= BIT(6))) 163 #define POSTED BIT(0) 164 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 165 #define REG16(x) \ 166 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 167 (((x) >> 2) & 0x7f) 168 { 169 const u32 base = hwe->mmio_base; 170 171 while (*data) { 172 u8 count, flags; 173 174 if (*data & BIT(7)) { /* skip */ 175 count = *data++ & ~BIT(7); 176 regs += count; 177 continue; 178 } 179 180 count = *data & 0x3f; 181 flags = *data >> 6; 182 data++; 183 184 *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count); 185 if (flags & POSTED) 186 *regs |= MI_LRI_FORCE_POSTED; 187 *regs |= MI_LRI_LRM_CS_MMIO; 188 regs++; 189 190 xe_gt_assert(hwe->gt, count); 191 do { 192 u32 offset = 0; 193 u8 v; 194 195 do { 196 v = *data++; 197 offset <<= 7; 198 offset |= v & ~BIT(7); 199 } while (v & BIT(7)); 200 201 regs[0] = base + (offset << 2); 202 regs += 2; 203 } while (--count); 204 } 205 206 *regs = MI_BATCH_BUFFER_END | BIT(0); 207 } 208 209 static const u8 gen12_xcs_offsets[] = { 210 NOP(1), 211 LRI(13, POSTED), 212 REG16(0x244), 213 REG(0x034), 214 REG(0x030), 215 REG(0x038), 216 REG(0x03c), 217 REG(0x168), 218 REG(0x140), 219 REG(0x110), 220 REG(0x1c0), 221 REG(0x1c4), 222 REG(0x1c8), 223 REG(0x180), 224 REG16(0x2b4), 225 226 NOP(5), 227 LRI(9, POSTED), 228 REG16(0x3a8), 229 REG16(0x28c), 230 REG16(0x288), 231 REG16(0x284), 232 REG16(0x280), 233 REG16(0x27c), 234 REG16(0x278), 235 REG16(0x274), 236 REG16(0x270), 237 238 0 239 }; 240 241 static const u8 dg2_xcs_offsets[] = { 242 NOP(1), 243 LRI(15, POSTED), 244 REG16(0x244), 245 REG(0x034), 246 REG(0x030), 247 REG(0x038), 248 REG(0x03c), 249 REG(0x168), 250 REG(0x140), 251 REG(0x110), 252 REG(0x1c0), 253 REG(0x1c4), 254 REG(0x1c8), 255 REG(0x180), 256 REG16(0x2b4), 257 REG(0x120), 258 REG(0x124), 259 260 NOP(1), 261 LRI(9, POSTED), 262 REG16(0x3a8), 263 REG16(0x28c), 264 REG16(0x288), 265 
REG16(0x284), 266 REG16(0x280), 267 REG16(0x27c), 268 REG16(0x278), 269 REG16(0x274), 270 REG16(0x270), 271 272 0 273 }; 274 275 static const u8 gen12_rcs_offsets[] = { 276 NOP(1), 277 LRI(13, POSTED), 278 REG16(0x244), 279 REG(0x034), 280 REG(0x030), 281 REG(0x038), 282 REG(0x03c), 283 REG(0x168), 284 REG(0x140), 285 REG(0x110), 286 REG(0x1c0), 287 REG(0x1c4), 288 REG(0x1c8), 289 REG(0x180), 290 REG16(0x2b4), 291 292 NOP(5), 293 LRI(9, POSTED), 294 REG16(0x3a8), 295 REG16(0x28c), 296 REG16(0x288), 297 REG16(0x284), 298 REG16(0x280), 299 REG16(0x27c), 300 REG16(0x278), 301 REG16(0x274), 302 REG16(0x270), 303 304 LRI(3, POSTED), 305 REG(0x1b0), 306 REG16(0x5a8), 307 REG16(0x5ac), 308 309 NOP(6), 310 LRI(1, 0), 311 REG(0x0c8), 312 NOP(3 + 9 + 1), 313 314 LRI(51, POSTED), 315 REG16(0x588), 316 REG16(0x588), 317 REG16(0x588), 318 REG16(0x588), 319 REG16(0x588), 320 REG16(0x588), 321 REG(0x028), 322 REG(0x09c), 323 REG(0x0c0), 324 REG(0x178), 325 REG(0x17c), 326 REG16(0x358), 327 REG(0x170), 328 REG(0x150), 329 REG(0x154), 330 REG(0x158), 331 REG16(0x41c), 332 REG16(0x600), 333 REG16(0x604), 334 REG16(0x608), 335 REG16(0x60c), 336 REG16(0x610), 337 REG16(0x614), 338 REG16(0x618), 339 REG16(0x61c), 340 REG16(0x620), 341 REG16(0x624), 342 REG16(0x628), 343 REG16(0x62c), 344 REG16(0x630), 345 REG16(0x634), 346 REG16(0x638), 347 REG16(0x63c), 348 REG16(0x640), 349 REG16(0x644), 350 REG16(0x648), 351 REG16(0x64c), 352 REG16(0x650), 353 REG16(0x654), 354 REG16(0x658), 355 REG16(0x65c), 356 REG16(0x660), 357 REG16(0x664), 358 REG16(0x668), 359 REG16(0x66c), 360 REG16(0x670), 361 REG16(0x674), 362 REG16(0x678), 363 REG16(0x67c), 364 REG(0x068), 365 REG(0x084), 366 NOP(1), 367 368 0 369 }; 370 371 static const u8 xehp_rcs_offsets[] = { 372 NOP(1), 373 LRI(13, POSTED), 374 REG16(0x244), 375 REG(0x034), 376 REG(0x030), 377 REG(0x038), 378 REG(0x03c), 379 REG(0x168), 380 REG(0x140), 381 REG(0x110), 382 REG(0x1c0), 383 REG(0x1c4), 384 REG(0x1c8), 385 REG(0x180), 386 REG16(0x2b4), 387 388 NOP(5), 389 LRI(9, POSTED), 390 REG16(0x3a8), 391 REG16(0x28c), 392 REG16(0x288), 393 REG16(0x284), 394 REG16(0x280), 395 REG16(0x27c), 396 REG16(0x278), 397 REG16(0x274), 398 REG16(0x270), 399 400 LRI(3, POSTED), 401 REG(0x1b0), 402 REG16(0x5a8), 403 REG16(0x5ac), 404 405 NOP(6), 406 LRI(1, 0), 407 REG(0x0c8), 408 409 0 410 }; 411 412 static const u8 dg2_rcs_offsets[] = { 413 NOP(1), 414 LRI(15, POSTED), 415 REG16(0x244), 416 REG(0x034), 417 REG(0x030), 418 REG(0x038), 419 REG(0x03c), 420 REG(0x168), 421 REG(0x140), 422 REG(0x110), 423 REG(0x1c0), 424 REG(0x1c4), 425 REG(0x1c8), 426 REG(0x180), 427 REG16(0x2b4), 428 REG(0x120), 429 REG(0x124), 430 431 NOP(1), 432 LRI(9, POSTED), 433 REG16(0x3a8), 434 REG16(0x28c), 435 REG16(0x288), 436 REG16(0x284), 437 REG16(0x280), 438 REG16(0x27c), 439 REG16(0x278), 440 REG16(0x274), 441 REG16(0x270), 442 443 LRI(3, POSTED), 444 REG(0x1b0), 445 REG16(0x5a8), 446 REG16(0x5ac), 447 448 NOP(6), 449 LRI(1, 0), 450 REG(0x0c8), 451 452 0 453 }; 454 455 static const u8 mtl_rcs_offsets[] = { 456 NOP(1), 457 LRI(15, POSTED), 458 REG16(0x244), 459 REG(0x034), 460 REG(0x030), 461 REG(0x038), 462 REG(0x03c), 463 REG(0x168), 464 REG(0x140), 465 REG(0x110), 466 REG(0x1c0), 467 REG(0x1c4), 468 REG(0x1c8), 469 REG(0x180), 470 REG16(0x2b4), 471 REG(0x120), 472 REG(0x124), 473 474 NOP(1), 475 LRI(9, POSTED), 476 REG16(0x3a8), 477 REG16(0x28c), 478 REG16(0x288), 479 REG16(0x284), 480 REG16(0x280), 481 REG16(0x27c), 482 REG16(0x278), 483 REG16(0x274), 484 REG16(0x270), 485 486 NOP(2), 487 LRI(2, POSTED), 
488 REG16(0x5a8), 489 REG16(0x5ac), 490 491 NOP(6), 492 LRI(1, 0), 493 REG(0x0c8), 494 495 0 496 }; 497 498 #define XE2_CTX_COMMON \ 499 NOP(1), /* [0x00] */ \ 500 LRI(15, POSTED), /* [0x01] */ \ 501 REG16(0x244), /* [0x02] CTXT_SR_CTL */ \ 502 REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \ 503 REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \ 504 REG(0x038), /* [0x08] RING_BUFFER_START */ \ 505 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \ 506 REG(0x168), /* [0x0c] BB_ADDR_UDW */ \ 507 REG(0x140), /* [0x0e] BB_ADDR */ \ 508 REG(0x110), /* [0x10] BB_STATE */ \ 509 REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \ 510 REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \ 511 REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \ 512 REG(0x180), /* [0x18] CCID */ \ 513 REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \ 514 REG(0x120), /* [0x1c] PRT_BB_STATE */ \ 515 REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \ 516 \ 517 NOP(1), /* [0x20] */ \ 518 LRI(9, POSTED), /* [0x21] */ \ 519 REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \ 520 REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \ 521 REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \ 522 REG16(0x284), /* [0x28] dummy reg */ \ 523 REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \ 524 REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \ 525 REG16(0x278), /* [0x2e] CS_CTX_ASID */ \ 526 REG16(0x274), /* [0x30] PTBP_UDW */ \ 527 REG16(0x270) /* [0x32] PTBP_LDW */ 528 529 static const u8 xe2_rcs_offsets[] = { 530 XE2_CTX_COMMON, 531 532 NOP(2), /* [0x34] */ 533 LRI(2, POSTED), /* [0x36] */ 534 REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */ 535 REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */ 536 537 NOP(6), /* [0x41] */ 538 LRI(1, 0), /* [0x47] */ 539 REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */ 540 541 0 542 }; 543 544 static const u8 xe2_bcs_offsets[] = { 545 XE2_CTX_COMMON, 546 547 NOP(4 + 8 + 1), /* [0x34] */ 548 LRI(2, POSTED), /* [0x41] */ 549 REG16(0x200), /* [0x42] BCS_SWCTRL */ 550 REG16(0x204), /* [0x44] BLIT_CCTL */ 551 552 0 553 }; 554 555 static const u8 xe2_xcs_offsets[] = { 556 XE2_CTX_COMMON, 557 558 0 559 }; 560 561 static const u8 xe2_indirect_ring_state_offsets[] = { 562 NOP(1), /* [0x00] */ 563 LRI(5, POSTED), /* [0x01] */ 564 REG(0x034), /* [0x02] RING_BUFFER_HEAD */ 565 REG(0x030), /* [0x04] RING_BUFFER_TAIL */ 566 REG(0x038), /* [0x06] RING_BUFFER_START */ 567 REG(0x048), /* [0x08] RING_BUFFER_START_UDW */ 568 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ 569 570 NOP(5), /* [0x0c] */ 571 LRI(9, POSTED), /* [0x11] */ 572 REG(0x168), /* [0x12] BB_ADDR_UDW */ 573 REG(0x140), /* [0x14] BB_ADDR */ 574 REG(0x110), /* [0x16] BB_STATE */ 575 REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */ 576 REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */ 577 REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */ 578 REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */ 579 REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */ 580 REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */ 581 582 NOP(12), /* [0x00] */ 583 584 0 585 }; 586 587 #undef REG16 588 #undef REG 589 #undef LRI 590 #undef NOP 591 592 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) 593 { 594 if (class == XE_ENGINE_CLASS_RENDER) { 595 if (GRAPHICS_VER(xe) >= 20) 596 return xe2_rcs_offsets; 597 else if (GRAPHICS_VERx100(xe) >= 1270) 598 return mtl_rcs_offsets; 599 else if (GRAPHICS_VERx100(xe) >= 1255) 600 return dg2_rcs_offsets; 601 else if (GRAPHICS_VERx100(xe) >= 1250) 602 return xehp_rcs_offsets; 603 else 604 return gen12_rcs_offsets; 605 } else if (class == XE_ENGINE_CLASS_COPY) { 606 if (GRAPHICS_VER(xe) >= 20) 607 return 
xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}

static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

/**
 * xe_lrc_reg_size() - Get size of the LRC registers area within queues
 * @xe: the &xe_device struct instance
 *
 * Returns: Size of the LRC registers area for current platform
 */
size_t xe_lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc
*lrc) 727 { 728 /* The seqno is stored in the driver-defined portion of PPHWSP */ 729 return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET; 730 } 731 732 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc) 733 { 734 /* The start seqno is stored in the driver-defined portion of PPHWSP */ 735 return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET; 736 } 737 738 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc) 739 { 740 /* This is stored in the driver-defined portion of PPHWSP */ 741 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET; 742 } 743 744 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc) 745 { 746 /* The parallel is stored in the driver-defined portion of PPHWSP */ 747 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; 748 } 749 750 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc) 751 { 752 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET; 753 } 754 755 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc) 756 { 757 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32); 758 } 759 760 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc) 761 { 762 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32); 763 } 764 765 static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc) 766 { 767 u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - 768 LRC_INDIRECT_RING_STATE_SIZE; 769 770 if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX) 771 offset -= LRC_INDIRECT_CTX_BO_SIZE; 772 773 return offset; 774 } 775 776 static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc) 777 { 778 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE; 779 } 780 781 static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc) 782 { 783 return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE; 784 } 785 786 #define DECL_MAP_ADDR_HELPERS(elem) \ 787 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \ 788 { \ 789 struct iosys_map map = lrc->bo->vmap; \ 790 \ 791 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \ 792 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \ 793 return map; \ 794 } \ 795 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \ 796 { \ 797 return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \ 798 } \ 799 800 DECL_MAP_ADDR_HELPERS(ring) 801 DECL_MAP_ADDR_HELPERS(pphwsp) 802 DECL_MAP_ADDR_HELPERS(seqno) 803 DECL_MAP_ADDR_HELPERS(regs) 804 DECL_MAP_ADDR_HELPERS(start_seqno) 805 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp) 806 DECL_MAP_ADDR_HELPERS(ctx_timestamp) 807 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw) 808 DECL_MAP_ADDR_HELPERS(parallel) 809 DECL_MAP_ADDR_HELPERS(indirect_ring) 810 DECL_MAP_ADDR_HELPERS(engine_id) 811 812 #undef DECL_MAP_ADDR_HELPERS 813 814 /** 815 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address 816 * @lrc: Pointer to the lrc. 817 * 818 * Returns: ctx timestamp GGTT address 819 */ 820 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc) 821 { 822 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 823 } 824 825 /** 826 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address 827 * @lrc: Pointer to the lrc. 828 * 829 * Returns: ctx timestamp udw GGTT address 830 */ 831 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc) 832 { 833 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 834 } 835 836 /** 837 * xe_lrc_ctx_timestamp() - Read ctx timestamp value 838 * @lrc: Pointer to the lrc. 
839 * 840 * Returns: ctx timestamp value 841 */ 842 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) 843 { 844 struct xe_device *xe = lrc_to_xe(lrc); 845 struct iosys_map map; 846 u32 ldw, udw = 0; 847 848 map = __xe_lrc_ctx_timestamp_map(lrc); 849 ldw = xe_map_read32(xe, &map); 850 851 if (xe->info.has_64bit_timestamp) { 852 map = __xe_lrc_ctx_timestamp_udw_map(lrc); 853 udw = xe_map_read32(xe, &map); 854 } 855 856 return (u64)udw << 32 | ldw; 857 } 858 859 /** 860 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address 861 * @lrc: Pointer to the lrc. 862 * 863 * Returns: ctx timestamp job GGTT address 864 */ 865 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc) 866 { 867 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc); 868 } 869 870 /** 871 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value 872 * @lrc: Pointer to the lrc. 873 * 874 * Returns: ctx timestamp job value 875 */ 876 u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc) 877 { 878 struct xe_device *xe = lrc_to_xe(lrc); 879 struct iosys_map map; 880 881 map = __xe_lrc_ctx_job_timestamp_map(lrc); 882 return xe_map_read32(xe, &map); 883 } 884 885 u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc) 886 { 887 return __xe_lrc_pphwsp_ggtt_addr(lrc); 888 } 889 890 u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc) 891 { 892 if (!xe_lrc_has_indirect_ring_state(lrc)) 893 return 0; 894 895 return __xe_lrc_indirect_ring_ggtt_addr(lrc); 896 } 897 898 static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr) 899 { 900 struct xe_device *xe = lrc_to_xe(lrc); 901 struct iosys_map map; 902 903 map = __xe_lrc_indirect_ring_map(lrc); 904 iosys_map_incr(&map, reg_nr * sizeof(u32)); 905 return xe_map_read32(xe, &map); 906 } 907 908 static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc, 909 int reg_nr, u32 val) 910 { 911 struct xe_device *xe = lrc_to_xe(lrc); 912 struct iosys_map map; 913 914 map = __xe_lrc_indirect_ring_map(lrc); 915 iosys_map_incr(&map, reg_nr * sizeof(u32)); 916 xe_map_write32(xe, &map, val); 917 } 918 919 u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr) 920 { 921 struct xe_device *xe = lrc_to_xe(lrc); 922 struct iosys_map map; 923 924 map = __xe_lrc_regs_map(lrc); 925 iosys_map_incr(&map, reg_nr * sizeof(u32)); 926 return xe_map_read32(xe, &map); 927 } 928 929 void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val) 930 { 931 struct xe_device *xe = lrc_to_xe(lrc); 932 struct iosys_map map; 933 934 map = __xe_lrc_regs_map(lrc); 935 iosys_map_incr(&map, reg_nr * sizeof(u32)); 936 xe_map_write32(xe, &map, val); 937 } 938 939 static void *empty_lrc_data(struct xe_hw_engine *hwe) 940 { 941 struct xe_gt *gt = hwe->gt; 942 void *data; 943 u32 *regs; 944 945 data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL); 946 if (!data) 947 return NULL; 948 949 /* 1st page: Per-Process of HW status Page */ 950 regs = data + LRC_PPHWSP_SIZE; 951 set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe); 952 set_context_control(regs, hwe); 953 set_memory_based_intr(regs, hwe); 954 reset_stop_ring(regs, hwe); 955 if (xe_gt_has_indirect_ring_state(gt)) { 956 regs = data + xe_gt_lrc_size(gt, hwe->class) - 957 LRC_INDIRECT_RING_STATE_SIZE; 958 set_offsets(regs, xe2_indirect_ring_state_offsets, hwe); 959 } 960 961 return data; 962 } 963 964 /** 965 * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references in default LRC 966 * of given engine. 
 * @hwe: the &xe_hw_engine struct instance
 */
void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	u32 *regs;

	if (!gt->default_lrc[hwe->class])
		return;

	regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
	set_memory_based_intr(regs, hwe);
}

/**
 * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem interrupt data
 * for given LRC.
 * @lrc: the &xe_lrc struct instance
 * @hwe: the &xe_hw_engine struct instance
 * @regs: scratch buffer to be used as temporary storage
 */
void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
					    u32 *regs)
{
	struct xe_gt *gt = hwe->gt;
	struct iosys_map map;
	size_t regs_len;

	if (!xe_device_uses_memirq(gt_to_xe(gt)))
		return;

	map = __xe_lrc_regs_map(lrc);
	regs_len = xe_lrc_reg_size(gt_to_xe(gt));
	xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
	set_memory_based_intr(regs, hwe);
	xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
}

/*
 * setup_utilization_wa() - Write commands to the WA BB to assist in
 * calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that the WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
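 *
 * Purely as an illustration (not the exact code the driver runs), a
 * query-side reader can combine these two pieces of state roughly as
 * below, where read_engine_ctx_timestamp() is a hypothetical helper
 * standing in for the engine-instance-specific RING_CTX_TIMESTAMP read:
 *
 *	ts = xe_lrc_ctx_timestamp(lrc);
 *	if (ts == CONTEXT_ACTIVE) {
 *		engine_id = engine id the WA BB saved in the PPHWSP;
 *		ts = read_engine_ctx_timestamp(engine_id);
 *	}
 *	utilization = ts;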
1044 */ 1045 #define CONTEXT_ACTIVE 1ULL 1046 static ssize_t setup_utilization_wa(struct xe_lrc *lrc, 1047 struct xe_hw_engine *hwe, 1048 u32 *batch, 1049 size_t max_len) 1050 { 1051 u32 *cmd = batch; 1052 1053 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1054 return -ENOSPC; 1055 1056 *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET; 1057 *cmd++ = ENGINE_ID(0).addr; 1058 *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc); 1059 *cmd++ = 0; 1060 1061 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1062 *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1063 *cmd++ = 0; 1064 *cmd++ = lower_32_bits(CONTEXT_ACTIVE); 1065 1066 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) { 1067 *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1); 1068 *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 1069 *cmd++ = 0; 1070 *cmd++ = upper_32_bits(CONTEXT_ACTIVE); 1071 } 1072 1073 return cmd - batch; 1074 } 1075 1076 static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1077 u32 *batch, size_t max_len) 1078 { 1079 const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 1080 u32 *cmd = batch; 1081 1082 if (!XE_GT_WA(lrc->gt, 16010904313) || 1083 !(hwe->class == XE_ENGINE_CLASS_RENDER || 1084 hwe->class == XE_ENGINE_CLASS_COMPUTE || 1085 hwe->class == XE_ENGINE_CLASS_COPY || 1086 hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE || 1087 hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE)) 1088 return 0; 1089 1090 if (xe_gt_WARN_ON(lrc->gt, max_len < 12)) 1091 return -ENOSPC; 1092 1093 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO | 1094 MI_LRM_ASYNC; 1095 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1096 *cmd++ = ts_addr; 1097 *cmd++ = 0; 1098 1099 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO | 1100 MI_LRM_ASYNC; 1101 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1102 *cmd++ = ts_addr; 1103 *cmd++ = 0; 1104 1105 *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO; 1106 *cmd++ = RING_CTX_TIMESTAMP(0).addr; 1107 *cmd++ = ts_addr; 1108 *cmd++ = 0; 1109 1110 return cmd - batch; 1111 } 1112 1113 static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc, 1114 struct xe_hw_engine *hwe, 1115 u32 *batch, size_t max_len) 1116 { 1117 struct xe_device *xe = gt_to_xe(lrc->gt); 1118 const u32 *user_batch; 1119 u32 *cmd = batch; 1120 u32 count; 1121 1122 count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev), 1123 hwe->class, &user_batch); 1124 if (!count) 1125 return 0; 1126 1127 if (count > max_len) 1128 return -ENOSPC; 1129 1130 /* 1131 * This should be used only for tests and validation. Taint the kernel 1132 * as anything could be submitted directly in context switches 1133 */ 1134 add_taint(TAINT_TEST, LOCKDEP_STILL_OK); 1135 1136 memcpy(cmd, user_batch, count * sizeof(u32)); 1137 cmd += count; 1138 1139 return cmd - batch; 1140 } 1141 1142 static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc, 1143 struct xe_hw_engine *hwe, 1144 u32 *batch, size_t max_len) 1145 { 1146 struct xe_device *xe = gt_to_xe(lrc->gt); 1147 const u32 *user_batch; 1148 u32 *cmd = batch; 1149 u32 count; 1150 1151 count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev), 1152 hwe->class, &user_batch); 1153 if (!count) 1154 return 0; 1155 1156 if (count > max_len) 1157 return -ENOSPC; 1158 1159 /* 1160 * This should be used only for tests and validation. 
Taint the kernel 1161 * as anything could be submitted directly in context switches 1162 */ 1163 add_taint(TAINT_TEST, LOCKDEP_STILL_OK); 1164 1165 memcpy(cmd, user_batch, count * sizeof(u32)); 1166 cmd += count; 1167 1168 return cmd - batch; 1169 } 1170 1171 static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc, 1172 struct xe_hw_engine *hwe, 1173 u32 *batch, size_t max_len) 1174 { 1175 u32 *cmd = batch; 1176 1177 if (!XE_GT_WA(lrc->gt, 18022495364) || 1178 hwe->class != XE_ENGINE_CLASS_RENDER) 1179 return 0; 1180 1181 if (xe_gt_WARN_ON(lrc->gt, max_len < 3)) 1182 return -ENOSPC; 1183 1184 *cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); 1185 *cmd++ = CS_DEBUG_MODE1(0).addr; 1186 *cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE); 1187 1188 return cmd - batch; 1189 } 1190 1191 struct bo_setup { 1192 ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1193 u32 *batch, size_t max_size); 1194 }; 1195 1196 struct bo_setup_state { 1197 /* Input: */ 1198 struct xe_lrc *lrc; 1199 struct xe_hw_engine *hwe; 1200 size_t max_size; 1201 size_t reserve_dw; 1202 unsigned int offset; 1203 const struct bo_setup *funcs; 1204 unsigned int num_funcs; 1205 1206 /* State: */ 1207 u32 *buffer; 1208 u32 *ptr; 1209 unsigned int written; 1210 }; 1211 1212 static int setup_bo(struct bo_setup_state *state) 1213 { 1214 ssize_t remain; 1215 1216 if (state->lrc->bo->vmap.is_iomem) { 1217 if (!state->buffer) 1218 return -ENOMEM; 1219 state->ptr = state->buffer; 1220 } else { 1221 state->ptr = state->lrc->bo->vmap.vaddr + state->offset; 1222 } 1223 1224 remain = state->max_size / sizeof(u32); 1225 1226 for (size_t i = 0; i < state->num_funcs; i++) { 1227 ssize_t len = state->funcs[i].setup(state->lrc, state->hwe, 1228 state->ptr, remain); 1229 1230 remain -= len; 1231 1232 /* 1233 * Caller has asked for at least reserve_dw to remain unused. 1234 */ 1235 if (len < 0 || 1236 xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw)) 1237 goto fail; 1238 1239 state->ptr += len; 1240 state->written += len; 1241 } 1242 1243 return 0; 1244 1245 fail: 1246 return -ENOSPC; 1247 } 1248 1249 static void finish_bo(struct bo_setup_state *state) 1250 { 1251 if (!state->buffer) 1252 return; 1253 1254 xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap, 1255 state->offset, state->buffer, 1256 state->written * sizeof(u32)); 1257 } 1258 1259 /** 1260 * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks. 
1261 * @lrc: the &xe_lrc struct instance 1262 * @hwe: the &xe_hw_engine struct instance 1263 * @scratch: preallocated scratch buffer for temporary storage 1264 * Return: 0 on success, negative error code on failure 1265 */ 1266 int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch) 1267 { 1268 static const struct bo_setup funcs[] = { 1269 { .setup = setup_timestamp_wa }, 1270 { .setup = setup_invalidate_state_cache_wa }, 1271 { .setup = setup_utilization_wa }, 1272 { .setup = setup_configfs_post_ctx_restore_bb }, 1273 }; 1274 struct bo_setup_state state = { 1275 .lrc = lrc, 1276 .hwe = hwe, 1277 .max_size = LRC_WA_BB_SIZE, 1278 .buffer = scratch, 1279 .reserve_dw = 1, 1280 .offset = __xe_lrc_wa_bb_offset(lrc), 1281 .funcs = funcs, 1282 .num_funcs = ARRAY_SIZE(funcs), 1283 }; 1284 int ret; 1285 1286 ret = setup_bo(&state); 1287 if (ret) 1288 return ret; 1289 1290 *state.ptr++ = MI_BATCH_BUFFER_END; 1291 state.written++; 1292 1293 finish_bo(&state); 1294 1295 xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, 1296 xe_bo_ggtt_addr(lrc->bo) + state.offset + 1); 1297 1298 return 0; 1299 } 1300 1301 static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe) 1302 { 1303 u32 *buf = NULL; 1304 int ret; 1305 1306 if (lrc->bo->vmap.is_iomem) 1307 buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL); 1308 1309 ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf); 1310 1311 kfree(buf); 1312 1313 return ret; 1314 } 1315 1316 static int 1317 setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe) 1318 { 1319 static const struct bo_setup rcs_funcs[] = { 1320 { .setup = setup_timestamp_wa }, 1321 { .setup = setup_configfs_mid_ctx_restore_bb }, 1322 }; 1323 static const struct bo_setup xcs_funcs[] = { 1324 { .setup = setup_configfs_mid_ctx_restore_bb }, 1325 }; 1326 struct bo_setup_state state = { 1327 .lrc = lrc, 1328 .hwe = hwe, 1329 .max_size = (63 * 64) /* max 63 cachelines */, 1330 .buffer = NULL, 1331 .offset = __xe_lrc_indirect_ctx_offset(lrc), 1332 }; 1333 int ret; 1334 1335 if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)) 1336 return 0; 1337 1338 if (hwe->class == XE_ENGINE_CLASS_RENDER || 1339 hwe->class == XE_ENGINE_CLASS_COMPUTE) { 1340 state.funcs = rcs_funcs; 1341 state.num_funcs = ARRAY_SIZE(rcs_funcs); 1342 } else { 1343 state.funcs = xcs_funcs; 1344 state.num_funcs = ARRAY_SIZE(xcs_funcs); 1345 } 1346 1347 if (xe_gt_WARN_ON(lrc->gt, !state.funcs)) 1348 return 0; 1349 1350 if (lrc->bo->vmap.is_iomem) 1351 state.buffer = kmalloc(state.max_size, GFP_KERNEL); 1352 1353 ret = setup_bo(&state); 1354 if (ret) { 1355 kfree(state.buffer); 1356 return ret; 1357 } 1358 1359 /* 1360 * Align to 64B cacheline so there's no garbage at the end for CS to 1361 * execute: size for indirect ctx must be a multiple of 64. 1362 */ 1363 while (state.written & 0xf) { 1364 *state.ptr++ = MI_NOOP; 1365 state.written++; 1366 } 1367 1368 finish_bo(&state); 1369 kfree(state.buffer); 1370 1371 /* 1372 * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it 1373 * varies per engine class, but the default is good enough 1374 */ 1375 xe_lrc_write_ctx_reg(lrc, 1376 CTX_CS_INDIRECT_CTX, 1377 (xe_bo_ggtt_addr(lrc->bo) + state.offset) | 1378 /* Size in CLs. 
*/ 1379 (state.written * sizeof(u32) / 64)); 1380 1381 return 0; 1382 } 1383 1384 static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, 1385 struct xe_vm *vm, u32 ring_size, u16 msix_vec, 1386 u32 init_flags) 1387 { 1388 struct xe_gt *gt = hwe->gt; 1389 const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class); 1390 u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE; 1391 struct xe_tile *tile = gt_to_tile(gt); 1392 struct xe_device *xe = gt_to_xe(gt); 1393 struct iosys_map map; 1394 u32 arb_enable; 1395 u32 bo_flags; 1396 int err; 1397 1398 kref_init(&lrc->refcount); 1399 lrc->gt = gt; 1400 lrc->size = lrc_size; 1401 lrc->flags = 0; 1402 lrc->ring.size = ring_size; 1403 lrc->ring.tail = 0; 1404 1405 if (gt_engine_needs_indirect_ctx(gt, hwe->class)) { 1406 lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX; 1407 bo_size += LRC_INDIRECT_CTX_BO_SIZE; 1408 } 1409 1410 if (xe_gt_has_indirect_ring_state(gt)) 1411 lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE; 1412 1413 bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT | 1414 XE_BO_FLAG_GGTT_INVALIDATE; 1415 if (vm && vm->xef) /* userspace */ 1416 bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE; 1417 1418 lrc->bo = xe_bo_create_pin_map_novm(xe, tile, 1419 bo_size, 1420 ttm_bo_type_kernel, 1421 bo_flags, false); 1422 if (IS_ERR(lrc->bo)) 1423 return PTR_ERR(lrc->bo); 1424 1425 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt, 1426 hwe->fence_irq, hwe->name); 1427 1428 /* 1429 * Init Per-Process of HW status Page, LRC / context state to known 1430 * values. If there's already a primed default_lrc, just copy it, otherwise 1431 * it's the early submission to record the lrc: build a new empty one from 1432 * scratch. 1433 */ 1434 map = __xe_lrc_pphwsp_map(lrc); 1435 if (gt->default_lrc[hwe->class]) { 1436 xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */ 1437 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE, 1438 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE, 1439 lrc_size - LRC_PPHWSP_SIZE); 1440 } else { 1441 void *init_data = empty_lrc_data(hwe); 1442 1443 if (!init_data) { 1444 err = -ENOMEM; 1445 goto err_lrc_finish; 1446 } 1447 1448 xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size); 1449 kfree(init_data); 1450 } 1451 1452 if (vm) { 1453 xe_lrc_set_ppgtt(lrc, vm); 1454 1455 if (vm->xef) 1456 xe_drm_client_add_bo(vm->xef->client, lrc->bo); 1457 } 1458 1459 if (xe_device_has_msix(xe)) { 1460 xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR, 1461 xe_memirq_status_ptr(&tile->memirq, hwe)); 1462 xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR, 1463 xe_memirq_source_ptr(&tile->memirq, hwe)); 1464 xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec); 1465 } 1466 1467 if (xe_gt_has_indirect_ring_state(gt)) { 1468 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE, 1469 __xe_lrc_indirect_ring_ggtt_addr(lrc)); 1470 1471 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START, 1472 __xe_lrc_ring_ggtt_addr(lrc)); 1473 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0); 1474 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0); 1475 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail); 1476 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL, 1477 RING_CTL_SIZE(lrc->ring.size) | RING_VALID); 1478 } else { 1479 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc)); 1480 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0); 1481 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail); 1482 xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL, 1483 RING_CTL_SIZE(lrc->ring.size) | 
RING_VALID); 1484 } 1485 1486 if (init_flags & XE_LRC_CREATE_RUNALONE) 1487 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL, 1488 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1489 _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE)); 1490 1491 if (init_flags & XE_LRC_CREATE_PXP) 1492 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL, 1493 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1494 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE)); 1495 1496 lrc->ctx_timestamp = 0; 1497 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0); 1498 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 1499 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0); 1500 1501 if (xe->info.has_asid && vm) 1502 xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid); 1503 1504 lrc->desc = LRC_VALID; 1505 lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT); 1506 /* TODO: Priority */ 1507 1508 /* While this appears to have something about privileged batches or 1509 * some such, it really just means PPGTT mode. 1510 */ 1511 if (vm) 1512 lrc->desc |= LRC_PRIVILEGE; 1513 1514 if (GRAPHICS_VERx100(xe) < 1250) { 1515 lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance); 1516 lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class); 1517 } 1518 1519 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1520 xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable)); 1521 1522 map = __xe_lrc_seqno_map(lrc); 1523 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1524 1525 map = __xe_lrc_start_seqno_map(lrc); 1526 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1527 1528 err = setup_wa_bb(lrc, hwe); 1529 if (err) 1530 goto err_lrc_finish; 1531 1532 err = setup_indirect_ctx(lrc, hwe); 1533 if (err) 1534 goto err_lrc_finish; 1535 1536 return 0; 1537 1538 err_lrc_finish: 1539 xe_lrc_finish(lrc); 1540 return err; 1541 } 1542 1543 /** 1544 * xe_lrc_create - Create a LRC 1545 * @hwe: Hardware Engine 1546 * @vm: The VM (address space) 1547 * @ring_size: LRC ring size 1548 * @msix_vec: MSI-X interrupt vector (for platforms that support it) 1549 * @flags: LRC initialization flags 1550 * 1551 * Allocate and initialize the Logical Ring Context (LRC). 1552 * 1553 * Return pointer to created LRC upon success and an error pointer 1554 * upon failure. 1555 */ 1556 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, 1557 u32 ring_size, u16 msix_vec, u32 flags) 1558 { 1559 struct xe_lrc *lrc; 1560 int err; 1561 1562 lrc = kzalloc(sizeof(*lrc), GFP_KERNEL); 1563 if (!lrc) 1564 return ERR_PTR(-ENOMEM); 1565 1566 err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags); 1567 if (err) { 1568 kfree(lrc); 1569 return ERR_PTR(err); 1570 } 1571 1572 return lrc; 1573 } 1574 1575 /** 1576 * xe_lrc_destroy - Destroy the LRC 1577 * @ref: reference to LRC 1578 * 1579 * Called when ref == 0, release resources held by the Logical Ring Context 1580 * (LRC) and free the LRC memory. 1581 */ 1582 void xe_lrc_destroy(struct kref *ref) 1583 { 1584 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount); 1585 1586 xe_lrc_finish(lrc); 1587 kfree(lrc); 1588 } 1589 1590 /** 1591 * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC. 
1592 * @lrc: the &xe_lrc struct instance 1593 */ 1594 void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc) 1595 { 1596 if (xe_lrc_has_indirect_ring_state(lrc)) { 1597 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE, 1598 __xe_lrc_indirect_ring_ggtt_addr(lrc)); 1599 1600 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START, 1601 __xe_lrc_ring_ggtt_addr(lrc)); 1602 } else { 1603 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc)); 1604 } 1605 } 1606 1607 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail) 1608 { 1609 if (xe_lrc_has_indirect_ring_state(lrc)) 1610 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail); 1611 else 1612 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail); 1613 } 1614 1615 u32 xe_lrc_ring_tail(struct xe_lrc *lrc) 1616 { 1617 if (xe_lrc_has_indirect_ring_state(lrc)) 1618 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR; 1619 else 1620 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR; 1621 } 1622 1623 static u32 xe_lrc_ring_start(struct xe_lrc *lrc) 1624 { 1625 if (xe_lrc_has_indirect_ring_state(lrc)) 1626 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START); 1627 else 1628 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START); 1629 } 1630 1631 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head) 1632 { 1633 if (xe_lrc_has_indirect_ring_state(lrc)) 1634 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head); 1635 else 1636 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head); 1637 } 1638 1639 u32 xe_lrc_ring_head(struct xe_lrc *lrc) 1640 { 1641 if (xe_lrc_has_indirect_ring_state(lrc)) 1642 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR; 1643 else 1644 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR; 1645 } 1646 1647 u32 xe_lrc_ring_space(struct xe_lrc *lrc) 1648 { 1649 const u32 head = xe_lrc_ring_head(lrc); 1650 const u32 tail = lrc->ring.tail; 1651 const u32 size = lrc->ring.size; 1652 1653 return ((head - tail - 1) & (size - 1)) + 1; 1654 } 1655 1656 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring, 1657 const void *data, size_t size) 1658 { 1659 struct xe_device *xe = lrc_to_xe(lrc); 1660 1661 iosys_map_incr(&ring, lrc->ring.tail); 1662 xe_map_memcpy_to(xe, &ring, 0, data, size); 1663 lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1); 1664 } 1665 1666 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size) 1667 { 1668 struct xe_device *xe = lrc_to_xe(lrc); 1669 struct iosys_map ring; 1670 u32 rhs; 1671 size_t aligned_size; 1672 1673 xe_assert(xe, IS_ALIGNED(size, 4)); 1674 aligned_size = ALIGN(size, 8); 1675 1676 ring = __xe_lrc_ring_map(lrc); 1677 1678 xe_assert(xe, lrc->ring.tail < lrc->ring.size); 1679 rhs = lrc->ring.size - lrc->ring.tail; 1680 if (size > rhs) { 1681 __xe_lrc_write_ring(lrc, ring, data, rhs); 1682 __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs); 1683 } else { 1684 __xe_lrc_write_ring(lrc, ring, data, size); 1685 } 1686 1687 if (aligned_size > size) { 1688 u32 noop = MI_NOOP; 1689 1690 __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop)); 1691 } 1692 } 1693 1694 u64 xe_lrc_descriptor(struct xe_lrc *lrc) 1695 { 1696 return lrc->desc | xe_lrc_ggtt_addr(lrc); 1697 } 1698 1699 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc) 1700 { 1701 return __xe_lrc_seqno_ggtt_addr(lrc); 1702 } 1703 1704 /** 1705 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence. 1706 * 1707 * Allocate but don't initialize an lrc seqno fence. 
1708 * 1709 * Return: Pointer to the allocated fence or 1710 * negative error pointer on error. 1711 */ 1712 struct dma_fence *xe_lrc_alloc_seqno_fence(void) 1713 { 1714 return xe_hw_fence_alloc(); 1715 } 1716 1717 /** 1718 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence. 1719 * @fence: Pointer to the fence to free. 1720 * 1721 * Frees an lrc seqno fence that hasn't yet been 1722 * initialized. 1723 */ 1724 void xe_lrc_free_seqno_fence(struct dma_fence *fence) 1725 { 1726 xe_hw_fence_free(fence); 1727 } 1728 1729 /** 1730 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence. 1731 * @lrc: Pointer to the lrc. 1732 * @fence: Pointer to the fence to initialize. 1733 * 1734 * Initializes a pre-allocated lrc seqno fence. 1735 * After initialization, the fence is subject to normal 1736 * dma-fence refcounting. 1737 */ 1738 void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence) 1739 { 1740 xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc)); 1741 } 1742 1743 s32 xe_lrc_seqno(struct xe_lrc *lrc) 1744 { 1745 struct iosys_map map = __xe_lrc_seqno_map(lrc); 1746 1747 return xe_map_read32(lrc_to_xe(lrc), &map); 1748 } 1749 1750 s32 xe_lrc_start_seqno(struct xe_lrc *lrc) 1751 { 1752 struct iosys_map map = __xe_lrc_start_seqno_map(lrc); 1753 1754 return xe_map_read32(lrc_to_xe(lrc), &map); 1755 } 1756 1757 u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc) 1758 { 1759 return __xe_lrc_start_seqno_ggtt_addr(lrc); 1760 } 1761 1762 u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc) 1763 { 1764 return __xe_lrc_parallel_ggtt_addr(lrc); 1765 } 1766 1767 struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc) 1768 { 1769 return __xe_lrc_parallel_map(lrc); 1770 } 1771 1772 /** 1773 * xe_lrc_engine_id() - Read engine id value 1774 * @lrc: Pointer to the lrc. 
1775 * 1776 * Returns: context id value 1777 */ 1778 static u32 xe_lrc_engine_id(struct xe_lrc *lrc) 1779 { 1780 struct xe_device *xe = lrc_to_xe(lrc); 1781 struct iosys_map map; 1782 1783 map = __xe_lrc_engine_id_map(lrc); 1784 return xe_map_read32(xe, &map); 1785 } 1786 1787 static int instr_dw(u32 cmd_header) 1788 { 1789 /* GFXPIPE "SINGLE_DW" opcodes are a single dword */ 1790 if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) == 1791 GFXPIPE_SINGLE_DW_CMD(0, 0)) 1792 return 1; 1793 1794 /* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */ 1795 if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST) 1796 return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2; 1797 1798 /* Most instructions have the # of dwords (minus 2) in 7:0 */ 1799 return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2; 1800 } 1801 1802 static int dump_mi_command(struct drm_printer *p, 1803 struct xe_gt *gt, 1804 u32 *dw, 1805 int remaining_dw) 1806 { 1807 u32 inst_header = *dw; 1808 u32 numdw = instr_dw(inst_header); 1809 u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header); 1810 int num_noop; 1811 1812 /* First check for commands that don't have/use a '# DW' field */ 1813 switch (inst_header & MI_OPCODE) { 1814 case MI_NOOP: 1815 num_noop = 1; 1816 while (num_noop < remaining_dw && 1817 (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP) 1818 num_noop++; 1819 drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop); 1820 return num_noop; 1821 1822 case MI_TOPOLOGY_FILTER: 1823 drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header); 1824 return 1; 1825 1826 case MI_BATCH_BUFFER_END: 1827 drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header); 1828 /* Return 'remaining_dw' to consume the rest of the LRC */ 1829 return remaining_dw; 1830 } 1831 1832 /* 1833 * Any remaining commands include a # of dwords. We should make sure 1834 * it doesn't exceed the remaining size of the LRC. 1835 */ 1836 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1837 numdw = remaining_dw; 1838 1839 switch (inst_header & MI_OPCODE) { 1840 case MI_LOAD_REGISTER_IMM: 1841 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n", 1842 inst_header, (numdw - 1) / 2); 1843 for (int i = 1; i < numdw; i += 2) 1844 drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]); 1845 return numdw; 1846 1847 case MI_LOAD_REGISTER_MEM & MI_OPCODE: 1848 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n", 1849 inst_header, 1850 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "", 1851 dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : ""); 1852 if (numdw == 4) 1853 drm_printf(p, " - %#6x = %#010llx\n", 1854 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2]))); 1855 else 1856 drm_printf(p, " - %*ph (%s)\n", 1857 (int)sizeof(u32) * (numdw - 1), dw + 1, 1858 numdw < 4 ? "truncated" : "malformed"); 1859 return numdw; 1860 1861 case MI_FORCE_WAKEUP: 1862 drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header); 1863 return numdw; 1864 1865 default: 1866 drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n", 1867 inst_header, opcode, numdw); 1868 return numdw; 1869 } 1870 } 1871 1872 static int dump_gfxpipe_command(struct drm_printer *p, 1873 struct xe_gt *gt, 1874 u32 *dw, 1875 int remaining_dw) 1876 { 1877 u32 numdw = instr_dw(*dw); 1878 u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw); 1879 u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw); 1880 u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw); 1881 1882 /* 1883 * Make sure we haven't mis-parsed a number of dwords that exceeds the 1884 * remaining size of the LRC. 
1885 */ 1886 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1887 numdw = remaining_dw; 1888 1889 switch (*dw & GFXPIPE_MATCH_MASK) { 1890 #define MATCH(cmd) \ 1891 case cmd: \ 1892 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ 1893 return numdw 1894 #define MATCH3D(cmd) \ 1895 case CMD_##cmd: \ 1896 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ 1897 return numdw 1898 1899 MATCH(STATE_BASE_ADDRESS); 1900 MATCH(STATE_SIP); 1901 MATCH(GPGPU_CSR_BASE_ADDRESS); 1902 MATCH(STATE_COMPUTE_MODE); 1903 MATCH3D(3DSTATE_BTD); 1904 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS); 1905 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS); 1906 1907 MATCH3D(3DSTATE_VF_STATISTICS); 1908 1909 MATCH(PIPELINE_SELECT); 1910 1911 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST); 1912 MATCH3D(3DSTATE_CLEAR_PARAMS); 1913 MATCH3D(3DSTATE_DEPTH_BUFFER); 1914 MATCH3D(3DSTATE_STENCIL_BUFFER); 1915 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER); 1916 MATCH3D(3DSTATE_VERTEX_BUFFERS); 1917 MATCH3D(3DSTATE_VERTEX_ELEMENTS); 1918 MATCH3D(3DSTATE_INDEX_BUFFER); 1919 MATCH3D(3DSTATE_VF); 1920 MATCH3D(3DSTATE_MULTISAMPLE); 1921 MATCH3D(3DSTATE_CC_STATE_POINTERS); 1922 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS); 1923 MATCH3D(3DSTATE_VS); 1924 MATCH3D(3DSTATE_GS); 1925 MATCH3D(3DSTATE_CLIP); 1926 MATCH3D(3DSTATE_SF); 1927 MATCH3D(3DSTATE_WM); 1928 MATCH3D(3DSTATE_CONSTANT_VS); 1929 MATCH3D(3DSTATE_CONSTANT_GS); 1930 MATCH3D(3DSTATE_CONSTANT_PS); 1931 MATCH3D(3DSTATE_SAMPLE_MASK); 1932 MATCH3D(3DSTATE_CONSTANT_HS); 1933 MATCH3D(3DSTATE_CONSTANT_DS); 1934 MATCH3D(3DSTATE_HS); 1935 MATCH3D(3DSTATE_TE); 1936 MATCH3D(3DSTATE_DS); 1937 MATCH3D(3DSTATE_STREAMOUT); 1938 MATCH3D(3DSTATE_SBE); 1939 MATCH3D(3DSTATE_PS); 1940 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP); 1941 MATCH3D(3DSTATE_CPS_POINTERS); 1942 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC); 1943 MATCH3D(3DSTATE_BLEND_STATE_POINTERS); 1944 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS); 1945 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS); 1946 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS); 1947 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS); 1948 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS); 1949 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS); 1950 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS); 1951 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS); 1952 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS); 1953 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS); 1954 MATCH3D(3DSTATE_VF_INSTANCING); 1955 MATCH3D(3DSTATE_VF_SGVS); 1956 MATCH3D(3DSTATE_VF_TOPOLOGY); 1957 MATCH3D(3DSTATE_WM_CHROMAKEY); 1958 MATCH3D(3DSTATE_PS_BLEND); 1959 MATCH3D(3DSTATE_WM_DEPTH_STENCIL); 1960 MATCH3D(3DSTATE_PS_EXTRA); 1961 MATCH3D(3DSTATE_RASTER); 1962 MATCH3D(3DSTATE_SBE_SWIZ); 1963 MATCH3D(3DSTATE_WM_HZ_OP); 1964 MATCH3D(3DSTATE_VF_COMPONENT_PACKING); 1965 MATCH3D(3DSTATE_VF_SGVS_2); 1966 MATCH3D(3DSTATE_VFG); 1967 MATCH3D(3DSTATE_URB_ALLOC_VS); 1968 MATCH3D(3DSTATE_URB_ALLOC_HS); 1969 MATCH3D(3DSTATE_URB_ALLOC_DS); 1970 MATCH3D(3DSTATE_URB_ALLOC_GS); 1971 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0); 1972 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1); 1973 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2); 1974 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3); 1975 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION); 1976 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO); 1977 MATCH3D(3DSTATE_AMFS); 1978 MATCH3D(3DSTATE_DEPTH_BOUNDS); 1979 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS); 1980 MATCH3D(3DSTATE_CONSTANT_TS_POINTER); 1981 MATCH3D(3DSTATE_MESH_CONTROL); 1982 MATCH3D(3DSTATE_MESH_DISTRIB); 1983 MATCH3D(3DSTATE_TASK_REDISTRIB); 1984 MATCH3D(3DSTATE_MESH_SHADER); 1985 MATCH3D(3DSTATE_MESH_SHADER_DATA); 1986 
MATCH3D(3DSTATE_TASK_CONTROL); 1987 MATCH3D(3DSTATE_TASK_SHADER); 1988 MATCH3D(3DSTATE_TASK_SHADER_DATA); 1989 MATCH3D(3DSTATE_URB_ALLOC_MESH); 1990 MATCH3D(3DSTATE_URB_ALLOC_TASK); 1991 MATCH3D(3DSTATE_CLIP_MESH); 1992 MATCH3D(3DSTATE_SBE_MESH); 1993 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER); 1994 MATCH3D(3DSTATE_COARSE_PIXEL); 1995 1996 MATCH3D(3DSTATE_DRAWING_RECTANGLE); 1997 MATCH3D(3DSTATE_CHROMA_KEY); 1998 MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET); 1999 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN); 2000 MATCH3D(3DSTATE_LINE_STIPPLE); 2001 MATCH3D(3DSTATE_AA_LINE_PARAMETERS); 2002 MATCH3D(3DSTATE_MONOFILTER_SIZE); 2003 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS); 2004 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS); 2005 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS); 2006 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS); 2007 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS); 2008 MATCH3D(3DSTATE_SO_DECL_LIST); 2009 MATCH3D(3DSTATE_SO_BUFFER); 2010 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC); 2011 MATCH3D(3DSTATE_SAMPLE_PATTERN); 2012 MATCH3D(3DSTATE_3D_MODE); 2013 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE); 2014 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS); 2015 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO); 2016 2017 default: 2018 drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n", 2019 *dw, pipeline, opcode, subopcode, numdw); 2020 return numdw; 2021 } 2022 } 2023 2024 static int dump_gfx_state_command(struct drm_printer *p, 2025 struct xe_gt *gt, 2026 u32 *dw, 2027 int remaining_dw) 2028 { 2029 u32 numdw = instr_dw(*dw); 2030 u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw); 2031 2032 /* 2033 * Make sure we haven't mis-parsed a number of dwords that exceeds the 2034 * remaining size of the LRC. 2035 */ 2036 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 2037 numdw = remaining_dw; 2038 2039 switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) { 2040 MATCH(STATE_WRITE_INLINE); 2041 2042 default: 2043 drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n", 2044 *dw, opcode, numdw); 2045 return numdw; 2046 } 2047 } 2048 2049 void xe_lrc_dump_default(struct drm_printer *p, 2050 struct xe_gt *gt, 2051 enum xe_engine_class hwe_class) 2052 { 2053 u32 *dw; 2054 int remaining_dw, num_dw; 2055 2056 if (!gt->default_lrc[hwe_class]) { 2057 drm_printf(p, "No default LRC for class %d\n", hwe_class); 2058 return; 2059 } 2060 2061 /* 2062 * Skip the beginning of the LRC since it contains the per-process 2063 * hardware status page. 
void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
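/**
 * xe_lrc_emit_hwe_state_instructions() - Emit non-register state for an LRC
 * @q: exec queue whose engine class determines which state to emit
 * @cs: location to emit the instructions at
 *
 * Emit the GFXPIPE (SVG) instruction headers required by Wa_14019789679 so
 * that the default context image contains real instruction headers rather
 * than NOPs. Only the headers are written here; the instruction payload
 * dwords are left untouched.
 *
 * Return: location immediately after the emitted instructions.
 */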
u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers. Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return cs;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		*cs = instr;
		if (!is_single_dw)
			*cs |= (num_dw - 2);

		cs += num_dw;
	}

	return cs;
}
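/**
 * xe_lrc_snapshot_capture() - Capture a snapshot of LRC state
 * @lrc: the LRC to snapshot
 *
 * Record the immediately readable LRC state (descriptor, ring head/tail/start,
 * seqnos and timestamps) and take a reference on the LRC BO. The snapshot is
 * allocated with GFP_NOWAIT so this capture does not sleep; copying out the
 * BO contents is deferred to xe_lrc_snapshot_capture_delayed().
 *
 * Return: pointer to the new snapshot, or NULL on allocation failure.
 */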
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}
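/**
 * xe_lrc_snapshot_print() - Print a captured LRC snapshot
 * @snapshot: snapshot to print, may be NULL
 * @p: printer to write the output to
 *
 * Print the captured descriptor, ring pointers, seqnos and timestamps. If the
 * LRC contents were copied by xe_lrc_snapshot_capture_delayed(), also dump the
 * PPHWSP and context image as ascii85-encoded data.
 */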
void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}

static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;
	u64 val;

	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
					  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		val = xe_mmio_read32(&hwe->gt->mmio,
				     RING_CTX_TIMESTAMP(hwe->mmio_base));

	*reg_ctx_ts = val;

	return 0;
}

/**
 * xe_lrc_update_timestamp() - Update ctx timestamp
 * @lrc: Pointer to the lrc.
 * @old_ts: Old timestamp value
 *
 * Populate @old_ts with the current saved ctx timestamp, read the new ctx
 * timestamp and update the saved value. With support for active contexts, the
 * calculation may be slightly racy, so follow a read-again logic to ensure
 * that the context is still active before returning the right timestamp.
 *
 * Returns: New ctx timestamp value
 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
	u64 lrc_ts, reg_ts;
	u32 engine_id;

	*old_ts = lrc->ctx_timestamp;

	lrc_ts = xe_lrc_ctx_timestamp(lrc);
	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
		lrc->ctx_timestamp = lrc_ts;
		goto done;
	}

	if (lrc_ts == CONTEXT_ACTIVE) {
		engine_id = xe_lrc_engine_id(lrc);
		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
			lrc->ctx_timestamp = reg_ts;

		/* read lrc again to ensure context is still active */
		lrc_ts = xe_lrc_ctx_timestamp(lrc);
	}

	/*
	 * If context switched out, just use the lrc_ts. Note that this needs
	 * to be a separate if condition.
	 */
	if (lrc_ts != CONTEXT_ACTIVE)
		lrc->ctx_timestamp = lrc_ts;

done:
	trace_xe_lrc_update_timestamp(lrc, *old_ts);

	return lrc->ctx_timestamp;
}

/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}