// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size;

	/* Per-process HW status page (PPHWSP) */
	size = LRC_PPHWSP_SIZE;

	/* Engine context image */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VER(xe) >= 20)
			size += 2 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * commands and register offsets in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is given in the lower bits
 * [6]: when creating an MI_LOAD_REGISTER_IMM command, allows setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to load in the case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, "count"
 * registers at a time. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for offsets of
 * 0x200 and above. Those macros already set all the bits documented below
 * correctly:
 *
 * [7]: set when the register offset does not fit in a single byte; the
 *      remaining lower bits follow in additional bytes
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
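 *
 * As a purely illustrative example (not taken from any of the tables below),
 * the byte sequence produced by LRI(2, POSTED), REG(0x034), REG16(0x2b4) is
 * { 0x42, 0x0d, 0x81, 0x2d }: the decoder emits one MI_LOAD_REGISTER_IMM
 * header for two force-posted registers, then writes the engine-relative
 * offsets base + 0x34 and base + 0x2b4 into every other dword (regs[1] and
 * regs[3]), leaving the value dwords untouched for later.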
112 */ 113 static void set_offsets(u32 *regs, 114 const u8 *data, 115 const struct xe_hw_engine *hwe) 116 #define NOP(x) (BIT(7) | (x)) 117 #define LRI(count, flags) ((flags) << 6 | (count) | \ 118 BUILD_BUG_ON_ZERO(count >= BIT(6))) 119 #define POSTED BIT(0) 120 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 121 #define REG16(x) \ 122 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 123 (((x) >> 2) & 0x7f) 124 { 125 const u32 base = hwe->mmio_base; 126 127 while (*data) { 128 u8 count, flags; 129 130 if (*data & BIT(7)) { /* skip */ 131 count = *data++ & ~BIT(7); 132 regs += count; 133 continue; 134 } 135 136 count = *data & 0x3f; 137 flags = *data >> 6; 138 data++; 139 140 *regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count); 141 if (flags & POSTED) 142 *regs |= MI_LRI_FORCE_POSTED; 143 *regs |= MI_LRI_LRM_CS_MMIO; 144 regs++; 145 146 xe_gt_assert(hwe->gt, count); 147 do { 148 u32 offset = 0; 149 u8 v; 150 151 do { 152 v = *data++; 153 offset <<= 7; 154 offset |= v & ~BIT(7); 155 } while (v & BIT(7)); 156 157 regs[0] = base + (offset << 2); 158 regs += 2; 159 } while (--count); 160 } 161 162 *regs = MI_BATCH_BUFFER_END | BIT(0); 163 } 164 165 static const u8 gen12_xcs_offsets[] = { 166 NOP(1), 167 LRI(13, POSTED), 168 REG16(0x244), 169 REG(0x034), 170 REG(0x030), 171 REG(0x038), 172 REG(0x03c), 173 REG(0x168), 174 REG(0x140), 175 REG(0x110), 176 REG(0x1c0), 177 REG(0x1c4), 178 REG(0x1c8), 179 REG(0x180), 180 REG16(0x2b4), 181 182 NOP(5), 183 LRI(9, POSTED), 184 REG16(0x3a8), 185 REG16(0x28c), 186 REG16(0x288), 187 REG16(0x284), 188 REG16(0x280), 189 REG16(0x27c), 190 REG16(0x278), 191 REG16(0x274), 192 REG16(0x270), 193 194 0 195 }; 196 197 static const u8 dg2_xcs_offsets[] = { 198 NOP(1), 199 LRI(15, POSTED), 200 REG16(0x244), 201 REG(0x034), 202 REG(0x030), 203 REG(0x038), 204 REG(0x03c), 205 REG(0x168), 206 REG(0x140), 207 REG(0x110), 208 REG(0x1c0), 209 REG(0x1c4), 210 REG(0x1c8), 211 REG(0x180), 212 REG16(0x2b4), 213 REG(0x120), 214 REG(0x124), 215 216 NOP(1), 217 LRI(9, POSTED), 218 REG16(0x3a8), 219 REG16(0x28c), 220 REG16(0x288), 221 REG16(0x284), 222 REG16(0x280), 223 REG16(0x27c), 224 REG16(0x278), 225 REG16(0x274), 226 REG16(0x270), 227 228 0 229 }; 230 231 static const u8 gen12_rcs_offsets[] = { 232 NOP(1), 233 LRI(13, POSTED), 234 REG16(0x244), 235 REG(0x034), 236 REG(0x030), 237 REG(0x038), 238 REG(0x03c), 239 REG(0x168), 240 REG(0x140), 241 REG(0x110), 242 REG(0x1c0), 243 REG(0x1c4), 244 REG(0x1c8), 245 REG(0x180), 246 REG16(0x2b4), 247 248 NOP(5), 249 LRI(9, POSTED), 250 REG16(0x3a8), 251 REG16(0x28c), 252 REG16(0x288), 253 REG16(0x284), 254 REG16(0x280), 255 REG16(0x27c), 256 REG16(0x278), 257 REG16(0x274), 258 REG16(0x270), 259 260 LRI(3, POSTED), 261 REG(0x1b0), 262 REG16(0x5a8), 263 REG16(0x5ac), 264 265 NOP(6), 266 LRI(1, 0), 267 REG(0x0c8), 268 NOP(3 + 9 + 1), 269 270 LRI(51, POSTED), 271 REG16(0x588), 272 REG16(0x588), 273 REG16(0x588), 274 REG16(0x588), 275 REG16(0x588), 276 REG16(0x588), 277 REG(0x028), 278 REG(0x09c), 279 REG(0x0c0), 280 REG(0x178), 281 REG(0x17c), 282 REG16(0x358), 283 REG(0x170), 284 REG(0x150), 285 REG(0x154), 286 REG(0x158), 287 REG16(0x41c), 288 REG16(0x600), 289 REG16(0x604), 290 REG16(0x608), 291 REG16(0x60c), 292 REG16(0x610), 293 REG16(0x614), 294 REG16(0x618), 295 REG16(0x61c), 296 REG16(0x620), 297 REG16(0x624), 298 REG16(0x628), 299 REG16(0x62c), 300 REG16(0x630), 301 REG16(0x634), 302 REG16(0x638), 303 REG16(0x63c), 304 REG16(0x640), 305 REG16(0x644), 306 REG16(0x648), 307 REG16(0x64c), 308 REG16(0x650), 309 
REG16(0x654), 310 REG16(0x658), 311 REG16(0x65c), 312 REG16(0x660), 313 REG16(0x664), 314 REG16(0x668), 315 REG16(0x66c), 316 REG16(0x670), 317 REG16(0x674), 318 REG16(0x678), 319 REG16(0x67c), 320 REG(0x068), 321 REG(0x084), 322 NOP(1), 323 324 0 325 }; 326 327 static const u8 xehp_rcs_offsets[] = { 328 NOP(1), 329 LRI(13, POSTED), 330 REG16(0x244), 331 REG(0x034), 332 REG(0x030), 333 REG(0x038), 334 REG(0x03c), 335 REG(0x168), 336 REG(0x140), 337 REG(0x110), 338 REG(0x1c0), 339 REG(0x1c4), 340 REG(0x1c8), 341 REG(0x180), 342 REG16(0x2b4), 343 344 NOP(5), 345 LRI(9, POSTED), 346 REG16(0x3a8), 347 REG16(0x28c), 348 REG16(0x288), 349 REG16(0x284), 350 REG16(0x280), 351 REG16(0x27c), 352 REG16(0x278), 353 REG16(0x274), 354 REG16(0x270), 355 356 LRI(3, POSTED), 357 REG(0x1b0), 358 REG16(0x5a8), 359 REG16(0x5ac), 360 361 NOP(6), 362 LRI(1, 0), 363 REG(0x0c8), 364 365 0 366 }; 367 368 static const u8 dg2_rcs_offsets[] = { 369 NOP(1), 370 LRI(15, POSTED), 371 REG16(0x244), 372 REG(0x034), 373 REG(0x030), 374 REG(0x038), 375 REG(0x03c), 376 REG(0x168), 377 REG(0x140), 378 REG(0x110), 379 REG(0x1c0), 380 REG(0x1c4), 381 REG(0x1c8), 382 REG(0x180), 383 REG16(0x2b4), 384 REG(0x120), 385 REG(0x124), 386 387 NOP(1), 388 LRI(9, POSTED), 389 REG16(0x3a8), 390 REG16(0x28c), 391 REG16(0x288), 392 REG16(0x284), 393 REG16(0x280), 394 REG16(0x27c), 395 REG16(0x278), 396 REG16(0x274), 397 REG16(0x270), 398 399 LRI(3, POSTED), 400 REG(0x1b0), 401 REG16(0x5a8), 402 REG16(0x5ac), 403 404 NOP(6), 405 LRI(1, 0), 406 REG(0x0c8), 407 408 0 409 }; 410 411 static const u8 mtl_rcs_offsets[] = { 412 NOP(1), 413 LRI(15, POSTED), 414 REG16(0x244), 415 REG(0x034), 416 REG(0x030), 417 REG(0x038), 418 REG(0x03c), 419 REG(0x168), 420 REG(0x140), 421 REG(0x110), 422 REG(0x1c0), 423 REG(0x1c4), 424 REG(0x1c8), 425 REG(0x180), 426 REG16(0x2b4), 427 REG(0x120), 428 REG(0x124), 429 430 NOP(1), 431 LRI(9, POSTED), 432 REG16(0x3a8), 433 REG16(0x28c), 434 REG16(0x288), 435 REG16(0x284), 436 REG16(0x280), 437 REG16(0x27c), 438 REG16(0x278), 439 REG16(0x274), 440 REG16(0x270), 441 442 NOP(2), 443 LRI(2, POSTED), 444 REG16(0x5a8), 445 REG16(0x5ac), 446 447 NOP(6), 448 LRI(1, 0), 449 REG(0x0c8), 450 451 0 452 }; 453 454 #define XE2_CTX_COMMON \ 455 NOP(1), /* [0x00] */ \ 456 LRI(15, POSTED), /* [0x01] */ \ 457 REG16(0x244), /* [0x02] CTXT_SR_CTL */ \ 458 REG(0x034), /* [0x04] RING_BUFFER_HEAD */ \ 459 REG(0x030), /* [0x06] RING_BUFFER_TAIL */ \ 460 REG(0x038), /* [0x08] RING_BUFFER_START */ \ 461 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ \ 462 REG(0x168), /* [0x0c] BB_ADDR_UDW */ \ 463 REG(0x140), /* [0x0e] BB_ADDR */ \ 464 REG(0x110), /* [0x10] BB_STATE */ \ 465 REG(0x1c0), /* [0x12] BB_PER_CTX_PTR */ \ 466 REG(0x1c4), /* [0x14] RCS_INDIRECT_CTX */ \ 467 REG(0x1c8), /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \ 468 REG(0x180), /* [0x18] CCID */ \ 469 REG16(0x2b4), /* [0x1a] SEMAPHORE_TOKEN */ \ 470 REG(0x120), /* [0x1c] PRT_BB_STATE */ \ 471 REG(0x124), /* [0x1e] PRT_BB_STATE_UDW */ \ 472 \ 473 NOP(1), /* [0x20] */ \ 474 LRI(9, POSTED), /* [0x21] */ \ 475 REG16(0x3a8), /* [0x22] CTX_TIMESTAMP */ \ 476 REG16(0x3ac), /* [0x24] CTX_TIMESTAMP_UDW */ \ 477 REG(0x108), /* [0x26] INDIRECT_RING_STATE */ \ 478 REG16(0x284), /* [0x28] dummy reg */ \ 479 REG16(0x280), /* [0x2a] CS_ACC_CTR_THOLD */ \ 480 REG16(0x27c), /* [0x2c] CS_CTX_SYS_PASID */ \ 481 REG16(0x278), /* [0x2e] CS_CTX_ASID */ \ 482 REG16(0x274), /* [0x30] PTBP_UDW */ \ 483 REG16(0x270) /* [0x32] PTBP_LDW */ 484 485 static const u8 xe2_rcs_offsets[] = { 486 XE2_CTX_COMMON, 487 488 
NOP(2), /* [0x34] */ 489 LRI(2, POSTED), /* [0x36] */ 490 REG16(0x5a8), /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */ 491 REG16(0x5ac), /* [0x39] PREEMPTION_STATUS */ 492 493 NOP(6), /* [0x41] */ 494 LRI(1, 0), /* [0x47] */ 495 REG(0x0c8), /* [0x48] R_PWR_CLK_STATE */ 496 497 0 498 }; 499 500 static const u8 xe2_bcs_offsets[] = { 501 XE2_CTX_COMMON, 502 503 NOP(4 + 8 + 1), /* [0x34] */ 504 LRI(2, POSTED), /* [0x41] */ 505 REG16(0x200), /* [0x42] BCS_SWCTRL */ 506 REG16(0x204), /* [0x44] BLIT_CCTL */ 507 508 0 509 }; 510 511 static const u8 xe2_xcs_offsets[] = { 512 XE2_CTX_COMMON, 513 514 0 515 }; 516 517 static const u8 xe2_indirect_ring_state_offsets[] = { 518 NOP(1), /* [0x00] */ 519 LRI(5, POSTED), /* [0x01] */ 520 REG(0x034), /* [0x02] RING_BUFFER_HEAD */ 521 REG(0x030), /* [0x04] RING_BUFFER_TAIL */ 522 REG(0x038), /* [0x06] RING_BUFFER_START */ 523 REG(0x048), /* [0x08] RING_BUFFER_START_UDW */ 524 REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */ 525 526 NOP(5), /* [0x0c] */ 527 LRI(9, POSTED), /* [0x11] */ 528 REG(0x168), /* [0x12] BB_ADDR_UDW */ 529 REG(0x140), /* [0x14] BB_ADDR */ 530 REG(0x110), /* [0x16] BB_STATE */ 531 REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */ 532 REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */ 533 REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */ 534 REG16(0x588), /* [0x24] BB_STACK_WRITE_PORT */ 535 REG16(0x588), /* [0x26] BB_STACK_WRITE_PORT */ 536 REG16(0x588), /* [0x28] BB_STACK_WRITE_PORT */ 537 538 NOP(12), /* [0x00] */ 539 540 0 541 }; 542 543 #undef REG16 544 #undef REG 545 #undef LRI 546 #undef NOP 547 548 static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class) 549 { 550 if (class == XE_ENGINE_CLASS_RENDER) { 551 if (GRAPHICS_VER(xe) >= 20) 552 return xe2_rcs_offsets; 553 else if (GRAPHICS_VERx100(xe) >= 1270) 554 return mtl_rcs_offsets; 555 else if (GRAPHICS_VERx100(xe) >= 1255) 556 return dg2_rcs_offsets; 557 else if (GRAPHICS_VERx100(xe) >= 1250) 558 return xehp_rcs_offsets; 559 else 560 return gen12_rcs_offsets; 561 } else if (class == XE_ENGINE_CLASS_COPY) { 562 if (GRAPHICS_VER(xe) >= 20) 563 return xe2_bcs_offsets; 564 else 565 return gen12_xcs_offsets; 566 } else { 567 if (GRAPHICS_VER(xe) >= 20) 568 return xe2_xcs_offsets; 569 else if (GRAPHICS_VERx100(xe) >= 1255) 570 return dg2_xcs_offsets; 571 else 572 return gen12_xcs_offsets; 573 } 574 } 575 576 static void set_context_control(u32 *regs, struct xe_hw_engine *hwe) 577 { 578 regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | 579 CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 580 581 if (xe_gt_has_indirect_ring_state(hwe->gt)) 582 regs[CTX_CONTEXT_CONTROL] |= 583 _MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE); 584 585 /* TODO: Timestamp */ 586 } 587 588 static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe) 589 { 590 struct xe_memirq *memirq = >_to_tile(hwe->gt)->memirq; 591 struct xe_device *xe = gt_to_xe(hwe->gt); 592 u8 num_regs; 593 594 if (!xe_device_uses_memirq(xe)) 595 return; 596 597 regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM | 598 MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT; 599 regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr; 600 regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq); 601 602 num_regs = xe_device_has_msix(xe) ? 
3 : 2; 603 regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) | 604 MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED; 605 regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr; 606 regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe); 607 regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr; 608 regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe); 609 610 if (xe_device_has_msix(xe)) { 611 regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr; 612 /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */ 613 } 614 } 615 616 static int lrc_ring_mi_mode(struct xe_hw_engine *hwe) 617 { 618 struct xe_device *xe = gt_to_xe(hwe->gt); 619 620 if (GRAPHICS_VERx100(xe) >= 1250) 621 return 0x70; 622 else 623 return 0x60; 624 } 625 626 static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe) 627 { 628 int x; 629 630 x = lrc_ring_mi_mode(hwe); 631 regs[x + 1] &= ~STOP_RING; 632 regs[x + 1] |= STOP_RING << 16; 633 } 634 635 static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc) 636 { 637 return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE; 638 } 639 640 static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc) 641 { 642 return 0; 643 } 644 645 u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc) 646 { 647 return lrc->ring.size; 648 } 649 650 /* Make the magic macros work */ 651 #define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset 652 #define __xe_lrc_regs_offset xe_lrc_regs_offset 653 654 #define LRC_SEQNO_PPHWSP_OFFSET 512 655 #define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8) 656 #define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8) 657 #define LRC_PARALLEL_PPHWSP_OFFSET 2048 658 #define LRC_ENGINE_ID_PPHWSP_OFFSET 2096 659 660 u32 xe_lrc_regs_offset(struct xe_lrc *lrc) 661 { 662 return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE; 663 } 664 665 static size_t lrc_reg_size(struct xe_device *xe) 666 { 667 if (GRAPHICS_VERx100(xe) >= 1250) 668 return 96 * sizeof(u32); 669 else 670 return 80 * sizeof(u32); 671 } 672 673 size_t xe_lrc_skip_size(struct xe_device *xe) 674 { 675 return LRC_PPHWSP_SIZE + lrc_reg_size(xe); 676 } 677 678 static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc) 679 { 680 /* The seqno is stored in the driver-defined portion of PPHWSP */ 681 return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET; 682 } 683 684 static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc) 685 { 686 /* The start seqno is stored in the driver-defined portion of PPHWSP */ 687 return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET; 688 } 689 690 static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc) 691 { 692 /* This is stored in the driver-defined portion of PPHWSP */ 693 return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET; 694 } 695 696 static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc) 697 { 698 /* The parallel is stored in the driver-defined portion of PPHWSP */ 699 return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET; 700 } 701 702 static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc) 703 { 704 return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET; 705 } 706 707 static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc) 708 { 709 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32); 710 } 711 712 static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc) 713 { 714 return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32); 715 } 716 717 static inline u32 
__xe_lrc_indirect_ring_offset(struct xe_lrc *lrc) 718 { 719 /* Indirect ring state page is at the very end of LRC */ 720 return lrc->size - LRC_INDIRECT_RING_STATE_SIZE; 721 } 722 723 #define DECL_MAP_ADDR_HELPERS(elem) \ 724 static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \ 725 { \ 726 struct iosys_map map = lrc->bo->vmap; \ 727 \ 728 xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \ 729 iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \ 730 return map; \ 731 } \ 732 static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \ 733 { \ 734 return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \ 735 } \ 736 737 DECL_MAP_ADDR_HELPERS(ring) 738 DECL_MAP_ADDR_HELPERS(pphwsp) 739 DECL_MAP_ADDR_HELPERS(seqno) 740 DECL_MAP_ADDR_HELPERS(regs) 741 DECL_MAP_ADDR_HELPERS(start_seqno) 742 DECL_MAP_ADDR_HELPERS(ctx_job_timestamp) 743 DECL_MAP_ADDR_HELPERS(ctx_timestamp) 744 DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw) 745 DECL_MAP_ADDR_HELPERS(parallel) 746 DECL_MAP_ADDR_HELPERS(indirect_ring) 747 DECL_MAP_ADDR_HELPERS(engine_id) 748 749 #undef DECL_MAP_ADDR_HELPERS 750 751 /** 752 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address 753 * @lrc: Pointer to the lrc. 754 * 755 * Returns: ctx timestamp GGTT address 756 */ 757 u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc) 758 { 759 return __xe_lrc_ctx_timestamp_ggtt_addr(lrc); 760 } 761 762 /** 763 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address 764 * @lrc: Pointer to the lrc. 765 * 766 * Returns: ctx timestamp udw GGTT address 767 */ 768 u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc) 769 { 770 return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc); 771 } 772 773 /** 774 * xe_lrc_ctx_timestamp() - Read ctx timestamp value 775 * @lrc: Pointer to the lrc. 776 * 777 * Returns: ctx timestamp value 778 */ 779 u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc) 780 { 781 struct xe_device *xe = lrc_to_xe(lrc); 782 struct iosys_map map; 783 u32 ldw, udw = 0; 784 785 map = __xe_lrc_ctx_timestamp_map(lrc); 786 ldw = xe_map_read32(xe, &map); 787 788 if (xe->info.has_64bit_timestamp) { 789 map = __xe_lrc_ctx_timestamp_udw_map(lrc); 790 udw = xe_map_read32(xe, &map); 791 } 792 793 return (u64)udw << 32 | ldw; 794 } 795 796 /** 797 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address 798 * @lrc: Pointer to the lrc. 799 * 800 * Returns: ctx timestamp job GGTT address 801 */ 802 u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc) 803 { 804 return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc); 805 } 806 807 /** 808 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value 809 * @lrc: Pointer to the lrc. 
 *
 * Returns: ctx job timestamp value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-process HW status page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_lock(lrc->bo, false);
	xe_bo_unpin(lrc->bo);
	xe_bo_unlock(lrc->bo);
	xe_bo_put(lrc->bo);
	xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
}

/*
 * xe_lrc_setup_utilization() - Set up the WA BB to assist in calculating
 * active context run ticks.
 * @lrc: Pointer to the lrc.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that the WA BB runs
 * as the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
 */
#define CONTEXT_ACTIVE 1ULL
static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
{
	u32 *cmd;

	cmd = lrc->bb_per_ctx_bo->vmap.vaddr;

	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	*cmd++ = MI_BATCH_BUFFER_END;

	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
			     xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
}

#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)

static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
		       u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	u32 lrc_size;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->flags = 0;
	lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;
	if (vm && vm->xef) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
1009 */ 1010 lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size, 1011 ttm_bo_type_kernel, 1012 bo_flags); 1013 if (IS_ERR(lrc->bo)) 1014 return PTR_ERR(lrc->bo); 1015 1016 lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K, 1017 ttm_bo_type_kernel, 1018 bo_flags); 1019 if (IS_ERR(lrc->bb_per_ctx_bo)) { 1020 err = PTR_ERR(lrc->bb_per_ctx_bo); 1021 goto err_lrc_finish; 1022 } 1023 1024 lrc->size = lrc_size; 1025 lrc->ring.size = ring_size; 1026 lrc->ring.tail = 0; 1027 1028 xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt, 1029 hwe->fence_irq, hwe->name); 1030 1031 if (!gt->default_lrc[hwe->class]) { 1032 init_data = empty_lrc_data(hwe); 1033 if (!init_data) { 1034 err = -ENOMEM; 1035 goto err_lrc_finish; 1036 } 1037 } 1038 1039 /* 1040 * Init Per-Process of HW status Page, LRC / context state to known 1041 * values 1042 */ 1043 map = __xe_lrc_pphwsp_map(lrc); 1044 if (!init_data) { 1045 xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */ 1046 xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE, 1047 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE, 1048 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE); 1049 } else { 1050 xe_map_memcpy_to(xe, &map, 0, init_data, 1051 xe_gt_lrc_size(gt, hwe->class)); 1052 kfree(init_data); 1053 } 1054 1055 if (vm) { 1056 xe_lrc_set_ppgtt(lrc, vm); 1057 1058 if (vm->xef) 1059 xe_drm_client_add_bo(vm->xef->client, lrc->bo); 1060 } 1061 1062 if (xe_device_has_msix(xe)) { 1063 xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR, 1064 xe_memirq_status_ptr(&tile->memirq, hwe)); 1065 xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR, 1066 xe_memirq_source_ptr(&tile->memirq, hwe)); 1067 xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec); 1068 } 1069 1070 if (xe_gt_has_indirect_ring_state(gt)) { 1071 xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE, 1072 __xe_lrc_indirect_ring_ggtt_addr(lrc)); 1073 1074 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START, 1075 __xe_lrc_ring_ggtt_addr(lrc)); 1076 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0); 1077 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0); 1078 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail); 1079 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL, 1080 RING_CTL_SIZE(lrc->ring.size) | RING_VALID); 1081 } else { 1082 xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc)); 1083 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0); 1084 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail); 1085 xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL, 1086 RING_CTL_SIZE(lrc->ring.size) | RING_VALID); 1087 } 1088 1089 if (init_flags & XE_LRC_CREATE_RUNALONE) 1090 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL, 1091 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1092 _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE)); 1093 1094 if (init_flags & XE_LRC_CREATE_PXP) 1095 xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL, 1096 xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) | 1097 _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE)); 1098 1099 lrc->ctx_timestamp = 0; 1100 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0); 1101 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 1102 xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0); 1103 1104 if (xe->info.has_asid && vm) 1105 xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid); 1106 1107 lrc->desc = LRC_VALID; 1108 lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT); 1109 /* TODO: Priority */ 1110 1111 /* While this appears to have something about privileged batches or 1112 * some such, it 
really just means PPGTT mode. 1113 */ 1114 if (vm) 1115 lrc->desc |= LRC_PRIVILEGE; 1116 1117 if (GRAPHICS_VERx100(xe) < 1250) { 1118 lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance); 1119 lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class); 1120 } 1121 1122 arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1123 xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable)); 1124 1125 map = __xe_lrc_seqno_map(lrc); 1126 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1127 1128 map = __xe_lrc_start_seqno_map(lrc); 1129 xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1); 1130 1131 xe_lrc_setup_utilization(lrc); 1132 1133 return 0; 1134 1135 err_lrc_finish: 1136 xe_lrc_finish(lrc); 1137 return err; 1138 } 1139 1140 /** 1141 * xe_lrc_create - Create a LRC 1142 * @hwe: Hardware Engine 1143 * @vm: The VM (address space) 1144 * @ring_size: LRC ring size 1145 * @msix_vec: MSI-X interrupt vector (for platforms that support it) 1146 * @flags: LRC initialization flags 1147 * 1148 * Allocate and initialize the Logical Ring Context (LRC). 1149 * 1150 * Return pointer to created LRC upon success and an error pointer 1151 * upon failure. 1152 */ 1153 struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm, 1154 u32 ring_size, u16 msix_vec, u32 flags) 1155 { 1156 struct xe_lrc *lrc; 1157 int err; 1158 1159 lrc = kzalloc(sizeof(*lrc), GFP_KERNEL); 1160 if (!lrc) 1161 return ERR_PTR(-ENOMEM); 1162 1163 err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags); 1164 if (err) { 1165 kfree(lrc); 1166 return ERR_PTR(err); 1167 } 1168 1169 return lrc; 1170 } 1171 1172 /** 1173 * xe_lrc_destroy - Destroy the LRC 1174 * @ref: reference to LRC 1175 * 1176 * Called when ref == 0, release resources held by the Logical Ring Context 1177 * (LRC) and free the LRC memory. 
1178 */ 1179 void xe_lrc_destroy(struct kref *ref) 1180 { 1181 struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount); 1182 1183 xe_lrc_finish(lrc); 1184 kfree(lrc); 1185 } 1186 1187 void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail) 1188 { 1189 if (xe_lrc_has_indirect_ring_state(lrc)) 1190 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail); 1191 else 1192 xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail); 1193 } 1194 1195 u32 xe_lrc_ring_tail(struct xe_lrc *lrc) 1196 { 1197 if (xe_lrc_has_indirect_ring_state(lrc)) 1198 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR; 1199 else 1200 return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR; 1201 } 1202 1203 static u32 xe_lrc_ring_start(struct xe_lrc *lrc) 1204 { 1205 if (xe_lrc_has_indirect_ring_state(lrc)) 1206 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START); 1207 else 1208 return xe_lrc_read_ctx_reg(lrc, CTX_RING_START); 1209 } 1210 1211 void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head) 1212 { 1213 if (xe_lrc_has_indirect_ring_state(lrc)) 1214 xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head); 1215 else 1216 xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head); 1217 } 1218 1219 u32 xe_lrc_ring_head(struct xe_lrc *lrc) 1220 { 1221 if (xe_lrc_has_indirect_ring_state(lrc)) 1222 return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR; 1223 else 1224 return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR; 1225 } 1226 1227 u32 xe_lrc_ring_space(struct xe_lrc *lrc) 1228 { 1229 const u32 head = xe_lrc_ring_head(lrc); 1230 const u32 tail = lrc->ring.tail; 1231 const u32 size = lrc->ring.size; 1232 1233 return ((head - tail - 1) & (size - 1)) + 1; 1234 } 1235 1236 static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring, 1237 const void *data, size_t size) 1238 { 1239 struct xe_device *xe = lrc_to_xe(lrc); 1240 1241 iosys_map_incr(&ring, lrc->ring.tail); 1242 xe_map_memcpy_to(xe, &ring, 0, data, size); 1243 lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1); 1244 } 1245 1246 void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size) 1247 { 1248 struct xe_device *xe = lrc_to_xe(lrc); 1249 struct iosys_map ring; 1250 u32 rhs; 1251 size_t aligned_size; 1252 1253 xe_assert(xe, IS_ALIGNED(size, 4)); 1254 aligned_size = ALIGN(size, 8); 1255 1256 ring = __xe_lrc_ring_map(lrc); 1257 1258 xe_assert(xe, lrc->ring.tail < lrc->ring.size); 1259 rhs = lrc->ring.size - lrc->ring.tail; 1260 if (size > rhs) { 1261 __xe_lrc_write_ring(lrc, ring, data, rhs); 1262 __xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs); 1263 } else { 1264 __xe_lrc_write_ring(lrc, ring, data, size); 1265 } 1266 1267 if (aligned_size > size) { 1268 u32 noop = MI_NOOP; 1269 1270 __xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop)); 1271 } 1272 } 1273 1274 u64 xe_lrc_descriptor(struct xe_lrc *lrc) 1275 { 1276 return lrc->desc | xe_lrc_ggtt_addr(lrc); 1277 } 1278 1279 u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc) 1280 { 1281 return __xe_lrc_seqno_ggtt_addr(lrc); 1282 } 1283 1284 /** 1285 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence. 1286 * 1287 * Allocate but don't initialize an lrc seqno fence. 1288 * 1289 * Return: Pointer to the allocated fence or 1290 * negative error pointer on error. 1291 */ 1292 struct dma_fence *xe_lrc_alloc_seqno_fence(void) 1293 { 1294 return xe_hw_fence_alloc(); 1295 } 1296 1297 /** 1298 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence. 1299 * @fence: Pointer to the fence to free. 
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: engine id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
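	 *
	 * (Illustrative example: an MI_LOAD_REGISTER_IMM header of 0x1100000f
	 * has a DW-length field of 0xf, so instr_dw() reports 0xf + 2 = 17
	 * dwords: the header plus 8 offset/value pairs. That count must still
	 * fit within remaining_dw.)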
1415 */ 1416 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1417 numdw = remaining_dw; 1418 1419 switch (inst_header & MI_OPCODE) { 1420 case MI_LOAD_REGISTER_IMM: 1421 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n", 1422 inst_header, (numdw - 1) / 2); 1423 for (int i = 1; i < numdw; i += 2) 1424 drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]); 1425 return numdw; 1426 1427 case MI_LOAD_REGISTER_MEM & MI_OPCODE: 1428 drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n", 1429 inst_header, 1430 dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "", 1431 dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : ""); 1432 if (numdw == 4) 1433 drm_printf(p, " - %#6x = %#010llx\n", 1434 dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2]))); 1435 else 1436 drm_printf(p, " - %*ph (%s)\n", 1437 (int)sizeof(u32) * (numdw - 1), dw + 1, 1438 numdw < 4 ? "truncated" : "malformed"); 1439 return numdw; 1440 1441 case MI_FORCE_WAKEUP: 1442 drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header); 1443 return numdw; 1444 1445 default: 1446 drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n", 1447 inst_header, opcode, numdw); 1448 return numdw; 1449 } 1450 } 1451 1452 static int dump_gfxpipe_command(struct drm_printer *p, 1453 struct xe_gt *gt, 1454 u32 *dw, 1455 int remaining_dw) 1456 { 1457 u32 numdw = instr_dw(*dw); 1458 u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw); 1459 u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw); 1460 u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw); 1461 1462 /* 1463 * Make sure we haven't mis-parsed a number of dwords that exceeds the 1464 * remaining size of the LRC. 1465 */ 1466 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1467 numdw = remaining_dw; 1468 1469 switch (*dw & GFXPIPE_MATCH_MASK) { 1470 #define MATCH(cmd) \ 1471 case cmd: \ 1472 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ 1473 return numdw 1474 #define MATCH3D(cmd) \ 1475 case CMD_##cmd: \ 1476 drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \ 1477 return numdw 1478 1479 MATCH(STATE_BASE_ADDRESS); 1480 MATCH(STATE_SIP); 1481 MATCH(GPGPU_CSR_BASE_ADDRESS); 1482 MATCH(STATE_COMPUTE_MODE); 1483 MATCH3D(3DSTATE_BTD); 1484 MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS); 1485 MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS); 1486 1487 MATCH3D(3DSTATE_VF_STATISTICS); 1488 1489 MATCH(PIPELINE_SELECT); 1490 1491 MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST); 1492 MATCH3D(3DSTATE_CLEAR_PARAMS); 1493 MATCH3D(3DSTATE_DEPTH_BUFFER); 1494 MATCH3D(3DSTATE_STENCIL_BUFFER); 1495 MATCH3D(3DSTATE_HIER_DEPTH_BUFFER); 1496 MATCH3D(3DSTATE_VERTEX_BUFFERS); 1497 MATCH3D(3DSTATE_VERTEX_ELEMENTS); 1498 MATCH3D(3DSTATE_INDEX_BUFFER); 1499 MATCH3D(3DSTATE_VF); 1500 MATCH3D(3DSTATE_MULTISAMPLE); 1501 MATCH3D(3DSTATE_CC_STATE_POINTERS); 1502 MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS); 1503 MATCH3D(3DSTATE_VS); 1504 MATCH3D(3DSTATE_GS); 1505 MATCH3D(3DSTATE_CLIP); 1506 MATCH3D(3DSTATE_SF); 1507 MATCH3D(3DSTATE_WM); 1508 MATCH3D(3DSTATE_CONSTANT_VS); 1509 MATCH3D(3DSTATE_CONSTANT_GS); 1510 MATCH3D(3DSTATE_CONSTANT_PS); 1511 MATCH3D(3DSTATE_SAMPLE_MASK); 1512 MATCH3D(3DSTATE_CONSTANT_HS); 1513 MATCH3D(3DSTATE_CONSTANT_DS); 1514 MATCH3D(3DSTATE_HS); 1515 MATCH3D(3DSTATE_TE); 1516 MATCH3D(3DSTATE_DS); 1517 MATCH3D(3DSTATE_STREAMOUT); 1518 MATCH3D(3DSTATE_SBE); 1519 MATCH3D(3DSTATE_PS); 1520 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP); 1521 MATCH3D(3DSTATE_CPS_POINTERS); 1522 MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC); 1523 MATCH3D(3DSTATE_BLEND_STATE_POINTERS); 1524 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS); 1525 
MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS); 1526 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS); 1527 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS); 1528 MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS); 1529 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS); 1530 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS); 1531 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS); 1532 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS); 1533 MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS); 1534 MATCH3D(3DSTATE_VF_INSTANCING); 1535 MATCH3D(3DSTATE_VF_SGVS); 1536 MATCH3D(3DSTATE_VF_TOPOLOGY); 1537 MATCH3D(3DSTATE_WM_CHROMAKEY); 1538 MATCH3D(3DSTATE_PS_BLEND); 1539 MATCH3D(3DSTATE_WM_DEPTH_STENCIL); 1540 MATCH3D(3DSTATE_PS_EXTRA); 1541 MATCH3D(3DSTATE_RASTER); 1542 MATCH3D(3DSTATE_SBE_SWIZ); 1543 MATCH3D(3DSTATE_WM_HZ_OP); 1544 MATCH3D(3DSTATE_VF_COMPONENT_PACKING); 1545 MATCH3D(3DSTATE_VF_SGVS_2); 1546 MATCH3D(3DSTATE_VFG); 1547 MATCH3D(3DSTATE_URB_ALLOC_VS); 1548 MATCH3D(3DSTATE_URB_ALLOC_HS); 1549 MATCH3D(3DSTATE_URB_ALLOC_DS); 1550 MATCH3D(3DSTATE_URB_ALLOC_GS); 1551 MATCH3D(3DSTATE_SO_BUFFER_INDEX_0); 1552 MATCH3D(3DSTATE_SO_BUFFER_INDEX_1); 1553 MATCH3D(3DSTATE_SO_BUFFER_INDEX_2); 1554 MATCH3D(3DSTATE_SO_BUFFER_INDEX_3); 1555 MATCH3D(3DSTATE_PRIMITIVE_REPLICATION); 1556 MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO); 1557 MATCH3D(3DSTATE_AMFS); 1558 MATCH3D(3DSTATE_DEPTH_BOUNDS); 1559 MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS); 1560 MATCH3D(3DSTATE_CONSTANT_TS_POINTER); 1561 MATCH3D(3DSTATE_MESH_CONTROL); 1562 MATCH3D(3DSTATE_MESH_DISTRIB); 1563 MATCH3D(3DSTATE_TASK_REDISTRIB); 1564 MATCH3D(3DSTATE_MESH_SHADER); 1565 MATCH3D(3DSTATE_MESH_SHADER_DATA); 1566 MATCH3D(3DSTATE_TASK_CONTROL); 1567 MATCH3D(3DSTATE_TASK_SHADER); 1568 MATCH3D(3DSTATE_TASK_SHADER_DATA); 1569 MATCH3D(3DSTATE_URB_ALLOC_MESH); 1570 MATCH3D(3DSTATE_URB_ALLOC_TASK); 1571 MATCH3D(3DSTATE_CLIP_MESH); 1572 MATCH3D(3DSTATE_SBE_MESH); 1573 MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER); 1574 MATCH3D(3DSTATE_COARSE_PIXEL); 1575 1576 MATCH3D(3DSTATE_DRAWING_RECTANGLE); 1577 MATCH3D(3DSTATE_CHROMA_KEY); 1578 MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET); 1579 MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN); 1580 MATCH3D(3DSTATE_LINE_STIPPLE); 1581 MATCH3D(3DSTATE_AA_LINE_PARAMETERS); 1582 MATCH3D(3DSTATE_MONOFILTER_SIZE); 1583 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS); 1584 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS); 1585 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS); 1586 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS); 1587 MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS); 1588 MATCH3D(3DSTATE_SO_DECL_LIST); 1589 MATCH3D(3DSTATE_SO_BUFFER); 1590 MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC); 1591 MATCH3D(3DSTATE_SAMPLE_PATTERN); 1592 MATCH3D(3DSTATE_3D_MODE); 1593 MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE); 1594 MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS); 1595 MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO); 1596 1597 default: 1598 drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n", 1599 *dw, pipeline, opcode, subopcode, numdw); 1600 return numdw; 1601 } 1602 } 1603 1604 static int dump_gfx_state_command(struct drm_printer *p, 1605 struct xe_gt *gt, 1606 u32 *dw, 1607 int remaining_dw) 1608 { 1609 u32 numdw = instr_dw(*dw); 1610 u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw); 1611 1612 /* 1613 * Make sure we haven't mis-parsed a number of dwords that exceeds the 1614 * remaining size of the LRC. 
1615 */ 1616 if (xe_gt_WARN_ON(gt, numdw > remaining_dw)) 1617 numdw = remaining_dw; 1618 1619 switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) { 1620 MATCH(STATE_WRITE_INLINE); 1621 1622 default: 1623 drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n", 1624 *dw, opcode, numdw); 1625 return numdw; 1626 } 1627 } 1628 1629 void xe_lrc_dump_default(struct drm_printer *p, 1630 struct xe_gt *gt, 1631 enum xe_engine_class hwe_class) 1632 { 1633 u32 *dw; 1634 int remaining_dw, num_dw; 1635 1636 if (!gt->default_lrc[hwe_class]) { 1637 drm_printf(p, "No default LRC for class %d\n", hwe_class); 1638 return; 1639 } 1640 1641 /* 1642 * Skip the beginning of the LRC since it contains the per-process 1643 * hardware status page. 1644 */ 1645 dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE; 1646 remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4; 1647 1648 while (remaining_dw > 0) { 1649 if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) { 1650 num_dw = dump_mi_command(p, gt, dw, remaining_dw); 1651 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) { 1652 num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw); 1653 } else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) { 1654 num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw); 1655 } else { 1656 num_dw = min(instr_dw(*dw), remaining_dw); 1657 drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n", 1658 *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw), 1659 num_dw); 1660 } 1661 1662 dw += num_dw; 1663 remaining_dw -= num_dw; 1664 } 1665 } 1666 1667 struct instr_state { 1668 u32 instr; 1669 u16 num_dw; 1670 }; 1671 1672 static const struct instr_state xe_hpg_svg_state[] = { 1673 { .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 }, 1674 { .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 }, 1675 { .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 }, 1676 { .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 }, 1677 { .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 }, 1678 { .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 }, 1679 { .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 }, 1680 { .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 }, 1681 { .instr = CMD_3DSTATE_VS, .num_dw = 9 }, 1682 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 }, 1683 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 }, 1684 { .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 }, 1685 { .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 }, 1686 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 }, 1687 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 }, 1688 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 }, 1689 { .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 }, 1690 { .instr = CMD_3DSTATE_CLIP, .num_dw = 4 }, 1691 { .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 }, 1692 { .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 }, 1693 { .instr = CMD_3DSTATE_SF, .num_dw = 4 }, 1694 { .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 }, 1695 { .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 }, 1696 { .instr = CMD_3DSTATE_RASTER, .num_dw = 5 }, 1697 { .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 }, 1698 { .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 }, 1699 { .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 }, 1700 { .instr = CMD_3DSTATE_HS, .num_dw = 9 }, 1701 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 }, 1702 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 }, 1703 { .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 
}, 1704 { .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 }, 1705 { .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 }, 1706 { .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 }, 1707 { .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 }, 1708 { .instr = CMD_3DSTATE_TE, .num_dw = 5 }, 1709 { .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 }, 1710 { .instr = CMD_3DSTATE_DS, .num_dw = 11 }, 1711 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 }, 1712 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 }, 1713 { .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 }, 1714 { .instr = CMD_3DSTATE_GS, .num_dw = 10 }, 1715 { .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 }, 1716 { .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 }, 1717 { .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 }, 1718 { .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 }, 1719 { .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 }, 1720 { .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 }, 1721 { .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 }, 1722 { .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 }, 1723 }; 1724 1725 void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb) 1726 { 1727 struct xe_gt *gt = q->hwe->gt; 1728 struct xe_device *xe = gt_to_xe(gt); 1729 const struct instr_state *state_table = NULL; 1730 int state_table_size = 0; 1731 1732 /* 1733 * Wa_14019789679 1734 * 1735 * If the driver doesn't explicitly emit the SVG instructions while 1736 * setting up the default LRC, the context switch will write 0's 1737 * (noops) into the LRC memory rather than the expected instruction 1738 * headers. Application contexts start out as a copy of the default 1739 * LRC, and if they also do not emit specific settings for some SVG 1740 * state, then on context restore they'll unintentionally inherit 1741 * whatever state setting the previous context had programmed into the 1742 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will 1743 * prevent the hardware from resetting that state back to any specific 1744 * value). 1745 * 1746 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL 1747 * since that's a specific state setting that can easily cause GPU 1748 * hangs if unintentionally inherited. However to be safe we'll 1749 * continue to emit all of the SVG state since it's best not to leak 1750 * any of the state between contexts, even if that leakage is harmless. 1751 */ 1752 if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) { 1753 state_table = xe_hpg_svg_state; 1754 state_table_size = ARRAY_SIZE(xe_hpg_svg_state); 1755 } 1756 1757 if (!state_table) { 1758 xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n", 1759 GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100); 1760 return; 1761 } 1762 1763 for (int i = 0; i < state_table_size; i++) { 1764 u32 instr = state_table[i].instr; 1765 u16 num_dw = state_table[i].num_dw; 1766 bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW); 1767 1768 xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE); 1769 xe_gt_assert(gt, num_dw != 0); 1770 xe_gt_assert(gt, is_single_dw ^ (num_dw > 1)); 1771 1772 /* 1773 * Xe2's SVG context is the same as the one on DG2 / MTL 1774 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has 1775 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined). 1776 * Just make the replacement here rather than defining a 1777 * whole separate table for the single trivial change. 
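		 *
		 * (As a purely explanatory note: for the multi-dword
		 * instructions emitted below, the header has num_dw - 2 OR'ed
		 * into its DW-length field, matching the GFXPIPE encoding
		 * where bits 7:0 hold the total dword count minus two; see
		 * instr_dw() above.)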
1778 */ 1779 if (GRAPHICS_VER(xe) >= 20 && 1780 instr == CMD_3DSTATE_DRAWING_RECTANGLE) 1781 instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST; 1782 1783 bb->cs[bb->len] = instr; 1784 if (!is_single_dw) 1785 bb->cs[bb->len] |= (num_dw - 2); 1786 1787 bb->len += num_dw; 1788 } 1789 } 1790 1791 struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc) 1792 { 1793 struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT); 1794 1795 if (!snapshot) 1796 return NULL; 1797 1798 if (lrc->bo->vm) 1799 xe_vm_get(lrc->bo->vm); 1800 1801 snapshot->context_desc = xe_lrc_ggtt_addr(lrc); 1802 snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc); 1803 snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc); 1804 snapshot->head = xe_lrc_ring_head(lrc); 1805 snapshot->tail.internal = lrc->ring.tail; 1806 snapshot->tail.memory = xe_lrc_ring_tail(lrc); 1807 snapshot->start = xe_lrc_ring_start(lrc); 1808 snapshot->start_seqno = xe_lrc_start_seqno(lrc); 1809 snapshot->seqno = xe_lrc_seqno(lrc); 1810 snapshot->lrc_bo = xe_bo_get(lrc->bo); 1811 snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc); 1812 snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset; 1813 snapshot->lrc_snapshot = NULL; 1814 snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc)); 1815 snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc); 1816 return snapshot; 1817 } 1818 1819 void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot) 1820 { 1821 struct xe_bo *bo; 1822 struct xe_vm *vm; 1823 struct iosys_map src; 1824 1825 if (!snapshot) 1826 return; 1827 1828 bo = snapshot->lrc_bo; 1829 vm = bo->vm; 1830 snapshot->lrc_bo = NULL; 1831 1832 snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL); 1833 if (!snapshot->lrc_snapshot) 1834 goto put_bo; 1835 1836 xe_bo_lock(bo, false); 1837 if (!ttm_bo_vmap(&bo->ttm, &src)) { 1838 xe_map_memcpy_from(xe_bo_device(bo), 1839 snapshot->lrc_snapshot, &src, snapshot->lrc_offset, 1840 snapshot->lrc_size); 1841 ttm_bo_vunmap(&bo->ttm, &src); 1842 } else { 1843 kvfree(snapshot->lrc_snapshot); 1844 snapshot->lrc_snapshot = NULL; 1845 } 1846 xe_bo_unlock(bo); 1847 put_bo: 1848 xe_bo_put(bo); 1849 if (vm) 1850 xe_vm_put(vm); 1851 } 1852 1853 void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p) 1854 { 1855 unsigned long i; 1856 1857 if (!snapshot) 1858 return; 1859 1860 drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc); 1861 drm_printf(p, "\tHW Ring address: 0x%08x\n", 1862 snapshot->ring_addr); 1863 drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n", 1864 snapshot->indirect_context_desc); 1865 drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head); 1866 drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n", 1867 snapshot->tail.internal, snapshot->tail.memory); 1868 drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start); 1869 drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno); 1870 drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno); 1871 drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp); 1872 drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp); 1873 1874 if (!snapshot->lrc_snapshot) 1875 return; 1876 1877 drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE); 1878 drm_puts(p, "\t[HWSP].data: "); 1879 for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) { 1880 u32 *val = snapshot->lrc_snapshot + i; 1881 char dumped[ASCII85_BUFSZ]; 1882 1883 drm_puts(p, ascii85_encode(*val, dumped)); 1884 } 1885 1886 drm_printf(p, 
"\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE); 1887 drm_puts(p, "\t[HWCTX].data: "); 1888 for (; i < snapshot->lrc_size; i += sizeof(u32)) { 1889 u32 *val = snapshot->lrc_snapshot + i; 1890 char dumped[ASCII85_BUFSZ]; 1891 1892 drm_puts(p, ascii85_encode(*val, dumped)); 1893 } 1894 drm_puts(p, "\n"); 1895 } 1896 1897 void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot) 1898 { 1899 if (!snapshot) 1900 return; 1901 1902 kvfree(snapshot->lrc_snapshot); 1903 if (snapshot->lrc_bo) { 1904 struct xe_vm *vm; 1905 1906 vm = snapshot->lrc_bo->vm; 1907 xe_bo_put(snapshot->lrc_bo); 1908 if (vm) 1909 xe_vm_put(vm); 1910 } 1911 kfree(snapshot); 1912 } 1913 1914 static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts) 1915 { 1916 u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id); 1917 u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id); 1918 struct xe_hw_engine *hwe; 1919 u64 val; 1920 1921 hwe = xe_gt_hw_engine(lrc->gt, class, instance, false); 1922 if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe), 1923 "Unexpected engine class:instance %d:%d for context utilization\n", 1924 class, instance)) 1925 return -1; 1926 1927 if (lrc_to_xe(lrc)->info.has_64bit_timestamp) 1928 val = xe_mmio_read64_2x32(&hwe->gt->mmio, 1929 RING_CTX_TIMESTAMP(hwe->mmio_base)); 1930 else 1931 val = xe_mmio_read32(&hwe->gt->mmio, 1932 RING_CTX_TIMESTAMP(hwe->mmio_base)); 1933 1934 *reg_ctx_ts = val; 1935 1936 return 0; 1937 } 1938 1939 /** 1940 * xe_lrc_update_timestamp() - Update ctx timestamp 1941 * @lrc: Pointer to the lrc. 1942 * @old_ts: Old timestamp value 1943 * 1944 * Populate @old_ts current saved ctx timestamp, read new ctx timestamp and 1945 * update saved value. With support for active contexts, the calculation may be 1946 * slightly racy, so follow a read-again logic to ensure that the context is 1947 * still active before returning the right timestamp. 1948 * 1949 * Returns: New ctx timestamp value 1950 */ 1951 u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts) 1952 { 1953 u64 lrc_ts, reg_ts; 1954 u32 engine_id; 1955 1956 *old_ts = lrc->ctx_timestamp; 1957 1958 lrc_ts = xe_lrc_ctx_timestamp(lrc); 1959 /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */ 1960 if (IS_SRIOV_VF(lrc_to_xe(lrc))) { 1961 lrc->ctx_timestamp = lrc_ts; 1962 goto done; 1963 } 1964 1965 if (lrc_ts == CONTEXT_ACTIVE) { 1966 engine_id = xe_lrc_engine_id(lrc); 1967 if (!get_ctx_timestamp(lrc, engine_id, ®_ts)) 1968 lrc->ctx_timestamp = reg_ts; 1969 1970 /* read lrc again to ensure context is still active */ 1971 lrc_ts = xe_lrc_ctx_timestamp(lrc); 1972 } 1973 1974 /* 1975 * If context switched out, just use the lrc_ts. Note that this needs to 1976 * be a separate if condition. 1977 */ 1978 if (lrc_ts != CONTEXT_ACTIVE) 1979 lrc->ctx_timestamp = lrc_ts; 1980 1981 done: 1982 trace_xe_lrc_update_timestamp(lrc, *old_ts); 1983 1984 return lrc->ctx_timestamp; 1985 } 1986 1987 /** 1988 * xe_lrc_ring_is_idle() - LRC is idle 1989 * @lrc: Pointer to the lrc. 1990 * 1991 * Compare LRC ring head and tail to determine if idle. 1992 * 1993 * Return: True is ring is idle, False otherwise 1994 */ 1995 bool xe_lrc_ring_is_idle(struct xe_lrc *lrc) 1996 { 1997 return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc); 1998 } 1999