// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <linux/ascii85.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_sriov.h"
#include "xe_vm.h"

#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K

struct xe_lrc_snapshot {
	struct xe_bo *lrc_bo;
	void *lrc_snapshot;
	unsigned long lrc_size, lrc_offset;

	u32 context_desc;
	u32 indirect_context_desc;
	u32 head;
	struct {
		u32 internal;
		u32 memory;
	} tail;
	u32 start_seqno;
	u32 seqno;
};

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size;

	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			size = 4 * SZ_4K;
		else
			size = 14 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		/* 14 pages since graphics_ver == 11 */
		if (GRAPHICS_VER(xe) >= 20)
			size = 3 * SZ_4K;
		else
			size = 14 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size = 2 * SZ_4K;
	}

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * register offsets and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating a MI_LOAD_REGISTER_IMM command, allow setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values for in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: set when the register offset needs more than the 7 bits below; the
 *      remaining lower bits follow in additional bytes
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
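 *
 * As a worked example (derived directly from the macros and the decode loop
 * below), the table prefix
 *
 *	LRI(13, POSTED), REG16(0x244), ...
 *
 * encodes to the bytes { 0x4d, 0x81, 0x11, ... }: 0x4d decodes to a
 * MI_LOAD_REGISTER_IMM of 13 registers with MI_LRI_FORCE_POSTED set, and the
 * pair 0x81/0x11 decodes to the register offset
 * ((0x01 << 7) | 0x11) << 2 == 0x244, to which the engine's mmio_base is
 * then added.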
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);

	/* TODO: Timestamp */
}

static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
#define LRC_PPHWSP_SIZE SZ_4K

static size_t lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel data is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	/* Indirect ring state page is at the very end of LRC */
	return lrc->size - LRC_INDIRECT_RING_STATE_SIZE;
}

#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)

#undef DECL_MAP_ADDR_HELPERS

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

static u32
xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process HW Status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_lock(lrc->bo, false);
	xe_bo_unpin(lrc->bo);
	xe_bo_unlock(lrc->bo);
	xe_bo_put(lrc->bo);
}

#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)

static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, u32 ring_size)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	u32 lrc_size;
	int err;

	kref_init(&lrc->refcount);
	lrc->flags = 0;
	lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
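	 * Until that is supported the BO is created pinned and mapped here,
	 * stays that way for the LRC's whole lifetime, and is only unpinned
	 * again in xe_lrc_finish().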
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
				       ttm_bo_type_kernel,
				       XE_BO_FLAG_VRAM_IF_DGFX(tile) |
				       XE_BO_FLAG_GGTT |
				       XE_BO_FLAG_GGTT_INVALIDATE);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->size = lrc_size;
	lrc->tile = gt_to_tile(hwe->gt);
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;
	lrc->ctx_timestamp = 0;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init Per-Process HW Status Page and LRC / context state to known
	 * values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_gt_lrc_size(gt, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_gt_lrc_size(gt, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/*
	 * While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create an LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @ring_size: LRC ring size
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return: pointer to the created LRC upon success and an error pointer
 * upon failure.
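 *
 * The LRC is returned reference counted; once the reference count drops to
 * zero it is released via xe_lrc_destroy().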
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     u32 ring_size)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, ring_size);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0; releases the resources held by the Logical Ring
 * Context (LRC) and frees the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
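 * Initialization is expected to be done later with xe_lrc_init_seqno_fence().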
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
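	 * If it does, clamp it so the dump loop in xe_lrc_dump_default()
	 * cannot run past the end of the buffer.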
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}

static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#010x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * At the moment we only need to emit non-register state for the RCS
	 * engine.
	 */
	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
		return;

	switch (GRAPHICS_VERx100(xe)) {
	case 1255:
	case 1270 ... 2004:
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
		break;
	default:
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
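		 * Note that only the instruction dword is swapped here; the
		 * num_dw value from the table above is kept as-is.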
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		bb->cs[bb->len] = instr;
		if (!is_single_dw)
			bb->cs[bb->len] |= (num_dw - 2);

		bb->len += num_dw;
	}
}

struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
	snapshot->lrc_snapshot = NULL;
	return snapshot;
}

void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}

void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);
	kfree(snapshot);
}

u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
{
	*old_ts = lrc->ctx_timestamp;

	lrc->ctx_timestamp = xe_lrc_read_ctx_reg(lrc, CTX_TIMESTAMP);

	return lrc->ctx_timestamp;
}