// SPDX-License-Identifier: MIT
/*
 * Copyright © 2021 Intel Corporation
 */

#include "xe_lrc.h"

#include <generated/xe_wa_oob.h>

#include <linux/ascii85.h>

#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
#include "xe_wa.h"

#define LRC_VALID				BIT_ULL(0)
#define LRC_PRIVILEGE				BIT_ULL(8)
#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
#define LRC_LEGACY_64B_CONTEXT			3

#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)

#define LRC_PPHWSP_SIZE				SZ_4K
#define LRC_INDIRECT_RING_STATE_SIZE		SZ_4K
#define LRC_WA_BB_SIZE				SZ_4K

static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
{
	return gt_to_xe(lrc->fence_ctx.gt);
}

size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
	struct xe_device *xe = gt_to_xe(gt);
	size_t size;

	/* Per-process HW status page (PPHWSP) */
	size = LRC_PPHWSP_SIZE;

	/* Engine context image */
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			size += 3 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	case XE_ENGINE_CLASS_COMPUTE:
		if (GRAPHICS_VER(xe) >= 20)
			size += 2 * SZ_4K;
		else
			size += 13 * SZ_4K;
		break;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		size += 1 * SZ_4K;
	}

	/* Add indirect ring state page */
	if (xe_gt_has_indirect_ring_state(gt))
		size += LRC_INDIRECT_RING_STATE_SIZE;

	return size;
}

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - number of NOPs are set in lower bits
 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
 *      follow, for the lower bits
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
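 *
 * As an illustrative example (not taken from the tables below), the byte
 * sequence NOP(1), LRI(2, POSTED), REG(0x034), REG16(0x3a8) would decode to:
 * skip one dword, emit MI_LOAD_REGISTER_IMM for two registers with
 * MI_LRI_FORCE_POSTED set, then emit the register offsets base + 0x34 and
 * base + 0x3a8, each followed by a dword left empty for the value.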
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	*regs = MI_BATCH_BUFFER_END | BIT(0);
}

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};

#define XE2_CTX_COMMON \
	NOP(1),			/* [0x00] */ \
	LRI(15, POSTED),	/* [0x01] */ \
	REG16(0x244),		/* [0x02] CTXT_SR_CTL */ \
	REG(0x034),		/* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),		/* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),		/* [0x08] RING_BUFFER_START */ \
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),		/* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),		/* [0x0e] BB_ADDR */ \
	REG(0x110),		/* [0x10] BB_STATE */ \
	REG(0x1c0),		/* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),		/* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),		/* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),		/* [0x18] CCID */ \
	REG16(0x2b4),		/* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),		/* [0x1c] PRT_BB_STATE */ \
	REG(0x124),		/* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),			/* [0x20] */ \
	LRI(9, POSTED),		/* [0x21] */ \
	REG16(0x3a8),		/* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),		/* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),		/* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),		/* [0x28] dummy reg */ \
	REG16(0x280),		/* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),		/* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),		/* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),		/* [0x30] PTBP_UDW */ \
	REG16(0x270)		/* [0x32] PTBP_LDW */

static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),			/* [0x34] */
	LRI(2, POSTED),		/* [0x36] */
	REG16(0x5a8),		/* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),		/* [0x39] PREEMPTION_STATUS */

	NOP(6),			/* [0x41] */
	LRI(1, 0),		/* [0x47] */
	REG(0x0c8),		/* [0x48] R_PWR_CLK_STATE */

	0
};

static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),		/* [0x34] */
	LRI(2, POSTED),		/* [0x41] */
	REG16(0x200),		/* [0x42] BCS_SWCTRL */
	REG16(0x204),		/* [0x44] BLIT_CCTL */

	0
};

static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};

static const u8 xe2_indirect_ring_state_offsets[] = {
	NOP(1),			/* [0x00] */
	LRI(5, POSTED),		/* [0x01] */
	REG(0x034),		/* [0x02] RING_BUFFER_HEAD */
	REG(0x030),		/* [0x04] RING_BUFFER_TAIL */
	REG(0x038),		/* [0x06] RING_BUFFER_START */
	REG(0x048),		/* [0x08] RING_BUFFER_START_UDW */
	REG(0x03c),		/* [0x0a] RING_BUFFER_CONTROL */

	NOP(5),			/* [0x0c] */
	LRI(9, POSTED),		/* [0x11] */
	REG(0x168),		/* [0x12] BB_ADDR_UDW */
	REG(0x140),		/* [0x14] BB_ADDR */
	REG(0x110),		/* [0x16] BB_STATE */
	REG16(0x588),		/* [0x18] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x20] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x22] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x24] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x26] BB_STACK_WRITE_PORT */
	REG16(0x588),		/* [0x28] BB_STACK_WRITE_PORT */

	NOP(12),		/* [0x00] */

	0
};

#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
{
	if (class == XE_ENGINE_CLASS_RENDER) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1270)
			return mtl_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_rcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1250)
			return xehp_rcs_offsets;
		else
			return gen12_rcs_offsets;
	} else if (class == XE_ENGINE_CLASS_COPY) {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_bcs_offsets;
		else
			return gen12_xcs_offsets;
	} else {
		if (GRAPHICS_VER(xe) >= 20)
			return xe2_xcs_offsets;
		else if (GRAPHICS_VERx100(xe) >= 1255)
			return dg2_xcs_offsets;
		else
			return gen12_xcs_offsets;
	}
}

static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	if (xe_gt_has_indirect_ring_state(hwe->gt))
		regs[CTX_CONTEXT_CONTROL] |=
			_MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);

	/* TODO: Timestamp */
}

static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);
	u8 num_regs;

	if (!xe_device_uses_memirq(xe))
		return;

	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	num_regs = xe_device_has_msix(xe) ? 3 : 2;
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);

	if (xe_device_has_msix(xe)) {
		regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
		/* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
	}
}

static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (GRAPHICS_VERx100(xe) >= 1250)
		return 0x70;
	else
		return 0x60;
}

static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
{
	int x;

	x = lrc_ring_mi_mode(hwe);
	regs[x + 1] &= ~STOP_RING;
	regs[x + 1] |= STOP_RING << 16;
}

static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
{
	return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
}

static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}

u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}

/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
#define __xe_lrc_regs_offset xe_lrc_regs_offset

#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048

u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}

static size_t lrc_reg_size(struct xe_device *xe)
{
	if (GRAPHICS_VERx100(xe) >= 1250)
		return 96 * sizeof(u32);
	else
		return 80 * sizeof(u32);
}

size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}

static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
	/* This is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}

static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}

static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
}

static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}

static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
{
	return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
}

static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_RING_STATE_SIZE;
}

static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
{
	return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}

#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map)); \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
DECL_MAP_ADDR_HELPERS(engine_id)

#undef DECL_MAP_ADDR_HELPERS

/**
 * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp GGTT address
 */
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp udw GGTT address
 */
u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_timestamp() - Read ctx timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp value
 */
u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;
	u32 ldw, udw = 0;

	map = __xe_lrc_ctx_timestamp_map(lrc);
	ldw = xe_map_read32(xe, &map);

	if (xe->info.has_64bit_timestamp) {
		map = __xe_lrc_ctx_timestamp_udw_map(lrc);
		udw = xe_map_read32(xe, &map);
	}

	return (u64)udw << 32 | ldw;
}

/**
 * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job GGTT address
 */
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
}

/**
 * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
 * @lrc: Pointer to the lrc.
 *
 * Returns: ctx timestamp job value
 */
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_ctx_job_timestamp_map(lrc);
	return xe_map_read32(xe, &map);
}

u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}

u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
{
	if (!xe_lrc_has_indirect_ring_state(lrc))
		return 0;

	return __xe_lrc_indirect_ring_ggtt_addr(lrc);
}

static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
					  int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_indirect_ring_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	return xe_map_read32(xe, &map);
}

void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_regs_map(lrc);
	iosys_map_incr(&map, reg_nr * sizeof(u32));
	xe_map_write32(xe, &map, val);
}

static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_gt *gt = hwe->gt;
	void *data;
	u32 *regs;

	data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);
	if (xe_gt_has_indirect_ring_state(gt)) {
		regs = data + xe_gt_lrc_size(gt, hwe->class) -
		       LRC_INDIRECT_RING_STATE_SIZE;
		set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
	}

	return data;
}

static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
	u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));

	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}

static void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_unpin_map_no_vm(lrc->bo);
}

/*
 * wa_bb_setup_utilization() - Write commands to wa bb to assist
 * in calculating active context run ticks.
 *
 * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
 * context, but only gets updated when the context switches out. In order to
 * check how long a context has been active before it switches out, two things
 * are required:
 *
 * (1) Determine if the context is running:
 * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
 * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
 * initialized. During a query, we just check for this value to determine if the
 * context is active. If the context switched out, it would overwrite this
 * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
 * the last part of context restore, so reusing this LRC location will not
 * clobber anything.
 *
 * (2) Calculate the time that the context has been active for:
 * The CTX_TIMESTAMP ticks only when the context is active. If a context is
 * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
 * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
 * engine instance. Since we do not know which instance the context is running
 * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
 * store it in the PPHWSP.
 */
#define CONTEXT_ACTIVE 1ULL
static ssize_t wa_bb_setup_utilization(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
				       u32 *batch, size_t max_len)
{
	u32 *cmd = batch;

	if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
		return -ENOSPC;

	*cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
	*cmd++ = ENGINE_ID(0).addr;
	*cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
	*cmd++ = 0;

	*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
	*cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
	*cmd++ = 0;
	*cmd++ = lower_32_bits(CONTEXT_ACTIVE);

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
		*cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
		*cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
		*cmd++ = 0;
		*cmd++ = upper_32_bits(CONTEXT_ACTIVE);
	}

	return cmd - batch;
}

struct wa_bb_setup {
	ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
			 u32 *batch, size_t max_size);
};

static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
{
	const size_t max_size = LRC_WA_BB_SIZE;
	static const struct wa_bb_setup funcs[] = {
		{ .setup = wa_bb_setup_utilization },
	};
	ssize_t remain;
	u32 *cmd, *buf = NULL;

	if (lrc->bo->vmap.is_iomem) {
		buf = kmalloc(max_size, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
		cmd = buf;
	} else {
		cmd = lrc->bo->vmap.vaddr + __xe_lrc_wa_bb_offset(lrc);
	}

	remain = max_size / sizeof(*cmd);

	for (size_t i = 0; i < ARRAY_SIZE(funcs); i++) {
		ssize_t len = funcs[i].setup(lrc, hwe, cmd, remain);

		remain -= len;

		/*
		 * There should always be at least 1 additional dword for
		 * the end marker
		 */
		if (len < 0 || xe_gt_WARN_ON(lrc->gt, remain < 1))
			goto fail;

		cmd += len;
	}

	*cmd++ = MI_BATCH_BUFFER_END;

	if (buf) {
		xe_map_memcpy_to(gt_to_xe(lrc->gt), &lrc->bo->vmap,
				 __xe_lrc_wa_bb_offset(lrc), buf,
				 (cmd - buf) * sizeof(*cmd));
		kfree(buf);
	}

	xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR, xe_bo_ggtt_addr(lrc->bo) +
			     __xe_lrc_wa_bb_offset(lrc) + 1);

	return 0;

fail:
	kfree(buf);
	return -ENOSPC;
}

#define PVC_CTX_ASID		(0x2e + 1)
#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)

static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		       struct xe_vm *vm, u32 ring_size, u16 msix_vec,
		       u32 init_flags)
{
	struct xe_gt *gt = hwe->gt;
	const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
	const u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
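	/*
	 * Rough layout of the BO allocated below (derived from the
	 * __xe_lrc_*_offset() helpers above): the ring buffer sits at offset
	 * 0, followed by the PPHWSP and the engine context image, with the
	 * indirect ring state (when supported) just before the WA BB page
	 * that occupies the last LRC_WA_BB_SIZE bytes.
	 */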
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	u32 bo_flags;
	int err;

	kref_init(&lrc->refcount);
	lrc->gt = gt;
	lrc->size = lrc_size;
	lrc->flags = 0;
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;
	if (xe_gt_has_indirect_ring_state(gt))
		lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;

	bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
		   XE_BO_FLAG_GGTT_INVALIDATE;
	if (vm && vm->xef) /* userspace */
		bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, NULL, bo_size,
				       ttm_bo_type_kernel,
				       bo_flags);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 lrc_size - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	if (xe_device_has_msix(xe)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
				     xe_memirq_status_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
				     xe_memirq_source_ptr(&tile->memirq, hwe));
		xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
	}

	if (xe_gt_has_indirect_ring_state(gt)) {
		xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
				     __xe_lrc_indirect_ring_ggtt_addr(lrc));

		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
					      __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
					      RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	} else {
		xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
		xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
				     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	}

	if (init_flags & XE_LRC_CREATE_RUNALONE)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));

	if (init_flags & XE_LRC_CREATE_PXP)
		xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
				     xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
				     _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));

	lrc->ctx_timestamp = 0;
	xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);

	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	err = setup_wa_bb(lrc, hwe);
	if (err)
		goto err_lrc_finish;

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}

/**
 * xe_lrc_create - Create a LRC
 * @hwe: Hardware Engine
 * @vm: The VM (address space)
 * @ring_size: LRC ring size
 * @msix_vec: MSI-X interrupt vector (for platforms that support it)
 * @flags: LRC initialization flags
 *
 * Allocate and initialize the Logical Ring Context (LRC).
 *
 * Return pointer to created LRC upon success and an error pointer
 * upon failure.
 */
struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
			     u32 ring_size, u16 msix_vec, u32 flags)
{
	struct xe_lrc *lrc;
	int err;

	lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
	if (!lrc)
		return ERR_PTR(-ENOMEM);

	err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
	if (err) {
		kfree(lrc);
		return ERR_PTR(err);
	}

	return lrc;
}

/**
 * xe_lrc_destroy - Destroy the LRC
 * @ref: reference to LRC
 *
 * Called when ref == 0, release resources held by the Logical Ring Context
 * (LRC) and free the LRC memory.
 */
void xe_lrc_destroy(struct kref *ref)
{
	struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);

	xe_lrc_finish(lrc);
	kfree(lrc);
}

void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
}

u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
}

static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}

void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
	else
		xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}

u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	if (xe_lrc_has_indirect_ring_state(lrc))
		return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
	else
		return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}

u32 xe_lrc_ring_space(struct xe_lrc *lrc)
{
	const u32 head = xe_lrc_ring_head(lrc);
	const u32 tail = lrc->ring.tail;
	const u32 size = lrc->ring.size;

	return ((head - tail - 1) & (size - 1)) + 1;
}

static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}

void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}

u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}

u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}

/**
 * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
 *
 * Allocate but don't initialize an lrc seqno fence.
 *
 * Return: Pointer to the allocated fence or
 * negative error pointer on error.
 */
struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
	return xe_hw_fence_alloc();
}

/**
 * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
 * @fence: Pointer to the fence to free.
 *
 * Frees an lrc seqno fence that hasn't yet been
 * initialized.
 */
void xe_lrc_free_seqno_fence(struct dma_fence *fence)
{
	xe_hw_fence_free(fence);
}

/**
 * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
 * @lrc: Pointer to the lrc.
 * @fence: Pointer to the fence to initialize.
 *
 * Initializes a pre-allocated lrc seqno fence.
 * After initialization, the fence is subject to normal
 * dma-fence refcounting.
 */
void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
{
	xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}

s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}

u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}

u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}

struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}

/**
 * xe_lrc_engine_id() - Read engine id value
 * @lrc: Pointer to the lrc.
 *
 * Returns: context id value
 */
static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map map;

	map = __xe_lrc_engine_id_map(lrc);
	return xe_map_read32(xe, &map);
}

static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}

static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords. We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}

static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
	MATCH3D(3DSTATE_COARSE_PIXEL);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}

static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}

void xe_lrc_dump_default(struct drm_printer *p,
			 struct xe_gt *gt,
			 enum xe_engine_class hwe_class)
{
	u32 *dw;
	int remaining_dw, num_dw;

	if (!gt->default_lrc[hwe_class]) {
		drm_printf(p, "No default LRC for class %d\n", hwe_class);
		return;
	}

	/*
	 * Skip the beginning of the LRC since it contains the per-process
	 * hardware status page.
	 */
	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
	remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;

	while (remaining_dw > 0) {
		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
		} else {
			num_dw = min(instr_dw(*dw), remaining_dw);
			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
				   num_dw);
		}

		dw += num_dw;
		remaining_dw -= num_dw;
	}
}

struct instr_state {
	u32 instr;
	u16 num_dw;
};

static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};

void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
{
	struct xe_gt *gt = q->hwe->gt;
	struct xe_device *xe = gt_to_xe(gt);
	const struct instr_state *state_table = NULL;
	int state_table_size = 0;

	/*
	 * Wa_14019789679
	 *
	 * If the driver doesn't explicitly emit the SVG instructions while
	 * setting up the default LRC, the context switch will write 0's
	 * (noops) into the LRC memory rather than the expected instruction
	 * headers. Application contexts start out as a copy of the default
	 * LRC, and if they also do not emit specific settings for some SVG
	 * state, then on context restore they'll unintentionally inherit
	 * whatever state setting the previous context had programmed into the
	 * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
	 * prevent the hardware from resetting that state back to any specific
	 * value).
	 *
	 * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
	 * since that's a specific state setting that can easily cause GPU
	 * hangs if unintentionally inherited. However to be safe we'll
	 * continue to emit all of the SVG state since it's best not to leak
	 * any of the state between contexts, even if that leakage is harmless.
	 */
	if (XE_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
		state_table = xe_hpg_svg_state;
		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
	}

	if (!state_table) {
		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
		return;
	}

	for (int i = 0; i < state_table_size; i++) {
		u32 instr = state_table[i].instr;
		u16 num_dw = state_table[i].num_dw;
		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);

		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
		xe_gt_assert(gt, num_dw != 0);
		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));

		/*
		 * Xe2's SVG context is the same as the one on DG2 / MTL
		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
		 * Just make the replacement here rather than defining a
		 * whole separate table for the single trivial change.
		 */
		if (GRAPHICS_VER(xe) >= 20 &&
		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;

		bb->cs[bb->len] = instr;
		if (!is_single_dw)
			bb->cs[bb->len] |= (num_dw - 2);

		bb->len += num_dw;
	}
}

struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
{
	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);

	if (!snapshot)
		return NULL;

	snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
	snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
	snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
	snapshot->head = xe_lrc_ring_head(lrc);
	snapshot->tail.internal = lrc->ring.tail;
	snapshot->tail.memory = xe_lrc_ring_tail(lrc);
	snapshot->start = xe_lrc_ring_start(lrc);
	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
	snapshot->seqno = xe_lrc_seqno(lrc);
	snapshot->lrc_bo = xe_bo_get(lrc->bo);
	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
	snapshot->lrc_size = lrc->size;
	snapshot->lrc_snapshot = NULL;
	snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
	snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
	return snapshot;
}

void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
{
	struct xe_bo *bo;
	struct iosys_map src;

	if (!snapshot)
		return;

	bo = snapshot->lrc_bo;
	snapshot->lrc_bo = NULL;

	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
	if (!snapshot->lrc_snapshot)
		goto put_bo;

	xe_bo_lock(bo, false);
	if (!ttm_bo_vmap(&bo->ttm, &src)) {
		xe_map_memcpy_from(xe_bo_device(bo),
				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
				   snapshot->lrc_size);
		ttm_bo_vunmap(&bo->ttm, &src);
	} else {
		kvfree(snapshot->lrc_snapshot);
		snapshot->lrc_snapshot = NULL;
	}
	xe_bo_unlock(bo);
put_bo:
	xe_bo_put(bo);
}

void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
{
	unsigned long i;

	if (!snapshot)
		return;

	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
	drm_printf(p, "\tHW Ring address: 0x%08x\n",
		   snapshot->ring_addr);
	drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
		   snapshot->indirect_context_desc);
	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
		   snapshot->tail.internal, snapshot->tail.memory);
	drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
	drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
	drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);

	if (!snapshot->lrc_snapshot)
		return;

	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWSP].data: ");
	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}

	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
	drm_puts(p, "\t[HWCTX].data: ");
	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
		u32 *val = snapshot->lrc_snapshot + i;
		char dumped[ASCII85_BUFSZ];

		drm_puts(p, ascii85_encode(*val, dumped));
	}
	drm_puts(p, "\n");
}

void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
{
	if (!snapshot)
		return;

	kvfree(snapshot->lrc_snapshot);
	if (snapshot->lrc_bo)
		xe_bo_put(snapshot->lrc_bo);

	kfree(snapshot);
}

static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
{
	u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
	u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
	struct xe_hw_engine *hwe;
	u64 val;

	hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
	if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
			    "Unexpected engine class:instance %d:%d for context utilization\n",
			    class, instance))
		return -1;

	if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
		val = xe_mmio_read64_2x32(&hwe->gt->mmio,
					  RING_CTX_TIMESTAMP(hwe->mmio_base));
	else
		val = xe_mmio_read32(&hwe->gt->mmio,
				     RING_CTX_TIMESTAMP(hwe->mmio_base));

	*reg_ctx_ts = val;

	return 0;
}

/**
 * xe_lrc_update_timestamp() - Update ctx timestamp
 * @lrc: Pointer to the lrc.
 * @old_ts: Old timestamp value
 *
 * Populate @old_ts with the current saved ctx timestamp, read the new ctx
 * timestamp and update the saved value. With support for active contexts, the
 * calculation may be slightly racy, so follow a read-again logic to ensure that
 * the context is still active before returning the right timestamp.
 *
 * Returns: New ctx timestamp value
 */
u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
	u64 lrc_ts, reg_ts;
	u32 engine_id;

	*old_ts = lrc->ctx_timestamp;

	lrc_ts = xe_lrc_ctx_timestamp(lrc);
	/* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
	if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
		lrc->ctx_timestamp = lrc_ts;
		goto done;
	}

	if (lrc_ts == CONTEXT_ACTIVE) {
		engine_id = xe_lrc_engine_id(lrc);
		if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
			lrc->ctx_timestamp = reg_ts;

		/* read lrc again to ensure context is still active */
		lrc_ts = xe_lrc_ctx_timestamp(lrc);
	}

	/*
	 * If context switched out, just use the lrc_ts. Note that this needs to
	 * be a separate if condition.
	 */
	if (lrc_ts != CONTEXT_ACTIVE)
		lrc->ctx_timestamp = lrc_ts;

done:
	trace_xe_lrc_update_timestamp(lrc, *old_ts);

	return lrc->ctx_timestamp;
}

/**
 * xe_lrc_ring_is_idle() - LRC is idle
 * @lrc: Pointer to the lrc.
 *
 * Compare LRC ring head and tail to determine if idle.
 *
 * Return: True if ring is idle, False otherwise
 */
bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
{
	return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
}