1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2014 Intel Corporation 4 */ 5 6 #include <drm/drm_print.h> 7 8 #include "gem/i915_gem_lmem.h" 9 10 #include "gen8_engine_cs.h" 11 #include "i915_drv.h" 12 #include "i915_perf.h" 13 #include "i915_reg.h" 14 #include "intel_context.h" 15 #include "intel_engine.h" 16 #include "intel_engine_regs.h" 17 #include "intel_gpu_commands.h" 18 #include "intel_gt.h" 19 #include "intel_gt_regs.h" 20 #include "intel_lrc.h" 21 #include "intel_lrc_reg.h" 22 #include "intel_ring.h" 23 #include "shmem_utils.h" 24 25 /* 26 * The per-platform tables are u8-encoded in @data. Decode @data and set the 27 * addresses' offset and commands in @regs. The following encoding is used 28 * for each byte. There are 2 steps: decoding commands and decoding addresses. 29 * 30 * Commands: 31 * [7]: create NOPs - number of NOPs are set in lower bits 32 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set 33 * MI_LRI_FORCE_POSTED 34 * [5:0]: Number of NOPs or registers to set values to in case of 35 * MI_LOAD_REGISTER_IMM 36 * 37 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" 38 * number of registers. They are set by using the REG/REG16 macros: the former 39 * is used for offsets smaller than 0x200 while the latter is for values bigger 40 * than that. Those macros already set all the bits documented below correctly: 41 * 42 * [7]: When a register offset needs more than 6 bits, use additional bytes, to 43 * follow, for the lower bits 44 * [6:0]: Register offset, without considering the engine base. 45 * 46 * This function only tweaks the commands and register offsets. Values are not 47 * filled out. 48 */ 49 static void set_offsets(u32 *regs, 50 const u8 *data, 51 const struct intel_engine_cs *engine, 52 bool close) 53 #define NOP(x) (BIT(7) | (x)) 54 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 55 #define POSTED BIT(0) 56 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 57 #define REG16(x) \ 58 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 59 (((x) >> 2) & 0x7f) 60 #define END 0 61 { 62 const u32 base = engine->mmio_base; 63 64 while (*data) { 65 u8 count, flags; 66 67 if (*data & BIT(7)) { /* skip */ 68 count = *data++ & ~BIT(7); 69 regs += count; 70 continue; 71 } 72 73 count = *data & 0x3f; 74 flags = *data >> 6; 75 data++; 76 77 *regs = MI_LOAD_REGISTER_IMM(count); 78 if (flags & POSTED) 79 *regs |= MI_LRI_FORCE_POSTED; 80 if (GRAPHICS_VER(engine->i915) >= 11) 81 *regs |= MI_LRI_LRM_CS_MMIO; 82 regs++; 83 84 GEM_BUG_ON(!count); 85 do { 86 u32 offset = 0; 87 u8 v; 88 89 do { 90 v = *data++; 91 offset <<= 7; 92 offset |= v & ~BIT(7); 93 } while (v & BIT(7)); 94 95 regs[0] = base + (offset << 2); 96 regs += 2; 97 } while (--count); 98 } 99 100 if (close) { 101 /* Close the batch; used mainly by live_lrc_layout() */ 102 *regs = MI_BATCH_BUFFER_END; 103 if (GRAPHICS_VER(engine->i915) >= 11) 104 *regs |= BIT(0); 105 } 106 } 107 108 static const u8 gen8_xcs_offsets[] = { 109 NOP(1), 110 LRI(11, 0), 111 REG16(0x244), 112 REG(0x034), 113 REG(0x030), 114 REG(0x038), 115 REG(0x03c), 116 REG(0x168), 117 REG(0x140), 118 REG(0x110), 119 REG(0x11c), 120 REG(0x114), 121 REG(0x118), 122 123 NOP(9), 124 LRI(9, 0), 125 REG16(0x3a8), 126 REG16(0x28c), 127 REG16(0x288), 128 REG16(0x284), 129 REG16(0x280), 130 REG16(0x27c), 131 REG16(0x278), 132 REG16(0x274), 133 REG16(0x270), 134 135 NOP(13), 136 LRI(2, 0), 137 REG16(0x200), 138 REG(0x028), 139 140 END 141 }; 142 143 static const u8 
gen9_xcs_offsets[] = { 144 NOP(1), 145 LRI(14, POSTED), 146 REG16(0x244), 147 REG(0x034), 148 REG(0x030), 149 REG(0x038), 150 REG(0x03c), 151 REG(0x168), 152 REG(0x140), 153 REG(0x110), 154 REG(0x11c), 155 REG(0x114), 156 REG(0x118), 157 REG(0x1c0), 158 REG(0x1c4), 159 REG(0x1c8), 160 161 NOP(3), 162 LRI(9, POSTED), 163 REG16(0x3a8), 164 REG16(0x28c), 165 REG16(0x288), 166 REG16(0x284), 167 REG16(0x280), 168 REG16(0x27c), 169 REG16(0x278), 170 REG16(0x274), 171 REG16(0x270), 172 173 NOP(13), 174 LRI(1, POSTED), 175 REG16(0x200), 176 177 NOP(13), 178 LRI(44, POSTED), 179 REG(0x028), 180 REG(0x09c), 181 REG(0x0c0), 182 REG(0x178), 183 REG(0x17c), 184 REG16(0x358), 185 REG(0x170), 186 REG(0x150), 187 REG(0x154), 188 REG(0x158), 189 REG16(0x41c), 190 REG16(0x600), 191 REG16(0x604), 192 REG16(0x608), 193 REG16(0x60c), 194 REG16(0x610), 195 REG16(0x614), 196 REG16(0x618), 197 REG16(0x61c), 198 REG16(0x620), 199 REG16(0x624), 200 REG16(0x628), 201 REG16(0x62c), 202 REG16(0x630), 203 REG16(0x634), 204 REG16(0x638), 205 REG16(0x63c), 206 REG16(0x640), 207 REG16(0x644), 208 REG16(0x648), 209 REG16(0x64c), 210 REG16(0x650), 211 REG16(0x654), 212 REG16(0x658), 213 REG16(0x65c), 214 REG16(0x660), 215 REG16(0x664), 216 REG16(0x668), 217 REG16(0x66c), 218 REG16(0x670), 219 REG16(0x674), 220 REG16(0x678), 221 REG16(0x67c), 222 REG(0x068), 223 224 END 225 }; 226 227 static const u8 gen12_xcs_offsets[] = { 228 NOP(1), 229 LRI(13, POSTED), 230 REG16(0x244), 231 REG(0x034), 232 REG(0x030), 233 REG(0x038), 234 REG(0x03c), 235 REG(0x168), 236 REG(0x140), 237 REG(0x110), 238 REG(0x1c0), 239 REG(0x1c4), 240 REG(0x1c8), 241 REG(0x180), 242 REG16(0x2b4), 243 244 NOP(5), 245 LRI(9, POSTED), 246 REG16(0x3a8), 247 REG16(0x28c), 248 REG16(0x288), 249 REG16(0x284), 250 REG16(0x280), 251 REG16(0x27c), 252 REG16(0x278), 253 REG16(0x274), 254 REG16(0x270), 255 256 END 257 }; 258 259 static const u8 dg2_xcs_offsets[] = { 260 NOP(1), 261 LRI(15, POSTED), 262 REG16(0x244), 263 REG(0x034), 264 REG(0x030), 265 REG(0x038), 266 REG(0x03c), 267 REG(0x168), 268 REG(0x140), 269 REG(0x110), 270 REG(0x1c0), 271 REG(0x1c4), 272 REG(0x1c8), 273 REG(0x180), 274 REG16(0x2b4), 275 REG(0x120), 276 REG(0x124), 277 278 NOP(1), 279 LRI(9, POSTED), 280 REG16(0x3a8), 281 REG16(0x28c), 282 REG16(0x288), 283 REG16(0x284), 284 REG16(0x280), 285 REG16(0x27c), 286 REG16(0x278), 287 REG16(0x274), 288 REG16(0x270), 289 290 END 291 }; 292 293 static const u8 gen8_rcs_offsets[] = { 294 NOP(1), 295 LRI(14, POSTED), 296 REG16(0x244), 297 REG(0x034), 298 REG(0x030), 299 REG(0x038), 300 REG(0x03c), 301 REG(0x168), 302 REG(0x140), 303 REG(0x110), 304 REG(0x11c), 305 REG(0x114), 306 REG(0x118), 307 REG(0x1c0), 308 REG(0x1c4), 309 REG(0x1c8), 310 311 NOP(3), 312 LRI(9, POSTED), 313 REG16(0x3a8), 314 REG16(0x28c), 315 REG16(0x288), 316 REG16(0x284), 317 REG16(0x280), 318 REG16(0x27c), 319 REG16(0x278), 320 REG16(0x274), 321 REG16(0x270), 322 323 NOP(13), 324 LRI(1, 0), 325 REG(0x0c8), 326 327 END 328 }; 329 330 static const u8 gen9_rcs_offsets[] = { 331 NOP(1), 332 LRI(14, POSTED), 333 REG16(0x244), 334 REG(0x34), 335 REG(0x30), 336 REG(0x38), 337 REG(0x3c), 338 REG(0x168), 339 REG(0x140), 340 REG(0x110), 341 REG(0x11c), 342 REG(0x114), 343 REG(0x118), 344 REG(0x1c0), 345 REG(0x1c4), 346 REG(0x1c8), 347 348 NOP(3), 349 LRI(9, POSTED), 350 REG16(0x3a8), 351 REG16(0x28c), 352 REG16(0x288), 353 REG16(0x284), 354 REG16(0x280), 355 REG16(0x27c), 356 REG16(0x278), 357 REG16(0x274), 358 REG16(0x270), 359 360 NOP(13), 361 LRI(1, 0), 362 REG(0xc8), 363 364 NOP(13), 365 
LRI(44, POSTED), 366 REG(0x28), 367 REG(0x9c), 368 REG(0xc0), 369 REG(0x178), 370 REG(0x17c), 371 REG16(0x358), 372 REG(0x170), 373 REG(0x150), 374 REG(0x154), 375 REG(0x158), 376 REG16(0x41c), 377 REG16(0x600), 378 REG16(0x604), 379 REG16(0x608), 380 REG16(0x60c), 381 REG16(0x610), 382 REG16(0x614), 383 REG16(0x618), 384 REG16(0x61c), 385 REG16(0x620), 386 REG16(0x624), 387 REG16(0x628), 388 REG16(0x62c), 389 REG16(0x630), 390 REG16(0x634), 391 REG16(0x638), 392 REG16(0x63c), 393 REG16(0x640), 394 REG16(0x644), 395 REG16(0x648), 396 REG16(0x64c), 397 REG16(0x650), 398 REG16(0x654), 399 REG16(0x658), 400 REG16(0x65c), 401 REG16(0x660), 402 REG16(0x664), 403 REG16(0x668), 404 REG16(0x66c), 405 REG16(0x670), 406 REG16(0x674), 407 REG16(0x678), 408 REG16(0x67c), 409 REG(0x68), 410 411 END 412 }; 413 414 static const u8 gen11_rcs_offsets[] = { 415 NOP(1), 416 LRI(15, POSTED), 417 REG16(0x244), 418 REG(0x034), 419 REG(0x030), 420 REG(0x038), 421 REG(0x03c), 422 REG(0x168), 423 REG(0x140), 424 REG(0x110), 425 REG(0x11c), 426 REG(0x114), 427 REG(0x118), 428 REG(0x1c0), 429 REG(0x1c4), 430 REG(0x1c8), 431 REG(0x180), 432 433 NOP(1), 434 LRI(9, POSTED), 435 REG16(0x3a8), 436 REG16(0x28c), 437 REG16(0x288), 438 REG16(0x284), 439 REG16(0x280), 440 REG16(0x27c), 441 REG16(0x278), 442 REG16(0x274), 443 REG16(0x270), 444 445 LRI(1, POSTED), 446 REG(0x1b0), 447 448 NOP(10), 449 LRI(1, 0), 450 REG(0x0c8), 451 452 END 453 }; 454 455 static const u8 gen12_rcs_offsets[] = { 456 NOP(1), 457 LRI(13, POSTED), 458 REG16(0x244), 459 REG(0x034), 460 REG(0x030), 461 REG(0x038), 462 REG(0x03c), 463 REG(0x168), 464 REG(0x140), 465 REG(0x110), 466 REG(0x1c0), 467 REG(0x1c4), 468 REG(0x1c8), 469 REG(0x180), 470 REG16(0x2b4), 471 472 NOP(5), 473 LRI(9, POSTED), 474 REG16(0x3a8), 475 REG16(0x28c), 476 REG16(0x288), 477 REG16(0x284), 478 REG16(0x280), 479 REG16(0x27c), 480 REG16(0x278), 481 REG16(0x274), 482 REG16(0x270), 483 484 LRI(3, POSTED), 485 REG(0x1b0), 486 REG16(0x5a8), 487 REG16(0x5ac), 488 489 NOP(6), 490 LRI(1, 0), 491 REG(0x0c8), 492 NOP(3 + 9 + 1), 493 494 LRI(51, POSTED), 495 REG16(0x588), 496 REG16(0x588), 497 REG16(0x588), 498 REG16(0x588), 499 REG16(0x588), 500 REG16(0x588), 501 REG(0x028), 502 REG(0x09c), 503 REG(0x0c0), 504 REG(0x178), 505 REG(0x17c), 506 REG16(0x358), 507 REG(0x170), 508 REG(0x150), 509 REG(0x154), 510 REG(0x158), 511 REG16(0x41c), 512 REG16(0x600), 513 REG16(0x604), 514 REG16(0x608), 515 REG16(0x60c), 516 REG16(0x610), 517 REG16(0x614), 518 REG16(0x618), 519 REG16(0x61c), 520 REG16(0x620), 521 REG16(0x624), 522 REG16(0x628), 523 REG16(0x62c), 524 REG16(0x630), 525 REG16(0x634), 526 REG16(0x638), 527 REG16(0x63c), 528 REG16(0x640), 529 REG16(0x644), 530 REG16(0x648), 531 REG16(0x64c), 532 REG16(0x650), 533 REG16(0x654), 534 REG16(0x658), 535 REG16(0x65c), 536 REG16(0x660), 537 REG16(0x664), 538 REG16(0x668), 539 REG16(0x66c), 540 REG16(0x670), 541 REG16(0x674), 542 REG16(0x678), 543 REG16(0x67c), 544 REG(0x068), 545 REG(0x084), 546 NOP(1), 547 548 END 549 }; 550 551 static const u8 dg2_rcs_offsets[] = { 552 NOP(1), 553 LRI(15, POSTED), 554 REG16(0x244), 555 REG(0x034), 556 REG(0x030), 557 REG(0x038), 558 REG(0x03c), 559 REG(0x168), 560 REG(0x140), 561 REG(0x110), 562 REG(0x1c0), 563 REG(0x1c4), 564 REG(0x1c8), 565 REG(0x180), 566 REG16(0x2b4), 567 REG(0x120), 568 REG(0x124), 569 570 NOP(1), 571 LRI(9, POSTED), 572 REG16(0x3a8), 573 REG16(0x28c), 574 REG16(0x288), 575 REG16(0x284), 576 REG16(0x280), 577 REG16(0x27c), 578 REG16(0x278), 579 REG16(0x274), 580 REG16(0x270), 581 582 LRI(3, 
POSTED), 583 REG(0x1b0), 584 REG16(0x5a8), 585 REG16(0x5ac), 586 587 NOP(6), 588 LRI(1, 0), 589 REG(0x0c8), 590 591 END 592 }; 593 594 static const u8 mtl_rcs_offsets[] = { 595 NOP(1), 596 LRI(15, POSTED), 597 REG16(0x244), 598 REG(0x034), 599 REG(0x030), 600 REG(0x038), 601 REG(0x03c), 602 REG(0x168), 603 REG(0x140), 604 REG(0x110), 605 REG(0x1c0), 606 REG(0x1c4), 607 REG(0x1c8), 608 REG(0x180), 609 REG16(0x2b4), 610 REG(0x120), 611 REG(0x124), 612 613 NOP(1), 614 LRI(9, POSTED), 615 REG16(0x3a8), 616 REG16(0x28c), 617 REG16(0x288), 618 REG16(0x284), 619 REG16(0x280), 620 REG16(0x27c), 621 REG16(0x278), 622 REG16(0x274), 623 REG16(0x270), 624 625 NOP(2), 626 LRI(2, POSTED), 627 REG16(0x5a8), 628 REG16(0x5ac), 629 630 NOP(6), 631 LRI(1, 0), 632 REG(0x0c8), 633 634 END 635 }; 636 637 #undef END 638 #undef REG16 639 #undef REG 640 #undef LRI 641 #undef NOP 642 643 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 644 { 645 /* 646 * The gen12+ lists only have the registers we program in the basic 647 * default state. We rely on the context image using relative 648 * addressing to automatic fixup the register state between the 649 * physical engines for virtual engine. 650 */ 651 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 && 652 !intel_engine_has_relative_mmio(engine)); 653 654 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) { 655 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70)) 656 return mtl_rcs_offsets; 657 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 658 return dg2_rcs_offsets; 659 else if (GRAPHICS_VER(engine->i915) >= 12) 660 return gen12_rcs_offsets; 661 else if (GRAPHICS_VER(engine->i915) >= 11) 662 return gen11_rcs_offsets; 663 else if (GRAPHICS_VER(engine->i915) >= 9) 664 return gen9_rcs_offsets; 665 else 666 return gen8_rcs_offsets; 667 } else { 668 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 669 return dg2_xcs_offsets; 670 else if (GRAPHICS_VER(engine->i915) >= 12) 671 return gen12_xcs_offsets; 672 else if (GRAPHICS_VER(engine->i915) >= 9) 673 return gen9_xcs_offsets; 674 else 675 return gen8_xcs_offsets; 676 } 677 } 678 679 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 680 { 681 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 682 return 0x70; 683 else if (GRAPHICS_VER(engine->i915) >= 12) 684 return 0x60; 685 else if (GRAPHICS_VER(engine->i915) >= 9) 686 return 0x54; 687 else if (engine->class == RENDER_CLASS) 688 return 0x58; 689 else 690 return -1; 691 } 692 693 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine) 694 { 695 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 696 return 0x80; 697 else if (GRAPHICS_VER(engine->i915) >= 12) 698 return 0x70; 699 else if (GRAPHICS_VER(engine->i915) >= 9) 700 return 0x64; 701 else if (GRAPHICS_VER(engine->i915) >= 8 && 702 engine->class == RENDER_CLASS) 703 return 0xc4; 704 else 705 return -1; 706 } 707 708 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 709 { 710 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 711 return 0x84; 712 else if (GRAPHICS_VER(engine->i915) >= 12) 713 return 0x74; 714 else if (GRAPHICS_VER(engine->i915) >= 9) 715 return 0x68; 716 else if (engine->class == RENDER_CLASS) 717 return 0xd8; 718 else 719 return -1; 720 } 721 722 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 723 { 724 if (GRAPHICS_VER(engine->i915) >= 12) 725 return 0x12; 726 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS) 727 return 0x18; 728 else 729 return -1; 730 } 731 732 
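/*
 * For illustration: the lrc_ring_*() helpers around here return the dword
 * index of a register *offset* slot within the saved register state; the
 * corresponding value lives at index + 1 (see __reset_stop_ring(),
 * lrc_setup_bb_per_ctx() and lrc_setup_indirect_ctx() below). A rough sketch,
 * assuming the Gen12 branches above:
 *
 *	lrc_ring_wa_bb_per_ctx()   -> 0x12, value written at regs[0x13]
 *	lrc_ring_indirect_ptr()    -> 0x14, value written at regs[0x15]
 *	lrc_ring_indirect_offset() -> 0x16, value written at regs[0x17]
 */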
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 733 { 734 int x; 735 736 x = lrc_ring_wa_bb_per_ctx(engine); 737 if (x < 0) 738 return x; 739 740 return x + 2; 741 } 742 743 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 744 { 745 int x; 746 747 x = lrc_ring_indirect_ptr(engine); 748 if (x < 0) 749 return x; 750 751 return x + 2; 752 } 753 754 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 755 { 756 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 757 /* 758 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL 759 * simply to match the RCS context image layout. 760 */ 761 return 0xc6; 762 else if (engine->class != RENDER_CLASS) 763 return -1; 764 else if (GRAPHICS_VER(engine->i915) >= 12) 765 return 0xb6; 766 else if (GRAPHICS_VER(engine->i915) >= 11) 767 return 0xaa; 768 else 769 return -1; 770 } 771 772 static u32 773 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 774 { 775 if (GRAPHICS_VER(engine->i915) >= 12) 776 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 777 else if (GRAPHICS_VER(engine->i915) >= 11) 778 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 779 else if (GRAPHICS_VER(engine->i915) >= 9) 780 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 781 else if (GRAPHICS_VER(engine->i915) >= 8) 782 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 783 784 GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8); 785 786 return 0; 787 } 788 789 static void 790 lrc_setup_bb_per_ctx(u32 *regs, 791 const struct intel_engine_cs *engine, 792 u32 ctx_bb_ggtt_addr) 793 { 794 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 795 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 796 ctx_bb_ggtt_addr | 797 PER_CTX_BB_FORCE | 798 PER_CTX_BB_VALID; 799 } 800 801 static void 802 lrc_setup_indirect_ctx(u32 *regs, 803 const struct intel_engine_cs *engine, 804 u32 ctx_bb_ggtt_addr, 805 u32 size) 806 { 807 GEM_BUG_ON(!size); 808 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 809 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 810 regs[lrc_ring_indirect_ptr(engine) + 1] = 811 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 812 813 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 814 regs[lrc_ring_indirect_offset(engine) + 1] = 815 lrc_ring_indirect_offset_default(engine) << 6; 816 } 817 818 static bool ctx_needs_runalone(const struct intel_context *ce) 819 { 820 struct i915_gem_context *gem_ctx; 821 bool ctx_is_protected = false; 822 823 /* 824 * Wa_14019159160 - Case 2. 825 * On some platforms, protected contexts require setting 826 * the LRC run-alone bit or else the encryption/decryption will not happen. 827 * NOTE: Case 2 only applies to PXP use-case of said workaround. 
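 *
 * When this returns true, init_common_regs() below in turn sets
 * GEN12_CTX_CTRL_RUNALONE_MODE in CTX_CONTEXT_CONTROL using a masked write
 * (_MASKED_BIT_ENABLE), i.e. the upper 16 bits of the value carry the per-bit
 * write-enable mask for the lower 16 bits.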
828 */ 829 if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) && 830 (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) { 831 rcu_read_lock(); 832 gem_ctx = rcu_dereference(ce->gem_context); 833 if (gem_ctx) 834 ctx_is_protected = gem_ctx->uses_protected_content; 835 rcu_read_unlock(); 836 } 837 838 return ctx_is_protected; 839 } 840 841 static void init_common_regs(u32 * const regs, 842 const struct intel_context *ce, 843 const struct intel_engine_cs *engine, 844 bool inhibit) 845 { 846 u32 ctl; 847 int loc; 848 849 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 850 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 851 if (inhibit) 852 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 853 if (GRAPHICS_VER(engine->i915) < 11) 854 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 855 CTX_CTRL_RS_CTX_ENABLE); 856 /* Wa_14019159160 - Case 2.*/ 857 if (ctx_needs_runalone(ce)) 858 ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE); 859 regs[CTX_CONTEXT_CONTROL] = ctl; 860 861 regs[CTX_TIMESTAMP] = ce->stats.runtime.last; 862 863 loc = lrc_ring_bb_offset(engine); 864 if (loc != -1) 865 regs[loc + 1] = 0; 866 } 867 868 static void init_wa_bb_regs(u32 * const regs, 869 const struct intel_engine_cs *engine) 870 { 871 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 872 873 if (wa_ctx->per_ctx.size) { 874 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 875 876 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 877 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 878 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 879 } 880 881 if (wa_ctx->indirect_ctx.size) { 882 lrc_setup_indirect_ctx(regs, engine, 883 i915_ggtt_offset(wa_ctx->vma) + 884 wa_ctx->indirect_ctx.offset, 885 wa_ctx->indirect_ctx.size); 886 } 887 } 888 889 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt) 890 { 891 if (i915_vm_is_4lvl(&ppgtt->vm)) { 892 /* 64b PPGTT (48bit canonical) 893 * PDP0_DESCRIPTOR contains the base address to PML4 and 894 * other PDP Descriptors are ignored. 895 */ 896 ASSIGN_CTX_PML4(ppgtt, regs); 897 } else { 898 ASSIGN_CTX_PDP(ppgtt, regs, 3); 899 ASSIGN_CTX_PDP(ppgtt, regs, 2); 900 ASSIGN_CTX_PDP(ppgtt, regs, 1); 901 ASSIGN_CTX_PDP(ppgtt, regs, 0); 902 } 903 } 904 905 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 906 { 907 if (i915_is_ggtt(vm)) 908 return i915_vm_to_ggtt(vm)->alias; 909 else 910 return i915_vm_to_ppgtt(vm); 911 } 912 913 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 914 { 915 int x; 916 917 x = lrc_ring_mi_mode(engine); 918 if (x != -1) { 919 regs[x + 1] &= ~STOP_RING; 920 regs[x + 1] |= STOP_RING << 16; 921 } 922 } 923 924 static void __lrc_init_regs(u32 *regs, 925 const struct intel_context *ce, 926 const struct intel_engine_cs *engine, 927 bool inhibit) 928 { 929 /* 930 * A context is actually a big batch buffer with several 931 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 932 * values we are setting here are only for the first context restore: 933 * on a subsequent save, the GPU will recreate this batchbuffer with new 934 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 935 * we are not initializing here). 936 * 937 * Must keep consistent with virtual_update_register_offsets(). 
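 *
 * As a rough example, with the gen12_rcs_offsets[] table above (NOP(1),
 * LRI(13, POSTED), REG16(0x244), ...), set_offsets() skips regs[0], writes
 * regs[1] = MI_LOAD_REGISTER_IMM(13) | MI_LRI_FORCE_POSTED | MI_LRI_LRM_CS_MMIO
 * and regs[2] = engine->mmio_base + 0x244, while the value slot at regs[3]
 * (CTX_CONTEXT_CONTROL) is only filled in later by init_common_regs().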
938 */ 939 940 if (inhibit) 941 memset(regs, 0, PAGE_SIZE); 942 943 set_offsets(regs, reg_offsets(engine), engine, inhibit); 944 945 init_common_regs(regs, ce, engine, inhibit); 946 init_ppgtt_regs(regs, vm_alias(ce->vm)); 947 948 init_wa_bb_regs(regs, engine); 949 950 __reset_stop_ring(regs, engine); 951 } 952 953 void lrc_init_regs(const struct intel_context *ce, 954 const struct intel_engine_cs *engine, 955 bool inhibit) 956 { 957 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit); 958 } 959 960 void lrc_reset_regs(const struct intel_context *ce, 961 const struct intel_engine_cs *engine) 962 { 963 __reset_stop_ring(ce->lrc_reg_state, engine); 964 } 965 966 static void 967 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 968 { 969 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 970 return; 971 972 vaddr += engine->context_size; 973 974 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 975 } 976 977 static void 978 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 979 { 980 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 981 return; 982 983 vaddr += engine->context_size; 984 985 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 986 drm_err_once(&engine->i915->drm, 987 "%s context redzone overwritten!\n", 988 engine->name); 989 } 990 991 static u32 context_wa_bb_offset(const struct intel_context *ce) 992 { 993 return PAGE_SIZE * ce->wa_bb_page; 994 } 995 996 /* 997 * per_ctx below determines which WABB section is used. 998 * When true, the function returns the location of the 999 * PER_CTX_BB. When false, the function returns the 1000 * location of the INDIRECT_CTX. 1001 */ 1002 static u32 *context_wabb(const struct intel_context *ce, bool per_ctx) 1003 { 1004 void *ptr; 1005 1006 GEM_BUG_ON(!ce->wa_bb_page); 1007 1008 ptr = ce->lrc_reg_state; 1009 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 1010 ptr += context_wa_bb_offset(ce); 1011 ptr += per_ctx ? PAGE_SIZE : 0; 1012 1013 return ptr; 1014 } 1015 1016 void lrc_init_state(struct intel_context *ce, 1017 struct intel_engine_cs *engine, 1018 void *state) 1019 { 1020 bool inhibit = true; 1021 1022 set_redzone(state, engine); 1023 1024 if (ce->default_state) { 1025 shmem_read(ce->default_state, 0, state, engine->context_size); 1026 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 1027 inhibit = false; 1028 } 1029 1030 /* Clear the ppHWSP (inc. per-context counters) */ 1031 memset(state, 0, PAGE_SIZE); 1032 1033 /* Clear the indirect wa and storage */ 1034 if (ce->wa_bb_page) 1035 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE); 1036 1037 /* 1038 * The second page of the context object contains some registers which 1039 * must be set up prior to the first execution. 
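 *
 * Roughly, the context object as allocated by __lrc_alloc_state() is laid out
 * as: page 0 = per-process HWSP (cleared above), the register state starting
 * at LRC_STATE_OFFSET (initialized just below) and, when ce->wa_bb_page is
 * set, an INDIRECT_CTX page followed by a PER_CTX_BB page.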
1040 */ 1041 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit); 1042 } 1043 1044 u32 lrc_indirect_bb(const struct intel_context *ce) 1045 { 1046 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce); 1047 } 1048 1049 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs) 1050 { 1051 /* If predication is active, this will be noop'ed */ 1052 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2); 1053 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA; 1054 *cs++ = 0; 1055 *cs++ = 0; /* No predication */ 1056 1057 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */ 1058 *cs++ = MI_BATCH_BUFFER_END | BIT(15); 1059 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE; 1060 1061 /* Instructions are no longer predicated (disabled), we can proceed */ 1062 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2); 1063 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA; 1064 *cs++ = 0; 1065 *cs++ = 1; /* enable predication before the next BB */ 1066 1067 *cs++ = MI_BATCH_BUFFER_END; 1068 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA); 1069 1070 return cs; 1071 } 1072 1073 static struct i915_vma * 1074 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine) 1075 { 1076 struct drm_i915_gem_object *obj; 1077 struct i915_vma *vma; 1078 u32 context_size; 1079 1080 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 1081 1082 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1083 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 1084 1085 if (GRAPHICS_VER(engine->i915) >= 12) { 1086 ce->wa_bb_page = context_size / PAGE_SIZE; 1087 /* INDIRECT_CTX and PER_CTX_BB need separate pages. */ 1088 context_size += PAGE_SIZE * 2; 1089 } 1090 1091 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) { 1092 ce->parallel.guc.parent_page = context_size / PAGE_SIZE; 1093 context_size += PARENT_SCRATCH_SIZE; 1094 } 1095 1096 obj = i915_gem_object_create_lmem(engine->i915, context_size, 1097 I915_BO_ALLOC_PM_VOLATILE); 1098 if (IS_ERR(obj)) { 1099 obj = i915_gem_object_create_shmem(engine->i915, context_size); 1100 if (IS_ERR(obj)) 1101 return ERR_CAST(obj); 1102 1103 /* 1104 * Wa_22016122933: For Media version 13.0, all Media GT shared 1105 * memory needs to be mapped as WC on CPU side and UC (PAT 1106 * index 2) on GPU side. 
1107 */ 1108 if (intel_gt_needs_wa_22016122933(engine->gt)) 1109 i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE); 1110 } 1111 1112 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 1113 if (IS_ERR(vma)) { 1114 i915_gem_object_put(obj); 1115 return vma; 1116 } 1117 1118 return vma; 1119 } 1120 1121 static struct intel_timeline * 1122 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine) 1123 { 1124 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 1125 1126 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl)); 1127 } 1128 1129 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine) 1130 { 1131 struct intel_ring *ring; 1132 struct i915_vma *vma; 1133 int err; 1134 1135 GEM_BUG_ON(ce->state); 1136 1137 if (!intel_context_has_own_state(ce)) 1138 ce->default_state = engine->default_state; 1139 1140 vma = __lrc_alloc_state(ce, engine); 1141 if (IS_ERR(vma)) 1142 return PTR_ERR(vma); 1143 1144 ring = intel_engine_create_ring(engine, ce->ring_size); 1145 if (IS_ERR(ring)) { 1146 err = PTR_ERR(ring); 1147 goto err_vma; 1148 } 1149 1150 if (!page_mask_bits(ce->timeline)) { 1151 struct intel_timeline *tl; 1152 1153 /* 1154 * Use the static global HWSP for the kernel context, and 1155 * a dynamically allocated cacheline for everyone else. 1156 */ 1157 if (unlikely(ce->timeline)) 1158 tl = pinned_timeline(ce, engine); 1159 else 1160 tl = intel_timeline_create(engine->gt); 1161 if (IS_ERR(tl)) { 1162 err = PTR_ERR(tl); 1163 goto err_ring; 1164 } 1165 1166 ce->timeline = tl; 1167 } 1168 1169 ce->ring = ring; 1170 ce->state = vma; 1171 1172 return 0; 1173 1174 err_ring: 1175 intel_ring_put(ring); 1176 err_vma: 1177 i915_vma_put(vma); 1178 return err; 1179 } 1180 1181 void lrc_reset(struct intel_context *ce) 1182 { 1183 GEM_BUG_ON(!intel_context_is_pinned(ce)); 1184 1185 intel_ring_reset(ce->ring, ce->ring->emit); 1186 1187 /* Scrub away the garbage */ 1188 lrc_init_regs(ce, ce->engine, true); 1189 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail); 1190 } 1191 1192 int 1193 lrc_pre_pin(struct intel_context *ce, 1194 struct intel_engine_cs *engine, 1195 struct i915_gem_ww_ctx *ww, 1196 void **vaddr) 1197 { 1198 GEM_BUG_ON(!ce->state); 1199 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 1200 1201 *vaddr = i915_gem_object_pin_map(ce->state->obj, 1202 intel_gt_coherent_map_type(ce->engine->gt, 1203 ce->state->obj, 1204 false) | 1205 I915_MAP_OVERRIDE); 1206 1207 return PTR_ERR_OR_ZERO(*vaddr); 1208 } 1209 1210 int 1211 lrc_pin(struct intel_context *ce, 1212 struct intel_engine_cs *engine, 1213 void *vaddr) 1214 { 1215 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 1216 1217 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags)) 1218 lrc_init_state(ce, engine, vaddr); 1219 1220 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail); 1221 return 0; 1222 } 1223 1224 void lrc_unpin(struct intel_context *ce) 1225 { 1226 if (unlikely(ce->parallel.last_rq)) { 1227 i915_request_put(ce->parallel.last_rq); 1228 ce->parallel.last_rq = NULL; 1229 } 1230 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 1231 ce->engine); 1232 } 1233 1234 void lrc_post_unpin(struct intel_context *ce) 1235 { 1236 i915_gem_object_unpin_map(ce->state->obj); 1237 } 1238 1239 void lrc_fini(struct intel_context *ce) 1240 { 1241 if (!ce->state) 1242 return; 1243 1244 intel_ring_put(fetch_and_zero(&ce->ring)); 1245 i915_vma_put(fetch_and_zero(&ce->state)); 1246 } 1247 1248 void lrc_destroy(struct kref *kref) 1249 { 1250 struct intel_context *ce 
= container_of(kref, typeof(*ce), ref); 1251 1252 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 1253 GEM_BUG_ON(intel_context_is_pinned(ce)); 1254 1255 lrc_fini(ce); 1256 1257 intel_context_fini(ce); 1258 intel_context_free(ce); 1259 } 1260 1261 static u32 * 1262 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 1263 { 1264 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1265 MI_SRM_LRM_GLOBAL_GTT | 1266 MI_LRI_LRM_CS_MMIO; 1267 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1268 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1269 CTX_TIMESTAMP * sizeof(u32); 1270 *cs++ = 0; 1271 1272 *cs++ = MI_LOAD_REGISTER_REG | 1273 MI_LRR_SOURCE_CS_MMIO | 1274 MI_LRI_LRM_CS_MMIO; 1275 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1276 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 1277 1278 *cs++ = MI_LOAD_REGISTER_REG | 1279 MI_LRR_SOURCE_CS_MMIO | 1280 MI_LRI_LRM_CS_MMIO; 1281 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1282 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 1283 1284 return cs; 1285 } 1286 1287 static u32 * 1288 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 1289 { 1290 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 1291 1292 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1293 MI_SRM_LRM_GLOBAL_GTT | 1294 MI_LRI_LRM_CS_MMIO; 1295 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1296 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1297 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 1298 *cs++ = 0; 1299 1300 return cs; 1301 } 1302 1303 static u32 * 1304 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 1305 { 1306 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 1307 1308 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1309 MI_SRM_LRM_GLOBAL_GTT | 1310 MI_LRI_LRM_CS_MMIO; 1311 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1312 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1313 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 1314 *cs++ = 0; 1315 1316 *cs++ = MI_LOAD_REGISTER_REG | 1317 MI_LRR_SOURCE_CS_MMIO | 1318 MI_LRI_LRM_CS_MMIO; 1319 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1320 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 1321 1322 return cs; 1323 } 1324 1325 /* 1326 * The bspec's tuning guide asks us to program a vertical watermark value of 1327 * 0x3FF. However this register is not saved/restored properly by the 1328 * hardware, so we're required to apply the desired value via INDIRECT_CTX 1329 * batch buffer to ensure the value takes effect properly. All other bits 1330 * in this register should remain at 0 (the hardware default). 
1331 */ 1332 static u32 * 1333 dg2_emit_draw_watermark_setting(u32 *cs) 1334 { 1335 *cs++ = MI_LOAD_REGISTER_IMM(1); 1336 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK); 1337 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF); 1338 1339 return cs; 1340 } 1341 1342 static u32 * 1343 gen12_invalidate_state_cache(u32 *cs) 1344 { 1345 *cs++ = MI_LOAD_REGISTER_IMM(1); 1346 *cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2); 1347 *cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE); 1348 return cs; 1349 } 1350 1351 static u32 * 1352 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 1353 { 1354 cs = gen12_emit_timestamp_wa(ce, cs); 1355 cs = gen12_emit_cmd_buf_wa(ce, cs); 1356 cs = gen12_emit_restore_scratch(ce, cs); 1357 1358 /* Wa_16013000631:dg2 */ 1359 if (IS_DG2_G11(ce->engine->i915)) 1360 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); 1361 1362 cs = gen12_emit_aux_table_inv(ce->engine, cs); 1363 1364 /* Wa_18022495364 */ 1365 if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10))) 1366 cs = gen12_invalidate_state_cache(cs); 1367 1368 /* Wa_16014892111 */ 1369 if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 1370 IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) || 1371 IS_DG2(ce->engine->i915)) 1372 cs = dg2_emit_draw_watermark_setting(cs); 1373 1374 return cs; 1375 } 1376 1377 static u32 * 1378 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 1379 { 1380 cs = gen12_emit_timestamp_wa(ce, cs); 1381 cs = gen12_emit_restore_scratch(ce, cs); 1382 1383 /* Wa_16013000631:dg2 */ 1384 if (IS_DG2_G11(ce->engine->i915)) 1385 if (ce->engine->class == COMPUTE_CLASS) 1386 cs = gen8_emit_pipe_control(cs, 1387 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 1388 0); 1389 1390 return gen12_emit_aux_table_inv(ce->engine, cs); 1391 } 1392 1393 static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs) 1394 { 1395 struct intel_gt *gt = ce->engine->gt; 1396 int mocs = gt->mocs.uc_index << 1; 1397 1398 /** 1399 * Wa_16018031267 / Wa_16018063123 requires that SW forces the 1400 * main copy engine arbitration into round robin mode. We 1401 * additionally need to submit the following WABB blt command 1402 * to produce 4 subblits with each subblit generating 0 byte 1403 * write requests as WABB: 1404 * 1405 * XY_FASTCOLOR_BLT 1406 * BG0 -> 5100000E 1407 * BG1 -> 0000003F (Dest pitch) 1408 * BG2 -> 00000000 (X1, Y1) = (0, 0) 1409 * BG3 -> 00040001 (X2, Y2) = (1, 4) 1410 * BG4 -> scratch 1411 * BG5 -> scratch 1412 * BG6-12 -> 00000000 1413 * BG13 -> 20004004 (Surf. Width= 2,Surf. 
Height = 5 ) 1414 * BG14 -> 00000010 (Qpitch = 4) 1415 * BG15 -> 00000000 1416 */ 1417 *cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2); 1418 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f; 1419 *cs++ = 0; 1420 *cs++ = 4 << 16 | 1; 1421 *cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma)); 1422 *cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma)); 1423 *cs++ = 0; 1424 *cs++ = 0; 1425 *cs++ = 0; 1426 *cs++ = 0; 1427 *cs++ = 0; 1428 *cs++ = 0; 1429 *cs++ = 0; 1430 *cs++ = 0x20004004; 1431 *cs++ = 0x10; 1432 *cs++ = 0; 1433 1434 return cs; 1435 } 1436 1437 static u32 * 1438 xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs) 1439 { 1440 /* Wa_16018031267, Wa_16018063123 */ 1441 if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine)) 1442 cs = xehp_emit_fastcolor_blt_wabb(ce, cs); 1443 1444 return cs; 1445 } 1446 1447 static void 1448 setup_per_ctx_bb(const struct intel_context *ce, 1449 const struct intel_engine_cs *engine, 1450 u32 *(*emit)(const struct intel_context *, u32 *)) 1451 { 1452 /* Place PER_CTX_BB on next page after INDIRECT_CTX */ 1453 u32 * const start = context_wabb(ce, true); 1454 u32 *cs; 1455 1456 cs = emit(ce, start); 1457 1458 /* PER_CTX_BB must manually terminate */ 1459 *cs++ = MI_BATCH_BUFFER_END; 1460 1461 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 1462 lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine, 1463 lrc_indirect_bb(ce) + PAGE_SIZE); 1464 } 1465 1466 static void 1467 setup_indirect_ctx_bb(const struct intel_context *ce, 1468 const struct intel_engine_cs *engine, 1469 u32 *(*emit)(const struct intel_context *, u32 *)) 1470 { 1471 u32 * const start = context_wabb(ce, false); 1472 u32 *cs; 1473 1474 cs = emit(ce, start); 1475 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 1476 while ((unsigned long)cs % CACHELINE_BYTES) 1477 *cs++ = MI_NOOP; 1478 1479 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1480 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1481 1482 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine, 1483 lrc_indirect_bb(ce), 1484 (cs - start) * sizeof(*cs)); 1485 } 1486 1487 /* 1488 * The context descriptor encodes various attributes of a context, 1489 * including its GTT address and some flags. Because it's fairly 1490 * expensive to calculate, we'll just do it once and cache the result, 1491 * which remains valid until the context is unpinned. 1492 * 1493 * This is what a descriptor looks like, from LSB to MSB:: 1494 * 1495 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 1496 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 1497 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 1498 * bits 53-54: mbz, reserved for use by hardware 1499 * bits 55-63: group ID, currently unused and set to 0 1500 * 1501 * Starting from Gen11, the upper dword of the descriptor has a new format: 1502 * 1503 * bits 32-36: reserved 1504 * bits 37-47: SW context ID 1505 * bits 48:53: engine instance 1506 * bit 54: mbz, reserved for use by hardware 1507 * bits 55-60: SW counter 1508 * bits 61-63: engine class 1509 * 1510 * On Xe_HP, the upper dword of the descriptor has a new format: 1511 * 1512 * bits 32-37: virtual function number 1513 * bit 38: mbz, reserved for use by hardware 1514 * bits 39-54: SW context ID 1515 * bits 55-57: reserved 1516 * bits 58-63: SW counter 1517 * 1518 * engine info, SW context ID and SW counter need to form a unique number 1519 * (Context ID) per lrc. 
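 *
 * The lower dword built by lrc_descriptor() below is, roughly:
 *
 *	desc  = addressing mode (INTEL_LEGACY_32B/64B_CONTEXT)
 *			<< GEN8_CTX_ADDRESSING_MODE_SHIFT;
 *	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;  (+ GEN8_CTX_L3LLC_COHERENT on Gen8)
 *	desc |= i915_ggtt_offset(ce->state);          (the LRCA)
 *
 * with CTX_DESC_FORCE_RESTORE OR'ed in by lrc_update_regs().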
1520 */ 1521 static u32 lrc_descriptor(const struct intel_context *ce) 1522 { 1523 u32 desc; 1524 1525 desc = INTEL_LEGACY_32B_CONTEXT; 1526 if (i915_vm_is_4lvl(ce->vm)) 1527 desc = INTEL_LEGACY_64B_CONTEXT; 1528 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 1529 1530 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 1531 if (GRAPHICS_VER(ce->vm->i915) == 8) 1532 desc |= GEN8_CTX_L3LLC_COHERENT; 1533 1534 return i915_ggtt_offset(ce->state) | desc; 1535 } 1536 1537 u32 lrc_update_regs(const struct intel_context *ce, 1538 const struct intel_engine_cs *engine, 1539 u32 head) 1540 { 1541 struct intel_ring *ring = ce->ring; 1542 u32 *regs = ce->lrc_reg_state; 1543 1544 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 1545 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 1546 1547 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1548 regs[CTX_RING_HEAD] = head; 1549 regs[CTX_RING_TAIL] = ring->tail; 1550 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1551 1552 /* RPCS */ 1553 if (engine->class == RENDER_CLASS) { 1554 regs[CTX_R_PWR_CLK_STATE] = 1555 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 1556 1557 i915_oa_init_reg_state(ce, engine); 1558 } 1559 1560 if (ce->wa_bb_page) { 1561 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 1562 1563 fn = gen12_emit_indirect_ctx_xcs; 1564 if (ce->engine->class == RENDER_CLASS) 1565 fn = gen12_emit_indirect_ctx_rcs; 1566 1567 /* Mutually exclusive wrt to global indirect bb */ 1568 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 1569 setup_indirect_ctx_bb(ce, engine, fn); 1570 setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb); 1571 } 1572 1573 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE; 1574 } 1575 1576 void lrc_update_offsets(struct intel_context *ce, 1577 struct intel_engine_cs *engine) 1578 { 1579 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false); 1580 } 1581 1582 void lrc_check_regs(const struct intel_context *ce, 1583 const struct intel_engine_cs *engine, 1584 const char *when) 1585 { 1586 const struct intel_ring *ring = ce->ring; 1587 u32 *regs = ce->lrc_reg_state; 1588 bool valid = true; 1589 int x; 1590 1591 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1592 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1593 engine->name, 1594 regs[CTX_RING_START], 1595 i915_ggtt_offset(ring->vma)); 1596 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1597 valid = false; 1598 } 1599 1600 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1601 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1602 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1603 engine->name, 1604 regs[CTX_RING_CTL], 1605 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1606 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1607 valid = false; 1608 } 1609 1610 x = lrc_ring_mi_mode(engine); 1611 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1612 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1613 engine->name, regs[x + 1]); 1614 regs[x + 1] &= ~STOP_RING; 1615 regs[x + 1] |= STOP_RING << 16; 1616 valid = false; 1617 } 1618 1619 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1620 } 1621 1622 /* 1623 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 1624 * PIPE_CONTROL instruction. 
 * This is required for the flush to happen correctly, but there is a slight
 * complication: this is applied in a WA batch where the values are only
 * initialized once, so we cannot read the register value at the beginning and
 * reuse it later. Hence we save its value to memory, upload a constant value
 * with bit21 set and then restore the register from the saved value. To
 * simplify the WA, a constant value is formed by using the default value of
 * this register. This shouldn't be a problem because we are only modifying it
 * for a short period and this batch is non-preemptible. We could of course use
 * additional instructions that read the actual value of the register at that
 * time and set our bit of interest, but that would make the WA more
 * complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer, which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and we don't yet have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds NOOPs
 * as padding to make it cacheline aligned. MI_BATCH_BUFFER_END is added to the
 * per-ctx batch and the two together make a complete batch buffer.
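 *
 * The offset/size recorded for these batches are later consumed by
 * init_wa_bb_regs(): the indirect_ctx batch is programmed via
 * lrc_setup_indirect_ctx() as ggtt_offset | (size / CACHELINE_BYTES), which is
 * why the cacheline padding below matters, while the per_ctx batch address is
 * written with bit 0 (valid) set.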
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* The actual scratch location is at a 128 byte offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to the end of the cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the CTX_RCS_INDIRECT_CTX register.
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on the device type (2x6 or 3x6) and needs to be updated
		 * based on which subslice is disabled, especially for 2x6
		 * devices. However, it is safe to load the default 3x6
		 * configuration instead of masking off the corresponding
		 * bits, because the HW ignores the bits of a disabled
		 * subslice and drops down to the appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * the possible configurations; to avoid duplication they are
		 * not shown here again.
1787 */ 1788 *batch++ = GEN9_MEDIA_POOL_STATE; 1789 *batch++ = GEN9_MEDIA_POOL_ENABLE; 1790 *batch++ = 0x00777000; 1791 *batch++ = 0; 1792 *batch++ = 0; 1793 *batch++ = 0; 1794 } 1795 1796 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1797 1798 /* Pad to end of cacheline */ 1799 while ((unsigned long)batch % CACHELINE_BYTES) 1800 *batch++ = MI_NOOP; 1801 1802 return batch; 1803 } 1804 1805 #define CTX_WA_BB_SIZE (PAGE_SIZE) 1806 1807 static int lrc_create_wa_ctx(struct intel_engine_cs *engine) 1808 { 1809 struct drm_i915_gem_object *obj; 1810 struct i915_vma *vma; 1811 int err; 1812 1813 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE); 1814 if (IS_ERR(obj)) 1815 return PTR_ERR(obj); 1816 1817 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 1818 if (IS_ERR(vma)) { 1819 err = PTR_ERR(vma); 1820 goto err; 1821 } 1822 1823 engine->wa_ctx.vma = vma; 1824 return 0; 1825 1826 err: 1827 i915_gem_object_put(obj); 1828 return err; 1829 } 1830 1831 void lrc_fini_wa_ctx(struct intel_engine_cs *engine) 1832 { 1833 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 1834 } 1835 1836 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 1837 1838 void lrc_init_wa_ctx(struct intel_engine_cs *engine) 1839 { 1840 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 1841 struct i915_wa_ctx_bb *wa_bb[] = { 1842 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx 1843 }; 1844 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)]; 1845 struct i915_gem_ww_ctx ww; 1846 void *batch, *batch_ptr; 1847 unsigned int i; 1848 int err; 1849 1850 if (GRAPHICS_VER(engine->i915) >= 11 || 1851 !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)) 1852 return; 1853 1854 if (GRAPHICS_VER(engine->i915) == 9) { 1855 wa_bb_fn[0] = gen9_init_indirectctx_bb; 1856 wa_bb_fn[1] = NULL; 1857 } else if (GRAPHICS_VER(engine->i915) == 8) { 1858 wa_bb_fn[0] = gen8_init_indirectctx_bb; 1859 wa_bb_fn[1] = NULL; 1860 } 1861 1862 err = lrc_create_wa_ctx(engine); 1863 if (err) { 1864 /* 1865 * We continue even if we fail to initialize WA batch 1866 * because we only expect rare glitches but nothing 1867 * critical to prevent us from using GPU 1868 */ 1869 drm_err(&engine->i915->drm, 1870 "Ignoring context switch w/a allocation error:%d\n", 1871 err); 1872 return; 1873 } 1874 1875 if (!engine->wa_ctx.vma) 1876 return; 1877 1878 i915_gem_ww_ctx_init(&ww, true); 1879 retry: 1880 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww); 1881 if (!err) 1882 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH); 1883 if (err) 1884 goto err; 1885 1886 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 1887 if (IS_ERR(batch)) { 1888 err = PTR_ERR(batch); 1889 goto err_unpin; 1890 } 1891 1892 /* 1893 * Emit the two workaround batch buffers, recording the offset from the 1894 * start of the workaround batch buffer object for each and their 1895 * respective sizes. 
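 *
 * With the gen8/gen9 setup above only wa_bb_fn[0] is populated, so the
 * expected result is wa_bb[0] = { .offset = 0, .size = bytes emitted
 * (cacheline aligned) }, while wa_bb[1] records the aligned end offset with a
 * zero size.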
1896 */ 1897 batch_ptr = batch; 1898 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 1899 wa_bb[i]->offset = batch_ptr - batch; 1900 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 1901 CACHELINE_BYTES))) { 1902 err = -EINVAL; 1903 break; 1904 } 1905 if (wa_bb_fn[i]) 1906 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 1907 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 1908 } 1909 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE); 1910 1911 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 1912 __i915_gem_object_release_map(wa_ctx->vma->obj); 1913 1914 /* Verify that we can handle failure to setup the wa_ctx */ 1915 if (!err) 1916 err = i915_inject_probe_error(engine->i915, -ENODEV); 1917 1918 err_unpin: 1919 if (err) 1920 i915_vma_unpin(wa_ctx->vma); 1921 err: 1922 if (err == -EDEADLK) { 1923 err = i915_gem_ww_ctx_backoff(&ww); 1924 if (!err) 1925 goto retry; 1926 } 1927 i915_gem_ww_ctx_fini(&ww); 1928 1929 if (err) { 1930 i915_vma_put(engine->wa_ctx.vma); 1931 1932 /* Clear all flags to prevent further use */ 1933 memset(wa_ctx, 0, sizeof(*wa_ctx)); 1934 } 1935 } 1936 1937 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt) 1938 { 1939 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1940 stats->runtime.num_underflow++; 1941 stats->runtime.max_underflow = 1942 max_t(u32, stats->runtime.max_underflow, -dt); 1943 #endif 1944 } 1945 1946 static u32 lrc_get_runtime(const struct intel_context *ce) 1947 { 1948 /* 1949 * We can use either ppHWSP[16] which is recorded before the context 1950 * switch (and so excludes the cost of context switches) or use the 1951 * value from the context image itself, which is saved/restored earlier 1952 * and so includes the cost of the save. 1953 */ 1954 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 1955 } 1956 1957 void lrc_update_runtime(struct intel_context *ce) 1958 { 1959 struct intel_context_stats *stats = &ce->stats; 1960 u32 old; 1961 s32 dt; 1962 1963 old = stats->runtime.last; 1964 stats->runtime.last = lrc_get_runtime(ce); 1965 dt = stats->runtime.last - old; 1966 if (!dt) 1967 return; 1968 1969 if (unlikely(dt < 0)) { 1970 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1971 old, stats->runtime.last, dt); 1972 st_runtime_underflow(stats, dt); 1973 return; 1974 } 1975 1976 ewma_runtime_add(&stats->runtime.avg, dt); 1977 stats->runtime.total += dt; 1978 } 1979 1980 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1981 #include "selftest_lrc.c" 1982 #endif 1983