// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: When creating an MI_LOAD_REGISTER_IMM command, whether to set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: Number of NOPs or registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after an MI_LOAD_REGISTER_IMM command, "count"
 * registers at a time. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: Set when the register offset needs more than 7 bits; additional bytes,
 *      carrying the lower bits, follow
 * [6:0]: Register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};
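
/*
 * Illustration of the encoding above (not additional table data): the tail of
 * gen8_xcs_offsets, "NOP(13), LRI(2, 0), REG16(0x200), REG(0x028)", decodes
 * as: advance over 13 dwords, then emit MI_LOAD_REGISTER_IMM(2) followed by
 * two register slots at engine->mmio_base + 0x200 (two-byte REG16 encoding,
 * since 0x200 >> 2 does not fit in 7 bits) and engine->mmio_base + 0x028.
 * The value dword after each offset is left for the caller/hardware to fill.
 */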

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x80;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x64;
	else if (GRAPHICS_VER(engine->i915) >= 8 &&
		 engine->class == RENDER_CLASS)
		return 0xc4;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 8)
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;

	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);

	return 0;
}
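
/*
 * Note on the lrc_ring_*() helpers above: apart from
 * lrc_ring_indirect_offset_default(), which returns a register value, they
 * return the dword index within the context image of the register-address
 * entry of the corresponding MI_LOAD_REGISTER_IMM pair, or -1 when the
 * platform has no such slot; the value to program lives one dword later,
 * which is why the callers below write regs[x + 1].
 */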

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static bool ctx_needs_runalone(const struct intel_context *ce)
{
	struct i915_gem_context *gem_ctx;
	bool ctx_is_protected = false;

	/*
	 * On MTL and newer platforms, protected contexts require setting
	 * the LRC run-alone bit or else the encryption will not happen.
	 */
	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
		rcu_read_lock();
		gem_ctx = rcu_dereference(ce->gem_context);
		if (gem_ctx)
			ctx_is_protected = gem_ctx->uses_protected_content;
		rcu_read_unlock();
	}

	return ctx_is_protected;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	if (ctx_needs_runalone(ce))
		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}
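
/*
 * The CTX_CONTEXT_CONTROL writes above rely on the masked-register format:
 * _MASKED_BIT_ENABLE()/_MASKED_BIT_DISABLE() place the bit value in the low
 * half-word and the corresponding write-enable mask in the high half-word,
 * so a single LRI can toggle individual control bits without disturbing the
 * others.
 */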

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}
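
/*
 * Note on the "inhibit" flag used by lrc_init_state() below: it is passed as
 * true when there is no default (golden) state to copy into the image; in
 * that case __lrc_init_regs() zeroes the register state and
 * init_common_regs() sets CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT so that the
 * uninitialised image is not loaded on the first context restore.
 */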

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}

static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}
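
/*
 * Rough layout of the context object allocated below, in order: the engine
 * context image itself (the per-process HWSP first, with the register state
 * starting at LRC_STATE_OFFSET), then a redzone page when
 * CONFIG_DRM_I915_DEBUG_GEM is enabled, an extra page for the indirect wa_bb
 * on gen12+, and a parent scratch area when GuC parallel submission is used.
 */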

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) >= 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj)) {
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		/*
		 * Wa_22016122933: For Media version 13.0, all Media GT shared
		 * memory needs to be mapped as WC on CPU side and UC (PAT
		 * index 2) on GPU side.
		 */
		if (intel_gt_needs_wa_22016122933(engine->gt))
			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
	}

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 intel_gt_coherent_map_type(ce->engine->gt,
								    ce->state->obj,
								    false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}
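
/*
 * The three emitters above share a pattern: a value is re-loaded from its
 * slot in the context image into CS GPR0 with MI_LOAD_REGISTER_MEM (using
 * MI_LRI_LRM_CS_MMIO for engine-relative register addressing) and, where
 * needed, moved into the live register with MI_LOAD_REGISTER_REG;
 * gen12_emit_restore_scratch() only performs the first step since GPR0 is
 * itself the register being restored. All of these run from the per-context
 * indirect batch assembled by setup_indirect_ctx_bb() below.
 */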

/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF. However this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via INDIRECT_CTX
 * batch buffer to ensure the value takes effect properly. All other bits
 * in this register should remain at 0 (the hardware default).
 */
static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);

	return cs;
}

static u32 *
gen12_invalidate_state_cache(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	cs = gen12_emit_aux_table_inv(ce->engine, cs);

	/* Wa_18022495364 */
	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
		cs = gen12_invalidate_state_cache(cs);

	/* Wa_16014892111 */
	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_DG2(ce->engine->i915))
		cs = dg2_emit_draw_watermark_setting(cs);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_G11(ce->engine->i915))
		if (ce->engine->class == COMPUTE_CLASS)
			cs = gen8_emit_pipe_control(cs,
						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
						    0);

	return gen12_emit_aux_table_inv(ce->engine, cs);
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       lrc_indirect_bb(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
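
/*
 * For example, lrc_descriptor() below composes only the lower dword: the
 * page-aligned GGTT offset of the context state supplies bits 31:12, the
 * addressing mode (legacy 32b vs 64b PPGTT) goes into the GEN8_CTX_* flag
 * field, and GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE are always set. The upper
 * dword (context ID, engine info or VF number) is filled in elsewhere by the
 * submission backend.
 */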

static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
 * but there is a slight complication as this is applied in a WA batch where the
 * values are only initialized once, so we cannot take the register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit 21 set and then restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criterion. At the moment this batch always starts at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDS written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}
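
/*
 * emit_lri() above writes 1 + 2 * count dwords (the MI_LOAD_REGISTER_IMM
 * header plus a register/value pair per entry), which is always odd; the
 * trailing MI_NOOP brings the emission back to an even dword count.
 */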

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6
		 * devices; however it is safe to load the default
		 * configuration of a 3x6 device instead of masking off
		 * the corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to the appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}
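
/*
 * Context runtime accounting: the CTX_TIMESTAMP dword in the context image
 * is used below as the context's accumulated runtime in CS timestamp units
 * (see lrc_get_runtime() for the alternative source). lrc_update_runtime()
 * samples it as a u32 and accumulates the delta since the previous sample;
 * an apparently negative delta is treated as an underflow and recorded for
 * the selftests rather than added to the totals.
 */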

static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif