// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating a MI_LOAD_REGISTER_IMM command, allow setting
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers to set values to in case of
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
 * number of registers. They are set by using the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200 while the latter is for values bigger
 * than that. Those macros already set all the bits documented below correctly:
 *
 * [7]: when a register offset needs more than 6 bits, use additional bytes
 *      (which follow this one) for the lower bits
 * [6:0]: register offset, without considering the engine base.
 *
 * This function only tweaks the commands and register offsets. Values are not
 * filled out.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

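/*
 * Worked example (illustrative only): the first bytes of gen8_xcs_offsets
 * below are NOP(1), LRI(11, 0), REG16(0x244), REG(0x034), ... and decode as
 *
 *   NOP(1)       -> 0x81: bit 7 set, so skip one dword of the context image
 *   LRI(11, 0)   -> 0x0b: emit MI_LOAD_REGISTER_IMM(11), not force-posted
 *   REG16(0x244) -> 0x81, 0x11: offset = ((0x1 << 7) | 0x11) << 2 = 0x244,
 *                   so the next dword becomes engine->mmio_base + 0x244
 *   REG(0x034)   -> 0x0d: offset = 0x0d << 2 = 0x034
 *
 * The value dword following each register offset is skipped (regs += 2) and
 * left unwritten by set_offsets(); the tables themselves are cross-checked
 * against the default context image by live_lrc_layout() in the selftests.
 */
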
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 mtl_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	NOP(4),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines of a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
			return mtl_xcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x80;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x64;
	else if (GRAPHICS_VER(engine->i915) >= 8 &&
		 engine->class == RENDER_CLASS)
		return 0xc4;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	else if (GRAPHICS_VER(engine->i915) >= 8)
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;

	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);

	return 0;
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}

static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}

static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) >= 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

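/*
 * Rough sketch of the context object created by __lrc_alloc_state() above:
 * page 0 holds the per-process HWSP, the register state image follows at
 * LRC_STATE_OFFSET, and after engine->context_size come an optional redzone
 * page (CONFIG_DRM_I915_DEBUG_GEM), an optional per-context indirect WA
 * batch page on Gen12+ (ce->wa_bb_page) and, for GuC parallel parents, the
 * parent scratch area. The lrc_ring_*() helpers earlier in this file return
 * dword indices into that register state image (the MI_LRI register-offset
 * slot; callers add 1 to reach the corresponding value slot).
 */
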
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

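/*
 * Rough lifecycle of the lrc_* entry points above (a sketch, not a strict
 * contract): lrc_alloc() creates the state object and ring, lrc_pre_pin()
 * maps the image, lrc_pin() points lrc_reg_state at LRC_STATE_OFFSET, runs
 * lrc_init_state() on first use and refreshes the ring registers/descriptor
 * via lrc_update_regs(); lrc_unpin()/lrc_post_unpin() undo the pin, and
 * lrc_fini()/lrc_destroy() release the ring and state.
 */
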
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/*
 * On DG2, a hang is seen on RCS during context restore of a preempted
 * context in GPGPU mode. This is extremely timing dependent. To address
 * it, the software workaround batch buffer below is applied on DG2 A
 * steppings.
 */
static u32 *
dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
	*cs++ = 0x21;

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);

	return cs;
}

/*
 * The bspec's tuning guide asks us to program a vertical watermark value of
 * 0x3FF. However, this register is not saved/restored properly by the
 * hardware, so we're required to apply the desired value via an INDIRECT_CTX
 * batch buffer to ensure the value takes effect properly. All other bits
 * in this register should remain at 0 (the hardware default).
 */
static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_22011450934:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
		cs = dg2_emit_rcs_hang_wabb(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915))
		cs = gen12_emit_aux_table_inv(ce->engine->gt,
					      cs, GEN12_GFX_CCS_AUX_NV);

	/* Wa_16014892111 */
	if (IS_DG2(ce->engine->i915))
		cs = dg2_emit_draw_watermark_setting(cs);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		if (ce->engine->class == COMPUTE_CLASS)
			cs = gen8_emit_pipe_control(cs,
						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
						    0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915)) {
		if (ce->engine->class == VIDEO_DECODE_CLASS)
			cs = gen12_emit_aux_table_inv(ce->engine->gt,
						      cs, GEN12_VD0_AUX_NV);
		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
			cs = gen12_emit_aux_table_inv(ce->engine->gt,
						      cs, GEN12_VE0_AUX_NV);
	}

	return cs;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       lrc_indirect_bb(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
 * but there is a slight complication as this is applied in a WA batch where the
 * values are only initialized once, so we cannot take the register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We could of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9 so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on a criteria. At the moment this batch always starts at the beginning of the
 * page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6
		 * devices; however, it is safe to load the default
		 * configuration of a 3x6 device instead of masking off
		 * the corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to the appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to set up the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif