1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2014 Intel Corporation 4 */ 5 6 #include "gem/i915_gem_lmem.h" 7 8 #include "gen8_engine_cs.h" 9 #include "i915_drv.h" 10 #include "i915_perf.h" 11 #include "i915_reg.h" 12 #include "intel_context.h" 13 #include "intel_engine.h" 14 #include "intel_engine_regs.h" 15 #include "intel_gpu_commands.h" 16 #include "intel_gt.h" 17 #include "intel_gt_regs.h" 18 #include "intel_lrc.h" 19 #include "intel_lrc_reg.h" 20 #include "intel_ring.h" 21 #include "shmem_utils.h" 22 23 /* 24 * The per-platform tables are u8-encoded in @data. Decode @data and set the 25 * addresses' offset and commands in @regs. The following encoding is used 26 * for each byte. There are 2 steps: decoding commands and decoding addresses. 27 * 28 * Commands: 29 * [7]: create NOPs - number of NOPs are set in lower bits 30 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set 31 * MI_LRI_FORCE_POSTED 32 * [5:0]: Number of NOPs or registers to set values to in case of 33 * MI_LOAD_REGISTER_IMM 34 * 35 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" 36 * number of registers. They are set by using the REG/REG16 macros: the former 37 * is used for offsets smaller than 0x200 while the latter is for values bigger 38 * than that. Those macros already set all the bits documented below correctly: 39 * 40 * [7]: When a register offset needs more than 6 bits, use additional bytes, to 41 * follow, for the lower bits 42 * [6:0]: Register offset, without considering the engine base. 43 * 44 * This function only tweaks the commands and register offsets. Values are not 45 * filled out. 46 */ 47 static void set_offsets(u32 *regs, 48 const u8 *data, 49 const struct intel_engine_cs *engine, 50 bool close) 51 #define NOP(x) (BIT(7) | (x)) 52 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 53 #define POSTED BIT(0) 54 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 55 #define REG16(x) \ 56 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 57 (((x) >> 2) & 0x7f) 58 #define END 0 59 { 60 const u32 base = engine->mmio_base; 61 62 while (*data) { 63 u8 count, flags; 64 65 if (*data & BIT(7)) { /* skip */ 66 count = *data++ & ~BIT(7); 67 regs += count; 68 continue; 69 } 70 71 count = *data & 0x3f; 72 flags = *data >> 6; 73 data++; 74 75 *regs = MI_LOAD_REGISTER_IMM(count); 76 if (flags & POSTED) 77 *regs |= MI_LRI_FORCE_POSTED; 78 if (GRAPHICS_VER(engine->i915) >= 11) 79 *regs |= MI_LRI_LRM_CS_MMIO; 80 regs++; 81 82 GEM_BUG_ON(!count); 83 do { 84 u32 offset = 0; 85 u8 v; 86 87 do { 88 v = *data++; 89 offset <<= 7; 90 offset |= v & ~BIT(7); 91 } while (v & BIT(7)); 92 93 regs[0] = base + (offset << 2); 94 regs += 2; 95 } while (--count); 96 } 97 98 if (close) { 99 /* Close the batch; used mainly by live_lrc_layout() */ 100 *regs = MI_BATCH_BUFFER_END; 101 if (GRAPHICS_VER(engine->i915) >= 11) 102 *regs |= BIT(0); 103 } 104 } 105 106 static const u8 gen8_xcs_offsets[] = { 107 NOP(1), 108 LRI(11, 0), 109 REG16(0x244), 110 REG(0x034), 111 REG(0x030), 112 REG(0x038), 113 REG(0x03c), 114 REG(0x168), 115 REG(0x140), 116 REG(0x110), 117 REG(0x11c), 118 REG(0x114), 119 REG(0x118), 120 121 NOP(9), 122 LRI(9, 0), 123 REG16(0x3a8), 124 REG16(0x28c), 125 REG16(0x288), 126 REG16(0x284), 127 REG16(0x280), 128 REG16(0x27c), 129 REG16(0x278), 130 REG16(0x274), 131 REG16(0x270), 132 133 NOP(13), 134 LRI(2, 0), 135 REG16(0x200), 136 REG(0x028), 137 138 END 139 }; 140 141 static const u8 gen9_xcs_offsets[] = { 142 NOP(1), 143 LRI(14, POSTED), 144 REG16(0x244), 145 REG(0x034), 146 REG(0x030), 147 REG(0x038), 148 REG(0x03c), 149 REG(0x168), 150 REG(0x140), 151 REG(0x110), 152 REG(0x11c), 153 REG(0x114), 154 REG(0x118), 155 REG(0x1c0), 156 REG(0x1c4), 157 REG(0x1c8), 158 159 NOP(3), 160 LRI(9, POSTED), 161 REG16(0x3a8), 162 REG16(0x28c), 163 REG16(0x288), 164 REG16(0x284), 165 REG16(0x280), 166 REG16(0x27c), 167 REG16(0x278), 168 REG16(0x274), 169 REG16(0x270), 170 171 NOP(13), 172 LRI(1, POSTED), 173 REG16(0x200), 174 175 NOP(13), 176 LRI(44, POSTED), 177 REG(0x028), 178 REG(0x09c), 179 REG(0x0c0), 180 REG(0x178), 181 REG(0x17c), 182 REG16(0x358), 183 REG(0x170), 184 REG(0x150), 185 REG(0x154), 186 REG(0x158), 187 REG16(0x41c), 188 REG16(0x600), 189 REG16(0x604), 190 REG16(0x608), 191 REG16(0x60c), 192 REG16(0x610), 193 REG16(0x614), 194 REG16(0x618), 195 REG16(0x61c), 196 REG16(0x620), 197 REG16(0x624), 198 REG16(0x628), 199 REG16(0x62c), 200 REG16(0x630), 201 REG16(0x634), 202 REG16(0x638), 203 REG16(0x63c), 204 REG16(0x640), 205 REG16(0x644), 206 REG16(0x648), 207 REG16(0x64c), 208 REG16(0x650), 209 REG16(0x654), 210 REG16(0x658), 211 REG16(0x65c), 212 REG16(0x660), 213 REG16(0x664), 214 REG16(0x668), 215 REG16(0x66c), 216 REG16(0x670), 217 REG16(0x674), 218 REG16(0x678), 219 REG16(0x67c), 220 REG(0x068), 221 222 END 223 }; 224 225 static const u8 gen12_xcs_offsets[] = { 226 NOP(1), 227 LRI(13, POSTED), 228 REG16(0x244), 229 REG(0x034), 230 REG(0x030), 231 REG(0x038), 232 REG(0x03c), 233 REG(0x168), 234 REG(0x140), 235 REG(0x110), 236 REG(0x1c0), 237 REG(0x1c4), 238 REG(0x1c8), 239 REG(0x180), 240 REG16(0x2b4), 241 242 NOP(5), 243 LRI(9, POSTED), 244 REG16(0x3a8), 245 REG16(0x28c), 246 REG16(0x288), 247 REG16(0x284), 248 REG16(0x280), 249 REG16(0x27c), 250 REG16(0x278), 251 REG16(0x274), 252 REG16(0x270), 253 254 END 255 }; 256 257 static const u8 dg2_xcs_offsets[] = { 258 NOP(1), 259 LRI(15, POSTED), 260 REG16(0x244), 261 REG(0x034), 262 REG(0x030), 263 REG(0x038), 264 REG(0x03c), 265 REG(0x168), 266 REG(0x140), 267 REG(0x110), 268 REG(0x1c0), 269 REG(0x1c4), 270 REG(0x1c8), 271 REG(0x180), 272 REG16(0x2b4), 273 REG(0x120), 274 REG(0x124), 275 276 NOP(1), 277 LRI(9, POSTED), 278 REG16(0x3a8), 279 REG16(0x28c), 280 REG16(0x288), 281 REG16(0x284), 282 REG16(0x280), 283 REG16(0x27c), 284 REG16(0x278), 285 REG16(0x274), 286 REG16(0x270), 287 288 END 289 }; 290 291 static const u8 gen8_rcs_offsets[] = { 292 NOP(1), 293 LRI(14, POSTED), 294 REG16(0x244), 295 REG(0x034), 296 REG(0x030), 297 REG(0x038), 298 REG(0x03c), 299 REG(0x168), 300 REG(0x140), 301 REG(0x110), 302 REG(0x11c), 303 REG(0x114), 304 REG(0x118), 305 REG(0x1c0), 306 REG(0x1c4), 307 REG(0x1c8), 308 309 NOP(3), 310 LRI(9, POSTED), 311 REG16(0x3a8), 312 REG16(0x28c), 313 REG16(0x288), 314 REG16(0x284), 315 REG16(0x280), 316 REG16(0x27c), 317 REG16(0x278), 318 REG16(0x274), 319 REG16(0x270), 320 321 NOP(13), 322 LRI(1, 0), 323 REG(0x0c8), 324 325 END 326 }; 327 328 static const u8 gen9_rcs_offsets[] = { 329 NOP(1), 330 LRI(14, POSTED), 331 REG16(0x244), 332 REG(0x34), 333 REG(0x30), 334 REG(0x38), 335 REG(0x3c), 336 REG(0x168), 337 REG(0x140), 338 REG(0x110), 339 REG(0x11c), 340 REG(0x114), 341 REG(0x118), 342 REG(0x1c0), 343 REG(0x1c4), 344 REG(0x1c8), 345 346 NOP(3), 347 LRI(9, POSTED), 348 REG16(0x3a8), 349 REG16(0x28c), 350 REG16(0x288), 351 REG16(0x284), 352 REG16(0x280), 353 REG16(0x27c), 354 REG16(0x278), 355 REG16(0x274), 356 REG16(0x270), 357 358 NOP(13), 359 LRI(1, 0), 360 REG(0xc8), 361 362 NOP(13), 363 LRI(44, POSTED), 364 REG(0x28), 365 REG(0x9c), 366 REG(0xc0), 367 REG(0x178), 368 REG(0x17c), 369 REG16(0x358), 370 REG(0x170), 371 REG(0x150), 372 REG(0x154), 373 REG(0x158), 374 REG16(0x41c), 375 REG16(0x600), 376 REG16(0x604), 377 REG16(0x608), 378 REG16(0x60c), 379 REG16(0x610), 380 REG16(0x614), 381 REG16(0x618), 382 REG16(0x61c), 383 REG16(0x620), 384 REG16(0x624), 385 REG16(0x628), 386 REG16(0x62c), 387 REG16(0x630), 388 REG16(0x634), 389 REG16(0x638), 390 REG16(0x63c), 391 REG16(0x640), 392 REG16(0x644), 393 REG16(0x648), 394 REG16(0x64c), 395 REG16(0x650), 396 REG16(0x654), 397 REG16(0x658), 398 REG16(0x65c), 399 REG16(0x660), 400 REG16(0x664), 401 REG16(0x668), 402 REG16(0x66c), 403 REG16(0x670), 404 REG16(0x674), 405 REG16(0x678), 406 REG16(0x67c), 407 REG(0x68), 408 409 END 410 }; 411 412 static const u8 gen11_rcs_offsets[] = { 413 NOP(1), 414 LRI(15, POSTED), 415 REG16(0x244), 416 REG(0x034), 417 REG(0x030), 418 REG(0x038), 419 REG(0x03c), 420 REG(0x168), 421 REG(0x140), 422 REG(0x110), 423 REG(0x11c), 424 REG(0x114), 425 REG(0x118), 426 REG(0x1c0), 427 REG(0x1c4), 428 REG(0x1c8), 429 REG(0x180), 430 431 NOP(1), 432 LRI(9, POSTED), 433 REG16(0x3a8), 434 REG16(0x28c), 435 REG16(0x288), 436 REG16(0x284), 437 REG16(0x280), 438 REG16(0x27c), 439 REG16(0x278), 440 REG16(0x274), 441 REG16(0x270), 442 443 LRI(1, POSTED), 444 REG(0x1b0), 445 446 NOP(10), 447 LRI(1, 0), 448 REG(0x0c8), 449 450 END 451 }; 452 453 static const u8 gen12_rcs_offsets[] = { 454 NOP(1), 455 LRI(13, POSTED), 456 REG16(0x244), 457 REG(0x034), 458 REG(0x030), 459 REG(0x038), 460 REG(0x03c), 461 REG(0x168), 462 REG(0x140), 463 REG(0x110), 464 REG(0x1c0), 465 REG(0x1c4), 466 REG(0x1c8), 467 REG(0x180), 468 REG16(0x2b4), 469 470 NOP(5), 471 LRI(9, POSTED), 472 REG16(0x3a8), 473 REG16(0x28c), 474 REG16(0x288), 475 REG16(0x284), 476 REG16(0x280), 477 REG16(0x27c), 478 REG16(0x278), 479 REG16(0x274), 480 REG16(0x270), 481 482 LRI(3, POSTED), 483 REG(0x1b0), 484 REG16(0x5a8), 485 REG16(0x5ac), 486 487 NOP(6), 488 LRI(1, 0), 489 REG(0x0c8), 490 NOP(3 + 9 + 1), 491 492 LRI(51, POSTED), 493 REG16(0x588), 494 REG16(0x588), 495 REG16(0x588), 496 REG16(0x588), 497 REG16(0x588), 498 REG16(0x588), 499 REG(0x028), 500 REG(0x09c), 501 REG(0x0c0), 502 REG(0x178), 503 REG(0x17c), 504 REG16(0x358), 505 REG(0x170), 506 REG(0x150), 507 REG(0x154), 508 REG(0x158), 509 REG16(0x41c), 510 REG16(0x600), 511 REG16(0x604), 512 REG16(0x608), 513 REG16(0x60c), 514 REG16(0x610), 515 REG16(0x614), 516 REG16(0x618), 517 REG16(0x61c), 518 REG16(0x620), 519 REG16(0x624), 520 REG16(0x628), 521 REG16(0x62c), 522 REG16(0x630), 523 REG16(0x634), 524 REG16(0x638), 525 REG16(0x63c), 526 REG16(0x640), 527 REG16(0x644), 528 REG16(0x648), 529 REG16(0x64c), 530 REG16(0x650), 531 REG16(0x654), 532 REG16(0x658), 533 REG16(0x65c), 534 REG16(0x660), 535 REG16(0x664), 536 REG16(0x668), 537 REG16(0x66c), 538 REG16(0x670), 539 REG16(0x674), 540 REG16(0x678), 541 REG16(0x67c), 542 REG(0x068), 543 REG(0x084), 544 NOP(1), 545 546 END 547 }; 548 549 static const u8 dg2_rcs_offsets[] = { 550 NOP(1), 551 LRI(15, POSTED), 552 REG16(0x244), 553 REG(0x034), 554 REG(0x030), 555 REG(0x038), 556 REG(0x03c), 557 REG(0x168), 558 REG(0x140), 559 REG(0x110), 560 REG(0x1c0), 561 REG(0x1c4), 562 REG(0x1c8), 563 REG(0x180), 564 REG16(0x2b4), 565 REG(0x120), 566 REG(0x124), 567 568 NOP(1), 569 LRI(9, POSTED), 570 REG16(0x3a8), 571 REG16(0x28c), 572 REG16(0x288), 573 REG16(0x284), 574 REG16(0x280), 575 REG16(0x27c), 576 REG16(0x278), 577 REG16(0x274), 578 REG16(0x270), 579 580 LRI(3, POSTED), 581 REG(0x1b0), 582 REG16(0x5a8), 583 REG16(0x5ac), 584 585 NOP(6), 586 LRI(1, 0), 587 REG(0x0c8), 588 589 END 590 }; 591 592 static const u8 mtl_rcs_offsets[] = { 593 NOP(1), 594 LRI(15, POSTED), 595 REG16(0x244), 596 REG(0x034), 597 REG(0x030), 598 REG(0x038), 599 REG(0x03c), 600 REG(0x168), 601 REG(0x140), 602 REG(0x110), 603 REG(0x1c0), 604 REG(0x1c4), 605 REG(0x1c8), 606 REG(0x180), 607 REG16(0x2b4), 608 REG(0x120), 609 REG(0x124), 610 611 NOP(1), 612 LRI(9, POSTED), 613 REG16(0x3a8), 614 REG16(0x28c), 615 REG16(0x288), 616 REG16(0x284), 617 REG16(0x280), 618 REG16(0x27c), 619 REG16(0x278), 620 REG16(0x274), 621 REG16(0x270), 622 623 NOP(2), 624 LRI(2, POSTED), 625 REG16(0x5a8), 626 REG16(0x5ac), 627 628 NOP(6), 629 LRI(1, 0), 630 REG(0x0c8), 631 632 END 633 }; 634 635 #undef END 636 #undef REG16 637 #undef REG 638 #undef LRI 639 #undef NOP 640 641 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 642 { 643 /* 644 * The gen12+ lists only have the registers we program in the basic 645 * default state. We rely on the context image using relative 646 * addressing to automatic fixup the register state between the 647 * physical engines for virtual engine. 648 */ 649 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 && 650 !intel_engine_has_relative_mmio(engine)); 651 652 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) { 653 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70)) 654 return mtl_rcs_offsets; 655 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 656 return dg2_rcs_offsets; 657 else if (GRAPHICS_VER(engine->i915) >= 12) 658 return gen12_rcs_offsets; 659 else if (GRAPHICS_VER(engine->i915) >= 11) 660 return gen11_rcs_offsets; 661 else if (GRAPHICS_VER(engine->i915) >= 9) 662 return gen9_rcs_offsets; 663 else 664 return gen8_rcs_offsets; 665 } else { 666 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 667 return dg2_xcs_offsets; 668 else if (GRAPHICS_VER(engine->i915) >= 12) 669 return gen12_xcs_offsets; 670 else if (GRAPHICS_VER(engine->i915) >= 9) 671 return gen9_xcs_offsets; 672 else 673 return gen8_xcs_offsets; 674 } 675 } 676 677 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 678 { 679 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 680 return 0x70; 681 else if (GRAPHICS_VER(engine->i915) >= 12) 682 return 0x60; 683 else if (GRAPHICS_VER(engine->i915) >= 9) 684 return 0x54; 685 else if (engine->class == RENDER_CLASS) 686 return 0x58; 687 else 688 return -1; 689 } 690 691 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine) 692 { 693 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 694 return 0x80; 695 else if (GRAPHICS_VER(engine->i915) >= 12) 696 return 0x70; 697 else if (GRAPHICS_VER(engine->i915) >= 9) 698 return 0x64; 699 else if (GRAPHICS_VER(engine->i915) >= 8 && 700 engine->class == RENDER_CLASS) 701 return 0xc4; 702 else 703 return -1; 704 } 705 706 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 707 { 708 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 709 return 0x84; 710 else if (GRAPHICS_VER(engine->i915) >= 12) 711 return 0x74; 712 else if (GRAPHICS_VER(engine->i915) >= 9) 713 return 0x68; 714 else if (engine->class == RENDER_CLASS) 715 return 0xd8; 716 else 717 return -1; 718 } 719 720 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 721 { 722 if (GRAPHICS_VER(engine->i915) >= 12) 723 return 0x12; 724 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS) 725 return 0x18; 726 else 727 return -1; 728 } 729 730 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 731 { 732 int x; 733 734 x = lrc_ring_wa_bb_per_ctx(engine); 735 if (x < 0) 736 return x; 737 738 return x + 2; 739 } 740 741 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 742 { 743 int x; 744 745 x = lrc_ring_indirect_ptr(engine); 746 if (x < 0) 747 return x; 748 749 return x + 2; 750 } 751 752 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 753 { 754 755 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 756 /* 757 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL 758 * simply to match the RCS context image layout. 759 */ 760 return 0xc6; 761 else if (engine->class != RENDER_CLASS) 762 return -1; 763 else if (GRAPHICS_VER(engine->i915) >= 12) 764 return 0xb6; 765 else if (GRAPHICS_VER(engine->i915) >= 11) 766 return 0xaa; 767 else 768 return -1; 769 } 770 771 static u32 772 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 773 { 774 if (GRAPHICS_VER(engine->i915) >= 12) 775 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 776 else if (GRAPHICS_VER(engine->i915) >= 11) 777 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 778 else if (GRAPHICS_VER(engine->i915) >= 9) 779 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 780 else if (GRAPHICS_VER(engine->i915) >= 8) 781 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 782 783 GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8); 784 785 return 0; 786 } 787 788 static void 789 lrc_setup_bb_per_ctx(u32 *regs, 790 const struct intel_engine_cs *engine, 791 u32 ctx_bb_ggtt_addr) 792 { 793 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 794 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 795 ctx_bb_ggtt_addr | 796 PER_CTX_BB_FORCE | 797 PER_CTX_BB_VALID; 798 } 799 800 static void 801 lrc_setup_indirect_ctx(u32 *regs, 802 const struct intel_engine_cs *engine, 803 u32 ctx_bb_ggtt_addr, 804 u32 size) 805 { 806 GEM_BUG_ON(!size); 807 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 808 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 809 regs[lrc_ring_indirect_ptr(engine) + 1] = 810 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 811 812 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 813 regs[lrc_ring_indirect_offset(engine) + 1] = 814 lrc_ring_indirect_offset_default(engine) << 6; 815 } 816 817 static bool ctx_needs_runalone(const struct intel_context *ce) 818 { 819 struct i915_gem_context *gem_ctx; 820 bool ctx_is_protected = false; 821 822 /* 823 * Wa_14019159160 - Case 2. 824 * On some platforms, protected contexts require setting 825 * the LRC run-alone bit or else the encryption/decryption will not happen. 826 * NOTE: Case 2 only applies to PXP use-case of said workaround. 827 */ 828 if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) && 829 (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) { 830 rcu_read_lock(); 831 gem_ctx = rcu_dereference(ce->gem_context); 832 if (gem_ctx) 833 ctx_is_protected = gem_ctx->uses_protected_content; 834 rcu_read_unlock(); 835 } 836 837 return ctx_is_protected; 838 } 839 840 static void init_common_regs(u32 * const regs, 841 const struct intel_context *ce, 842 const struct intel_engine_cs *engine, 843 bool inhibit) 844 { 845 u32 ctl; 846 int loc; 847 848 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 849 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 850 if (inhibit) 851 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 852 if (GRAPHICS_VER(engine->i915) < 11) 853 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 854 CTX_CTRL_RS_CTX_ENABLE); 855 /* Wa_14019159160 - Case 2.*/ 856 if (ctx_needs_runalone(ce)) 857 ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE); 858 regs[CTX_CONTEXT_CONTROL] = ctl; 859 860 regs[CTX_TIMESTAMP] = ce->stats.runtime.last; 861 862 loc = lrc_ring_bb_offset(engine); 863 if (loc != -1) 864 regs[loc + 1] = 0; 865 } 866 867 static void init_wa_bb_regs(u32 * const regs, 868 const struct intel_engine_cs *engine) 869 { 870 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 871 872 if (wa_ctx->per_ctx.size) { 873 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 874 875 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 876 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 877 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 878 } 879 880 if (wa_ctx->indirect_ctx.size) { 881 lrc_setup_indirect_ctx(regs, engine, 882 i915_ggtt_offset(wa_ctx->vma) + 883 wa_ctx->indirect_ctx.offset, 884 wa_ctx->indirect_ctx.size); 885 } 886 } 887 888 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt) 889 { 890 if (i915_vm_is_4lvl(&ppgtt->vm)) { 891 /* 64b PPGTT (48bit canonical) 892 * PDP0_DESCRIPTOR contains the base address to PML4 and 893 * other PDP Descriptors are ignored. 894 */ 895 ASSIGN_CTX_PML4(ppgtt, regs); 896 } else { 897 ASSIGN_CTX_PDP(ppgtt, regs, 3); 898 ASSIGN_CTX_PDP(ppgtt, regs, 2); 899 ASSIGN_CTX_PDP(ppgtt, regs, 1); 900 ASSIGN_CTX_PDP(ppgtt, regs, 0); 901 } 902 } 903 904 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 905 { 906 if (i915_is_ggtt(vm)) 907 return i915_vm_to_ggtt(vm)->alias; 908 else 909 return i915_vm_to_ppgtt(vm); 910 } 911 912 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 913 { 914 int x; 915 916 x = lrc_ring_mi_mode(engine); 917 if (x != -1) { 918 regs[x + 1] &= ~STOP_RING; 919 regs[x + 1] |= STOP_RING << 16; 920 } 921 } 922 923 static void __lrc_init_regs(u32 *regs, 924 const struct intel_context *ce, 925 const struct intel_engine_cs *engine, 926 bool inhibit) 927 { 928 /* 929 * A context is actually a big batch buffer with several 930 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 931 * values we are setting here are only for the first context restore: 932 * on a subsequent save, the GPU will recreate this batchbuffer with new 933 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 934 * we are not initializing here). 935 * 936 * Must keep consistent with virtual_update_register_offsets(). 937 */ 938 939 if (inhibit) 940 memset(regs, 0, PAGE_SIZE); 941 942 set_offsets(regs, reg_offsets(engine), engine, inhibit); 943 944 init_common_regs(regs, ce, engine, inhibit); 945 init_ppgtt_regs(regs, vm_alias(ce->vm)); 946 947 init_wa_bb_regs(regs, engine); 948 949 __reset_stop_ring(regs, engine); 950 } 951 952 void lrc_init_regs(const struct intel_context *ce, 953 const struct intel_engine_cs *engine, 954 bool inhibit) 955 { 956 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit); 957 } 958 959 void lrc_reset_regs(const struct intel_context *ce, 960 const struct intel_engine_cs *engine) 961 { 962 __reset_stop_ring(ce->lrc_reg_state, engine); 963 } 964 965 static void 966 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 967 { 968 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 969 return; 970 971 vaddr += engine->context_size; 972 973 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 974 } 975 976 static void 977 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 978 { 979 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 980 return; 981 982 vaddr += engine->context_size; 983 984 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 985 drm_err_once(&engine->i915->drm, 986 "%s context redzone overwritten!\n", 987 engine->name); 988 } 989 990 static u32 context_wa_bb_offset(const struct intel_context *ce) 991 { 992 return PAGE_SIZE * ce->wa_bb_page; 993 } 994 995 /* 996 * per_ctx below determines which WABB section is used. 997 * When true, the function returns the location of the 998 * PER_CTX_BB. When false, the function returns the 999 * location of the INDIRECT_CTX. 1000 */ 1001 static u32 *context_wabb(const struct intel_context *ce, bool per_ctx) 1002 { 1003 void *ptr; 1004 1005 GEM_BUG_ON(!ce->wa_bb_page); 1006 1007 ptr = ce->lrc_reg_state; 1008 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 1009 ptr += context_wa_bb_offset(ce); 1010 ptr += per_ctx ? PAGE_SIZE : 0; 1011 1012 return ptr; 1013 } 1014 1015 void lrc_init_state(struct intel_context *ce, 1016 struct intel_engine_cs *engine, 1017 void *state) 1018 { 1019 bool inhibit = true; 1020 1021 set_redzone(state, engine); 1022 1023 if (ce->default_state) { 1024 shmem_read(ce->default_state, 0, state, engine->context_size); 1025 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 1026 inhibit = false; 1027 } 1028 1029 /* Clear the ppHWSP (inc. per-context counters) */ 1030 memset(state, 0, PAGE_SIZE); 1031 1032 /* Clear the indirect wa and storage */ 1033 if (ce->wa_bb_page) 1034 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE); 1035 1036 /* 1037 * The second page of the context object contains some registers which 1038 * must be set up prior to the first execution. 1039 */ 1040 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit); 1041 } 1042 1043 u32 lrc_indirect_bb(const struct intel_context *ce) 1044 { 1045 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce); 1046 } 1047 1048 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs) 1049 { 1050 /* If predication is active, this will be noop'ed */ 1051 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2); 1052 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA; 1053 *cs++ = 0; 1054 *cs++ = 0; /* No predication */ 1055 1056 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */ 1057 *cs++ = MI_BATCH_BUFFER_END | BIT(15); 1058 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE; 1059 1060 /* Instructions are no longer predicated (disabled), we can proceed */ 1061 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2); 1062 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA; 1063 *cs++ = 0; 1064 *cs++ = 1; /* enable predication before the next BB */ 1065 1066 *cs++ = MI_BATCH_BUFFER_END; 1067 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA); 1068 1069 return cs; 1070 } 1071 1072 static struct i915_vma * 1073 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine) 1074 { 1075 struct drm_i915_gem_object *obj; 1076 struct i915_vma *vma; 1077 u32 context_size; 1078 1079 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 1080 1081 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1082 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 1083 1084 if (GRAPHICS_VER(engine->i915) >= 12) { 1085 ce->wa_bb_page = context_size / PAGE_SIZE; 1086 /* INDIRECT_CTX and PER_CTX_BB need separate pages. */ 1087 context_size += PAGE_SIZE * 2; 1088 } 1089 1090 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) { 1091 ce->parallel.guc.parent_page = context_size / PAGE_SIZE; 1092 context_size += PARENT_SCRATCH_SIZE; 1093 } 1094 1095 obj = i915_gem_object_create_lmem(engine->i915, context_size, 1096 I915_BO_ALLOC_PM_VOLATILE); 1097 if (IS_ERR(obj)) { 1098 obj = i915_gem_object_create_shmem(engine->i915, context_size); 1099 if (IS_ERR(obj)) 1100 return ERR_CAST(obj); 1101 1102 /* 1103 * Wa_22016122933: For Media version 13.0, all Media GT shared 1104 * memory needs to be mapped as WC on CPU side and UC (PAT 1105 * index 2) on GPU side. 1106 */ 1107 if (intel_gt_needs_wa_22016122933(engine->gt)) 1108 i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE); 1109 } 1110 1111 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 1112 if (IS_ERR(vma)) { 1113 i915_gem_object_put(obj); 1114 return vma; 1115 } 1116 1117 return vma; 1118 } 1119 1120 static struct intel_timeline * 1121 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine) 1122 { 1123 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 1124 1125 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl)); 1126 } 1127 1128 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine) 1129 { 1130 struct intel_ring *ring; 1131 struct i915_vma *vma; 1132 int err; 1133 1134 GEM_BUG_ON(ce->state); 1135 1136 if (!intel_context_has_own_state(ce)) 1137 ce->default_state = engine->default_state; 1138 1139 vma = __lrc_alloc_state(ce, engine); 1140 if (IS_ERR(vma)) 1141 return PTR_ERR(vma); 1142 1143 ring = intel_engine_create_ring(engine, ce->ring_size); 1144 if (IS_ERR(ring)) { 1145 err = PTR_ERR(ring); 1146 goto err_vma; 1147 } 1148 1149 if (!page_mask_bits(ce->timeline)) { 1150 struct intel_timeline *tl; 1151 1152 /* 1153 * Use the static global HWSP for the kernel context, and 1154 * a dynamically allocated cacheline for everyone else. 1155 */ 1156 if (unlikely(ce->timeline)) 1157 tl = pinned_timeline(ce, engine); 1158 else 1159 tl = intel_timeline_create(engine->gt); 1160 if (IS_ERR(tl)) { 1161 err = PTR_ERR(tl); 1162 goto err_ring; 1163 } 1164 1165 ce->timeline = tl; 1166 } 1167 1168 ce->ring = ring; 1169 ce->state = vma; 1170 1171 return 0; 1172 1173 err_ring: 1174 intel_ring_put(ring); 1175 err_vma: 1176 i915_vma_put(vma); 1177 return err; 1178 } 1179 1180 void lrc_reset(struct intel_context *ce) 1181 { 1182 GEM_BUG_ON(!intel_context_is_pinned(ce)); 1183 1184 intel_ring_reset(ce->ring, ce->ring->emit); 1185 1186 /* Scrub away the garbage */ 1187 lrc_init_regs(ce, ce->engine, true); 1188 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail); 1189 } 1190 1191 int 1192 lrc_pre_pin(struct intel_context *ce, 1193 struct intel_engine_cs *engine, 1194 struct i915_gem_ww_ctx *ww, 1195 void **vaddr) 1196 { 1197 GEM_BUG_ON(!ce->state); 1198 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 1199 1200 *vaddr = i915_gem_object_pin_map(ce->state->obj, 1201 intel_gt_coherent_map_type(ce->engine->gt, 1202 ce->state->obj, 1203 false) | 1204 I915_MAP_OVERRIDE); 1205 1206 return PTR_ERR_OR_ZERO(*vaddr); 1207 } 1208 1209 int 1210 lrc_pin(struct intel_context *ce, 1211 struct intel_engine_cs *engine, 1212 void *vaddr) 1213 { 1214 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 1215 1216 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags)) 1217 lrc_init_state(ce, engine, vaddr); 1218 1219 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail); 1220 return 0; 1221 } 1222 1223 void lrc_unpin(struct intel_context *ce) 1224 { 1225 if (unlikely(ce->parallel.last_rq)) { 1226 i915_request_put(ce->parallel.last_rq); 1227 ce->parallel.last_rq = NULL; 1228 } 1229 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 1230 ce->engine); 1231 } 1232 1233 void lrc_post_unpin(struct intel_context *ce) 1234 { 1235 i915_gem_object_unpin_map(ce->state->obj); 1236 } 1237 1238 void lrc_fini(struct intel_context *ce) 1239 { 1240 if (!ce->state) 1241 return; 1242 1243 intel_ring_put(fetch_and_zero(&ce->ring)); 1244 i915_vma_put(fetch_and_zero(&ce->state)); 1245 } 1246 1247 void lrc_destroy(struct kref *kref) 1248 { 1249 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 1250 1251 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 1252 GEM_BUG_ON(intel_context_is_pinned(ce)); 1253 1254 lrc_fini(ce); 1255 1256 intel_context_fini(ce); 1257 intel_context_free(ce); 1258 } 1259 1260 static u32 * 1261 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 1262 { 1263 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1264 MI_SRM_LRM_GLOBAL_GTT | 1265 MI_LRI_LRM_CS_MMIO; 1266 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1267 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1268 CTX_TIMESTAMP * sizeof(u32); 1269 *cs++ = 0; 1270 1271 *cs++ = MI_LOAD_REGISTER_REG | 1272 MI_LRR_SOURCE_CS_MMIO | 1273 MI_LRI_LRM_CS_MMIO; 1274 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1275 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 1276 1277 *cs++ = MI_LOAD_REGISTER_REG | 1278 MI_LRR_SOURCE_CS_MMIO | 1279 MI_LRI_LRM_CS_MMIO; 1280 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1281 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 1282 1283 return cs; 1284 } 1285 1286 static u32 * 1287 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 1288 { 1289 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 1290 1291 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1292 MI_SRM_LRM_GLOBAL_GTT | 1293 MI_LRI_LRM_CS_MMIO; 1294 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1295 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1296 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 1297 *cs++ = 0; 1298 1299 return cs; 1300 } 1301 1302 static u32 * 1303 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 1304 { 1305 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 1306 1307 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 1308 MI_SRM_LRM_GLOBAL_GTT | 1309 MI_LRI_LRM_CS_MMIO; 1310 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1311 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 1312 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 1313 *cs++ = 0; 1314 1315 *cs++ = MI_LOAD_REGISTER_REG | 1316 MI_LRR_SOURCE_CS_MMIO | 1317 MI_LRI_LRM_CS_MMIO; 1318 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 1319 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 1320 1321 return cs; 1322 } 1323 1324 /* 1325 * The bspec's tuning guide asks us to program a vertical watermark value of 1326 * 0x3FF. However this register is not saved/restored properly by the 1327 * hardware, so we're required to apply the desired value via INDIRECT_CTX 1328 * batch buffer to ensure the value takes effect properly. All other bits 1329 * in this register should remain at 0 (the hardware default). 1330 */ 1331 static u32 * 1332 dg2_emit_draw_watermark_setting(u32 *cs) 1333 { 1334 *cs++ = MI_LOAD_REGISTER_IMM(1); 1335 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK); 1336 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF); 1337 1338 return cs; 1339 } 1340 1341 static u32 * 1342 gen12_invalidate_state_cache(u32 *cs) 1343 { 1344 *cs++ = MI_LOAD_REGISTER_IMM(1); 1345 *cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2); 1346 *cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE); 1347 return cs; 1348 } 1349 1350 static u32 * 1351 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 1352 { 1353 cs = gen12_emit_timestamp_wa(ce, cs); 1354 cs = gen12_emit_cmd_buf_wa(ce, cs); 1355 cs = gen12_emit_restore_scratch(ce, cs); 1356 1357 /* Wa_16013000631:dg2 */ 1358 if (IS_DG2_G11(ce->engine->i915)) 1359 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); 1360 1361 cs = gen12_emit_aux_table_inv(ce->engine, cs); 1362 1363 /* Wa_18022495364 */ 1364 if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10))) 1365 cs = gen12_invalidate_state_cache(cs); 1366 1367 /* Wa_16014892111 */ 1368 if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 1369 IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) || 1370 IS_DG2(ce->engine->i915)) 1371 cs = dg2_emit_draw_watermark_setting(cs); 1372 1373 return cs; 1374 } 1375 1376 static u32 * 1377 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 1378 { 1379 cs = gen12_emit_timestamp_wa(ce, cs); 1380 cs = gen12_emit_restore_scratch(ce, cs); 1381 1382 /* Wa_16013000631:dg2 */ 1383 if (IS_DG2_G11(ce->engine->i915)) 1384 if (ce->engine->class == COMPUTE_CLASS) 1385 cs = gen8_emit_pipe_control(cs, 1386 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 1387 0); 1388 1389 return gen12_emit_aux_table_inv(ce->engine, cs); 1390 } 1391 1392 static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs) 1393 { 1394 struct intel_gt *gt = ce->engine->gt; 1395 int mocs = gt->mocs.uc_index << 1; 1396 1397 /** 1398 * Wa_16018031267 / Wa_16018063123 requires that SW forces the 1399 * main copy engine arbitration into round robin mode. We 1400 * additionally need to submit the following WABB blt command 1401 * to produce 4 subblits with each subblit generating 0 byte 1402 * write requests as WABB: 1403 * 1404 * XY_FASTCOLOR_BLT 1405 * BG0 -> 5100000E 1406 * BG1 -> 0000003F (Dest pitch) 1407 * BG2 -> 00000000 (X1, Y1) = (0, 0) 1408 * BG3 -> 00040001 (X2, Y2) = (1, 4) 1409 * BG4 -> scratch 1410 * BG5 -> scratch 1411 * BG6-12 -> 00000000 1412 * BG13 -> 20004004 (Surf. Width= 2,Surf. Height = 5 ) 1413 * BG14 -> 00000010 (Qpitch = 4) 1414 * BG15 -> 00000000 1415 */ 1416 *cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2); 1417 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f; 1418 *cs++ = 0; 1419 *cs++ = 4 << 16 | 1; 1420 *cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma)); 1421 *cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma)); 1422 *cs++ = 0; 1423 *cs++ = 0; 1424 *cs++ = 0; 1425 *cs++ = 0; 1426 *cs++ = 0; 1427 *cs++ = 0; 1428 *cs++ = 0; 1429 *cs++ = 0x20004004; 1430 *cs++ = 0x10; 1431 *cs++ = 0; 1432 1433 return cs; 1434 } 1435 1436 static u32 * 1437 xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs) 1438 { 1439 /* Wa_16018031267, Wa_16018063123 */ 1440 if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine)) 1441 cs = xehp_emit_fastcolor_blt_wabb(ce, cs); 1442 1443 return cs; 1444 } 1445 1446 static void 1447 setup_per_ctx_bb(const struct intel_context *ce, 1448 const struct intel_engine_cs *engine, 1449 u32 *(*emit)(const struct intel_context *, u32 *)) 1450 { 1451 /* Place PER_CTX_BB on next page after INDIRECT_CTX */ 1452 u32 * const start = context_wabb(ce, true); 1453 u32 *cs; 1454 1455 cs = emit(ce, start); 1456 1457 /* PER_CTX_BB must manually terminate */ 1458 *cs++ = MI_BATCH_BUFFER_END; 1459 1460 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 1461 lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine, 1462 lrc_indirect_bb(ce) + PAGE_SIZE); 1463 } 1464 1465 static void 1466 setup_indirect_ctx_bb(const struct intel_context *ce, 1467 const struct intel_engine_cs *engine, 1468 u32 *(*emit)(const struct intel_context *, u32 *)) 1469 { 1470 u32 * const start = context_wabb(ce, false); 1471 u32 *cs; 1472 1473 cs = emit(ce, start); 1474 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 1475 while ((unsigned long)cs % CACHELINE_BYTES) 1476 *cs++ = MI_NOOP; 1477 1478 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1479 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start)); 1480 1481 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine, 1482 lrc_indirect_bb(ce), 1483 (cs - start) * sizeof(*cs)); 1484 } 1485 1486 /* 1487 * The context descriptor encodes various attributes of a context, 1488 * including its GTT address and some flags. Because it's fairly 1489 * expensive to calculate, we'll just do it once and cache the result, 1490 * which remains valid until the context is unpinned. 1491 * 1492 * This is what a descriptor looks like, from LSB to MSB:: 1493 * 1494 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 1495 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 1496 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 1497 * bits 53-54: mbz, reserved for use by hardware 1498 * bits 55-63: group ID, currently unused and set to 0 1499 * 1500 * Starting from Gen11, the upper dword of the descriptor has a new format: 1501 * 1502 * bits 32-36: reserved 1503 * bits 37-47: SW context ID 1504 * bits 48:53: engine instance 1505 * bit 54: mbz, reserved for use by hardware 1506 * bits 55-60: SW counter 1507 * bits 61-63: engine class 1508 * 1509 * On Xe_HP, the upper dword of the descriptor has a new format: 1510 * 1511 * bits 32-37: virtual function number 1512 * bit 38: mbz, reserved for use by hardware 1513 * bits 39-54: SW context ID 1514 * bits 55-57: reserved 1515 * bits 58-63: SW counter 1516 * 1517 * engine info, SW context ID and SW counter need to form a unique number 1518 * (Context ID) per lrc. 1519 */ 1520 static u32 lrc_descriptor(const struct intel_context *ce) 1521 { 1522 u32 desc; 1523 1524 desc = INTEL_LEGACY_32B_CONTEXT; 1525 if (i915_vm_is_4lvl(ce->vm)) 1526 desc = INTEL_LEGACY_64B_CONTEXT; 1527 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 1528 1529 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 1530 if (GRAPHICS_VER(ce->vm->i915) == 8) 1531 desc |= GEN8_CTX_L3LLC_COHERENT; 1532 1533 return i915_ggtt_offset(ce->state) | desc; 1534 } 1535 1536 u32 lrc_update_regs(const struct intel_context *ce, 1537 const struct intel_engine_cs *engine, 1538 u32 head) 1539 { 1540 struct intel_ring *ring = ce->ring; 1541 u32 *regs = ce->lrc_reg_state; 1542 1543 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 1544 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 1545 1546 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1547 regs[CTX_RING_HEAD] = head; 1548 regs[CTX_RING_TAIL] = ring->tail; 1549 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1550 1551 /* RPCS */ 1552 if (engine->class == RENDER_CLASS) { 1553 regs[CTX_R_PWR_CLK_STATE] = 1554 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 1555 1556 i915_oa_init_reg_state(ce, engine); 1557 } 1558 1559 if (ce->wa_bb_page) { 1560 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 1561 1562 fn = gen12_emit_indirect_ctx_xcs; 1563 if (ce->engine->class == RENDER_CLASS) 1564 fn = gen12_emit_indirect_ctx_rcs; 1565 1566 /* Mutually exclusive wrt to global indirect bb */ 1567 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 1568 setup_indirect_ctx_bb(ce, engine, fn); 1569 setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb); 1570 } 1571 1572 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE; 1573 } 1574 1575 void lrc_update_offsets(struct intel_context *ce, 1576 struct intel_engine_cs *engine) 1577 { 1578 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false); 1579 } 1580 1581 void lrc_check_regs(const struct intel_context *ce, 1582 const struct intel_engine_cs *engine, 1583 const char *when) 1584 { 1585 const struct intel_ring *ring = ce->ring; 1586 u32 *regs = ce->lrc_reg_state; 1587 bool valid = true; 1588 int x; 1589 1590 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1591 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1592 engine->name, 1593 regs[CTX_RING_START], 1594 i915_ggtt_offset(ring->vma)); 1595 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1596 valid = false; 1597 } 1598 1599 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1600 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1601 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1602 engine->name, 1603 regs[CTX_RING_CTL], 1604 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1605 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1606 valid = false; 1607 } 1608 1609 x = lrc_ring_mi_mode(engine); 1610 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1611 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1612 engine->name, regs[x + 1]); 1613 regs[x + 1] &= ~STOP_RING; 1614 regs[x + 1] |= STOP_RING << 16; 1615 valid = false; 1616 } 1617 1618 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1619 } 1620 1621 /* 1622 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 1623 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 1624 * but there is a slight complication as this is applied in WA batch where the 1625 * values are only initialized once so we cannot take register value at the 1626 * beginning and reuse it further; hence we save its value to memory, upload a 1627 * constant value with bit21 set and then we restore it back with the saved value. 1628 * To simplify the WA, a constant value is formed by using the default value 1629 * of this register. This shouldn't be a problem because we are only modifying 1630 * it for a short period and this batch in non-premptible. We can ofcourse 1631 * use additional instructions that read the actual value of the register 1632 * at that time and set our bit of interest but it makes the WA complicated. 1633 * 1634 * This WA is also required for Gen9 so extracting as a function avoids 1635 * code duplication. 1636 */ 1637 static u32 * 1638 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 1639 { 1640 /* NB no one else is allowed to scribble over scratch + 256! */ 1641 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1642 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1643 *batch++ = intel_gt_scratch_offset(engine->gt, 1644 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1645 *batch++ = 0; 1646 1647 *batch++ = MI_LOAD_REGISTER_IMM(1); 1648 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1649 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 1650 1651 batch = gen8_emit_pipe_control(batch, 1652 PIPE_CONTROL_CS_STALL | 1653 PIPE_CONTROL_DC_FLUSH_ENABLE, 1654 0); 1655 1656 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1657 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1658 *batch++ = intel_gt_scratch_offset(engine->gt, 1659 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1660 *batch++ = 0; 1661 1662 return batch; 1663 } 1664 1665 /* 1666 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 1667 * initialized at the beginning and shared across all contexts but this field 1668 * helps us to have multiple batches at different offsets and select them based 1669 * on a criteria. At the moment this batch always start at the beginning of the page 1670 * and at this point we don't have multiple wa_ctx batch buffers. 1671 * 1672 * The number of WA applied are not known at the beginning; we use this field 1673 * to return the no of DWORDS written. 1674 * 1675 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 1676 * so it adds NOOPs as padding to make it cacheline aligned. 1677 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 1678 * makes a complete batch buffer. 1679 */ 1680 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 1681 { 1682 /* WaDisableCtxRestoreArbitration:bdw,chv */ 1683 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 1684 1685 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 1686 if (IS_BROADWELL(engine->i915)) 1687 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 1688 1689 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 1690 /* Actual scratch location is at 128 bytes offset */ 1691 batch = gen8_emit_pipe_control(batch, 1692 PIPE_CONTROL_FLUSH_L3 | 1693 PIPE_CONTROL_STORE_DATA_INDEX | 1694 PIPE_CONTROL_CS_STALL | 1695 PIPE_CONTROL_QW_WRITE, 1696 LRC_PPHWSP_SCRATCH_ADDR); 1697 1698 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1699 1700 /* Pad to end of cacheline */ 1701 while ((unsigned long)batch % CACHELINE_BYTES) 1702 *batch++ = MI_NOOP; 1703 1704 /* 1705 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 1706 * execution depends on the length specified in terms of cache lines 1707 * in the register CTX_RCS_INDIRECT_CTX 1708 */ 1709 1710 return batch; 1711 } 1712 1713 struct lri { 1714 i915_reg_t reg; 1715 u32 value; 1716 }; 1717 1718 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 1719 { 1720 GEM_BUG_ON(!count || count > 63); 1721 1722 *batch++ = MI_LOAD_REGISTER_IMM(count); 1723 do { 1724 *batch++ = i915_mmio_reg_offset(lri->reg); 1725 *batch++ = lri->value; 1726 } while (lri++, --count); 1727 *batch++ = MI_NOOP; 1728 1729 return batch; 1730 } 1731 1732 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 1733 { 1734 static const struct lri lri[] = { 1735 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 1736 { 1737 COMMON_SLICE_CHICKEN2, 1738 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 1739 0), 1740 }, 1741 1742 /* BSpec: 11391 */ 1743 { 1744 FF_SLICE_CHICKEN, 1745 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 1746 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 1747 }, 1748 1749 /* BSpec: 11299 */ 1750 { 1751 _3D_CHICKEN3, 1752 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 1753 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 1754 } 1755 }; 1756 1757 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 1758 1759 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 1760 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 1761 1762 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 1763 batch = gen8_emit_pipe_control(batch, 1764 PIPE_CONTROL_FLUSH_L3 | 1765 PIPE_CONTROL_STORE_DATA_INDEX | 1766 PIPE_CONTROL_CS_STALL | 1767 PIPE_CONTROL_QW_WRITE, 1768 LRC_PPHWSP_SCRATCH_ADDR); 1769 1770 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 1771 1772 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 1773 if (HAS_POOLED_EU(engine->i915)) { 1774 /* 1775 * EU pool configuration is setup along with golden context 1776 * during context initialization. This value depends on 1777 * device type (2x6 or 3x6) and needs to be updated based 1778 * on which subslice is disabled especially for 2x6 1779 * devices, however it is safe to load default 1780 * configuration of 3x6 device instead of masking off 1781 * corresponding bits because HW ignores bits of a disabled 1782 * subslice and drops down to appropriate config. Please 1783 * see render_state_setup() in i915_gem_render_state.c for 1784 * possible configurations, to avoid duplication they are 1785 * not shown here again. 1786 */ 1787 *batch++ = GEN9_MEDIA_POOL_STATE; 1788 *batch++ = GEN9_MEDIA_POOL_ENABLE; 1789 *batch++ = 0x00777000; 1790 *batch++ = 0; 1791 *batch++ = 0; 1792 *batch++ = 0; 1793 } 1794 1795 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 1796 1797 /* Pad to end of cacheline */ 1798 while ((unsigned long)batch % CACHELINE_BYTES) 1799 *batch++ = MI_NOOP; 1800 1801 return batch; 1802 } 1803 1804 #define CTX_WA_BB_SIZE (PAGE_SIZE) 1805 1806 static int lrc_create_wa_ctx(struct intel_engine_cs *engine) 1807 { 1808 struct drm_i915_gem_object *obj; 1809 struct i915_vma *vma; 1810 int err; 1811 1812 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE); 1813 if (IS_ERR(obj)) 1814 return PTR_ERR(obj); 1815 1816 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 1817 if (IS_ERR(vma)) { 1818 err = PTR_ERR(vma); 1819 goto err; 1820 } 1821 1822 engine->wa_ctx.vma = vma; 1823 return 0; 1824 1825 err: 1826 i915_gem_object_put(obj); 1827 return err; 1828 } 1829 1830 void lrc_fini_wa_ctx(struct intel_engine_cs *engine) 1831 { 1832 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 1833 } 1834 1835 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 1836 1837 void lrc_init_wa_ctx(struct intel_engine_cs *engine) 1838 { 1839 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 1840 struct i915_wa_ctx_bb *wa_bb[] = { 1841 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx 1842 }; 1843 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)]; 1844 struct i915_gem_ww_ctx ww; 1845 void *batch, *batch_ptr; 1846 unsigned int i; 1847 int err; 1848 1849 if (GRAPHICS_VER(engine->i915) >= 11 || 1850 !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)) 1851 return; 1852 1853 if (GRAPHICS_VER(engine->i915) == 9) { 1854 wa_bb_fn[0] = gen9_init_indirectctx_bb; 1855 wa_bb_fn[1] = NULL; 1856 } else if (GRAPHICS_VER(engine->i915) == 8) { 1857 wa_bb_fn[0] = gen8_init_indirectctx_bb; 1858 wa_bb_fn[1] = NULL; 1859 } 1860 1861 err = lrc_create_wa_ctx(engine); 1862 if (err) { 1863 /* 1864 * We continue even if we fail to initialize WA batch 1865 * because we only expect rare glitches but nothing 1866 * critical to prevent us from using GPU 1867 */ 1868 drm_err(&engine->i915->drm, 1869 "Ignoring context switch w/a allocation error:%d\n", 1870 err); 1871 return; 1872 } 1873 1874 if (!engine->wa_ctx.vma) 1875 return; 1876 1877 i915_gem_ww_ctx_init(&ww, true); 1878 retry: 1879 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww); 1880 if (!err) 1881 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH); 1882 if (err) 1883 goto err; 1884 1885 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 1886 if (IS_ERR(batch)) { 1887 err = PTR_ERR(batch); 1888 goto err_unpin; 1889 } 1890 1891 /* 1892 * Emit the two workaround batch buffers, recording the offset from the 1893 * start of the workaround batch buffer object for each and their 1894 * respective sizes. 1895 */ 1896 batch_ptr = batch; 1897 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 1898 wa_bb[i]->offset = batch_ptr - batch; 1899 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 1900 CACHELINE_BYTES))) { 1901 err = -EINVAL; 1902 break; 1903 } 1904 if (wa_bb_fn[i]) 1905 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 1906 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 1907 } 1908 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE); 1909 1910 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 1911 __i915_gem_object_release_map(wa_ctx->vma->obj); 1912 1913 /* Verify that we can handle failure to setup the wa_ctx */ 1914 if (!err) 1915 err = i915_inject_probe_error(engine->i915, -ENODEV); 1916 1917 err_unpin: 1918 if (err) 1919 i915_vma_unpin(wa_ctx->vma); 1920 err: 1921 if (err == -EDEADLK) { 1922 err = i915_gem_ww_ctx_backoff(&ww); 1923 if (!err) 1924 goto retry; 1925 } 1926 i915_gem_ww_ctx_fini(&ww); 1927 1928 if (err) { 1929 i915_vma_put(engine->wa_ctx.vma); 1930 1931 /* Clear all flags to prevent further use */ 1932 memset(wa_ctx, 0, sizeof(*wa_ctx)); 1933 } 1934 } 1935 1936 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt) 1937 { 1938 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1939 stats->runtime.num_underflow++; 1940 stats->runtime.max_underflow = 1941 max_t(u32, stats->runtime.max_underflow, -dt); 1942 #endif 1943 } 1944 1945 static u32 lrc_get_runtime(const struct intel_context *ce) 1946 { 1947 /* 1948 * We can use either ppHWSP[16] which is recorded before the context 1949 * switch (and so excludes the cost of context switches) or use the 1950 * value from the context image itself, which is saved/restored earlier 1951 * and so includes the cost of the save. 1952 */ 1953 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 1954 } 1955 1956 void lrc_update_runtime(struct intel_context *ce) 1957 { 1958 struct intel_context_stats *stats = &ce->stats; 1959 u32 old; 1960 s32 dt; 1961 1962 old = stats->runtime.last; 1963 stats->runtime.last = lrc_get_runtime(ce); 1964 dt = stats->runtime.last - old; 1965 if (!dt) 1966 return; 1967 1968 if (unlikely(dt < 0)) { 1969 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1970 old, stats->runtime.last, dt); 1971 st_runtime_underflow(stats, dt); 1972 return; 1973 } 1974 1975 ewma_runtime_add(&stats->runtime.avg, dt); 1976 stats->runtime.total += dt; 1977 } 1978 1979 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1980 #include "selftest_lrc.c" 1981 #endif 1982