1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2014-2018 Intel Corporation 4 */ 5 6 #include "i915_drv.h" 7 #include "i915_reg.h" 8 #include "intel_context.h" 9 #include "intel_engine_pm.h" 10 #include "intel_engine_regs.h" 11 #include "intel_gpu_commands.h" 12 #include "intel_gt.h" 13 #include "intel_gt_ccs_mode.h" 14 #include "intel_gt_mcr.h" 15 #include "intel_gt_print.h" 16 #include "intel_gt_regs.h" 17 #include "intel_ring.h" 18 #include "intel_workarounds.h" 19 20 #include "display/intel_fbc_regs.h" 21 22 /** 23 * DOC: Hardware workarounds 24 * 25 * Hardware workarounds are register programming documented to be executed in 26 * the driver that fall outside of the normal programming sequences for a 27 * platform. There are some basic categories of workarounds, depending on 28 * how/when they are applied: 29 * 30 * - Context workarounds: workarounds that touch registers that are 31 * saved/restored to/from the HW context image. The list is emitted (via Load 32 * Register Immediate commands) once when initializing the device and saved in 33 * the default context. That default context is then used on every context 34 * creation to have a "primed golden context", i.e. a context image that 35 * already contains the changes needed to all the registers. 36 * 37 * Context workarounds should be implemented in the \*_ctx_workarounds_init() 38 * variants respective to the targeted platforms. 39 * 40 * - Engine workarounds: the list of these WAs is applied whenever the specific 41 * engine is reset. It's also possible that a set of engine classes share a 42 * common power domain and they are reset together. This happens on some 43 * platforms with render and compute engines. In this case (at least) one of 44 * them need to keeep the workaround programming: the approach taken in the 45 * driver is to tie those workarounds to the first compute/render engine that 46 * is registered. When executing with GuC submission, engine resets are 47 * outside of kernel driver control, hence the list of registers involved in 48 * written once, on engine initialization, and then passed to GuC, that 49 * saves/restores their values before/after the reset takes place. See 50 * ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference. 51 * 52 * Workarounds for registers specific to RCS and CCS should be implemented in 53 * rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for 54 * registers belonging to BCS, VCS or VECS should be implemented in 55 * xcs_engine_wa_init(). Workarounds for registers not belonging to a specific 56 * engine's MMIO range but that are part of of the common RCS/CCS reset domain 57 * should be implemented in general_render_compute_wa_init(). The settings 58 * about the CCS load balancing should be added in ccs_engine_wa_mode(). 59 * 60 * - GT workarounds: the list of these WAs is applied whenever these registers 61 * revert to their default values: on GPU reset, suspend/resume [1]_, etc. 62 * 63 * GT workarounds should be implemented in the \*_gt_workarounds_init() 64 * variants respective to the targeted platforms. 65 * 66 * - Register whitelist: some workarounds need to be implemented in userspace, 67 * but need to touch privileged registers. The whitelist in the kernel 68 * instructs the hardware to allow the access to happen. From the kernel side, 69 * this is just a special case of a MMIO workaround (as we write the list of 70 * these to/be-whitelisted registers to some special HW registers). 
71 * 72 * Register whitelisting should be done in the \*_whitelist_build() variants 73 * respective to the targeted platforms. 74 * 75 * - Workaround batchbuffers: buffers that get executed automatically by the 76 * hardware on every HW context restore. These buffers are created and 77 * programmed in the default context so the hardware always go through those 78 * programming sequences when switching contexts. The support for workaround 79 * batchbuffers is enabled these hardware mechanisms: 80 * 81 * #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default 82 * context, pointing the hardware to jump to that location when that offset 83 * is reached in the context restore. Workaround batchbuffer in the driver 84 * currently uses this mechanism for all platforms. 85 * 86 * #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context, 87 * pointing the hardware to a buffer to continue executing after the 88 * engine registers are restored in a context restore sequence. This is 89 * currently not used in the driver. 90 * 91 * - Other: There are WAs that, due to their nature, cannot be applied from a 92 * central place. Those are peppered around the rest of the code, as needed. 93 * Workarounds related to the display IP are the main example. 94 * 95 * .. [1] Technically, some registers are powercontext saved & restored, so they 96 * survive a suspend/resume. In practice, writing them again is not too 97 * costly and simplifies things, so it's the approach taken in the driver. 98 */ 99 100 static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt, 101 const char *name, const char *engine_name) 102 { 103 wal->gt = gt; 104 wal->name = name; 105 wal->engine_name = engine_name; 106 } 107 108 #define WA_LIST_CHUNK (1 << 4) 109 110 static void wa_init_finish(struct i915_wa_list *wal) 111 { 112 /* Trim unused entries. */ 113 if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) { 114 struct i915_wa *list = kmemdup_array(wal->list, wal->count, 115 sizeof(*list), GFP_KERNEL); 116 117 if (list) { 118 kfree(wal->list); 119 wal->list = list; 120 } 121 } 122 123 if (!wal->count) 124 return; 125 126 gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n", 127 wal->wa_count, wal->name, wal->engine_name); 128 } 129 130 static enum forcewake_domains 131 wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal) 132 { 133 enum forcewake_domains fw = 0; 134 struct i915_wa *wa; 135 unsigned int i; 136 137 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) 138 fw |= intel_uncore_forcewake_for_reg(uncore, 139 wa->reg, 140 FW_REG_READ | 141 FW_REG_WRITE); 142 143 return fw; 144 } 145 146 static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa) 147 { 148 unsigned int addr = i915_mmio_reg_offset(wa->reg); 149 struct drm_i915_private *i915 = wal->gt->i915; 150 unsigned int start = 0, end = wal->count; 151 const unsigned int grow = WA_LIST_CHUNK; 152 struct i915_wa *wa_; 153 154 GEM_BUG_ON(!is_power_of_2(grow)); 155 156 if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. 
*/ 157 struct i915_wa *list; 158 159 list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*list), 160 GFP_KERNEL); 161 if (!list) { 162 drm_err(&i915->drm, "No space for workaround init!\n"); 163 return; 164 } 165 166 if (wal->list) { 167 memcpy(list, wal->list, sizeof(*wa) * wal->count); 168 kfree(wal->list); 169 } 170 171 wal->list = list; 172 } 173 174 while (start < end) { 175 unsigned int mid = start + (end - start) / 2; 176 177 if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) { 178 start = mid + 1; 179 } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) { 180 end = mid; 181 } else { 182 wa_ = &wal->list[mid]; 183 184 if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) { 185 drm_err(&i915->drm, 186 "Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n", 187 i915_mmio_reg_offset(wa_->reg), 188 wa_->clr, wa_->set); 189 190 wa_->set &= ~wa->clr; 191 } 192 193 wal->wa_count++; 194 wa_->set |= wa->set; 195 wa_->clr |= wa->clr; 196 wa_->read |= wa->read; 197 return; 198 } 199 } 200 201 wal->wa_count++; 202 wa_ = &wal->list[wal->count++]; 203 *wa_ = *wa; 204 205 while (wa_-- > wal->list) { 206 GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) == 207 i915_mmio_reg_offset(wa_[1].reg)); 208 if (i915_mmio_reg_offset(wa_[1].reg) > 209 i915_mmio_reg_offset(wa_[0].reg)) 210 break; 211 212 swap(wa_[1], wa_[0]); 213 } 214 } 215 216 static void wa_add(struct i915_wa_list *wal, i915_reg_t reg, 217 u32 clear, u32 set, u32 read_mask, bool masked_reg) 218 { 219 struct i915_wa wa = { 220 .reg = reg, 221 .clr = clear, 222 .set = set, 223 .read = read_mask, 224 .masked_reg = masked_reg, 225 }; 226 227 _wa_add(wal, &wa); 228 } 229 230 static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg, 231 u32 clear, u32 set, u32 read_mask, bool masked_reg) 232 { 233 struct i915_wa wa = { 234 .mcr_reg = reg, 235 .clr = clear, 236 .set = set, 237 .read = read_mask, 238 .masked_reg = masked_reg, 239 .is_mcr = 1, 240 }; 241 242 _wa_add(wal, &wa); 243 } 244 245 static void 246 wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set) 247 { 248 wa_add(wal, reg, clear, set, clear | set, false); 249 } 250 251 static void 252 wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set) 253 { 254 wa_mcr_add(wal, reg, clear, set, clear | set, false); 255 } 256 257 static void 258 wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set) 259 { 260 wa_write_clr_set(wal, reg, ~0, set); 261 } 262 263 static void 264 wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set) 265 { 266 wa_write_clr_set(wal, reg, set, set); 267 } 268 269 static void 270 wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set) 271 { 272 wa_mcr_write_clr_set(wal, reg, set, set); 273 } 274 275 static void 276 wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr) 277 { 278 wa_write_clr_set(wal, reg, clr, 0); 279 } 280 281 static void 282 wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr) 283 { 284 wa_mcr_write_clr_set(wal, reg, clr, 0); 285 } 286 287 /* 288 * WA operations on "masked register". A masked register has the upper 16 bits 289 * documented as "masked" in b-spec. Its purpose is to allow writing to just a 290 * portion of the register without a rmw: you simply write in the upper 16 bits 291 * the mask of bits you are going to modify. 
292 * 293 * The wa_masked_* family of functions already does the necessary operations to 294 * calculate the mask based on the parameters passed, so user only has to 295 * provide the lower 16 bits of that register. 296 */ 297 298 static void 299 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val) 300 { 301 wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true); 302 } 303 304 static void 305 wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val) 306 { 307 wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true); 308 } 309 310 static void 311 wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val) 312 { 313 wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true); 314 } 315 316 static void 317 wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val) 318 { 319 wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true); 320 } 321 322 static void 323 wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg, 324 u32 mask, u32 val) 325 { 326 wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true); 327 } 328 329 static void 330 wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, 331 u32 mask, u32 val) 332 { 333 wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true); 334 } 335 336 static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine, 337 struct i915_wa_list *wal) 338 { 339 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING); 340 } 341 342 static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine, 343 struct i915_wa_list *wal) 344 { 345 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING); 346 } 347 348 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, 349 struct i915_wa_list *wal) 350 { 351 wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING); 352 353 /* WaDisableAsyncFlipPerfMode:bdw,chv */ 354 wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE); 355 356 /* WaDisablePartialInstShootdown:bdw,chv */ 357 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, 358 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); 359 360 /* Use Force Non-Coherent whenever executing a 3D context. This is a 361 * workaround for a possible hang in the unlikely event a TLB 362 * invalidation occurs during a PSD flush. 363 */ 364 /* WaForceEnableNonCoherent:bdw,chv */ 365 /* WaHdcDisableFetchWhenMasked:bdw,chv */ 366 wa_masked_en(wal, HDC_CHICKEN0, 367 HDC_DONOT_FETCH_MEM_WHEN_MASKED | 368 HDC_FORCE_NON_COHERENT); 369 370 /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0: 371 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping 372 * polygons in the same 8x4 pixel/sample area to be processed without 373 * stalling waiting for the earlier ones to write to Hierarchical Z 374 * buffer." 375 * 376 * This optimization is off by default for BDW and CHV; turn it on. 377 */ 378 wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); 379 380 /* Wa4x4STCOptimizationDisable:bdw,chv */ 381 wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE); 382 383 /* 384 * BSpec recommends 8x4 when MSAA is used, 385 * however in practice 16x4 seems fastest. 386 * 387 * Note that PS/WM thread counts depend on the WIZ hashing 388 * disable bit, which we don't touch here, but it's good 389 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). 
390 */ 391 wa_masked_field_set(wal, GEN7_GT_MODE, 392 GEN6_WIZ_HASHING_MASK, 393 GEN6_WIZ_HASHING_16x4); 394 } 395 396 static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine, 397 struct i915_wa_list *wal) 398 { 399 struct drm_i915_private *i915 = engine->i915; 400 401 gen8_ctx_workarounds_init(engine, wal); 402 403 /* WaDisableThreadStallDopClockGating:bdw (pre-production) */ 404 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); 405 406 /* WaDisableDopClockGating:bdw 407 * 408 * Also see the related UCGTCL1 write in bdw_init_clock_gating() 409 * to disable EUTC clock gating. 410 */ 411 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, 412 DOP_CLOCK_GATING_DISABLE); 413 414 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3, 415 GEN8_SAMPLER_POWER_BYPASS_DIS); 416 417 wa_masked_en(wal, HDC_CHICKEN0, 418 /* WaForceContextSaveRestoreNonCoherent:bdw */ 419 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | 420 /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */ 421 (INTEL_INFO(i915)->gt == 3 ? HDC_FENCE_DEST_SLM_DISABLE : 0)); 422 } 423 424 static void chv_ctx_workarounds_init(struct intel_engine_cs *engine, 425 struct i915_wa_list *wal) 426 { 427 gen8_ctx_workarounds_init(engine, wal); 428 429 /* WaDisableThreadStallDopClockGating:chv */ 430 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); 431 432 /* Improve HiZ throughput on CHV. */ 433 wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X); 434 } 435 436 static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, 437 struct i915_wa_list *wal) 438 { 439 struct drm_i915_private *i915 = engine->i915; 440 441 if (HAS_LLC(i915)) { 442 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl 443 * 444 * Must match Display Engine. See 445 * WaCompressedResourceDisplayNewHashMode. 446 */ 447 wa_masked_en(wal, COMMON_SLICE_CHICKEN2, 448 GEN9_PBE_COMPRESSED_HASH_SELECTION); 449 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7, 450 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR); 451 } 452 453 /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */ 454 /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */ 455 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, 456 FLOW_CONTROL_ENABLE | 457 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); 458 459 /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */ 460 /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */ 461 wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7, 462 GEN9_ENABLE_YV12_BUGFIX | 463 GEN9_ENABLE_GPGPU_PREEMPTION); 464 465 /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */ 466 /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */ 467 wa_masked_en(wal, CACHE_MODE_1, 468 GEN8_4x4_STC_OPTIMIZATION_DISABLE | 469 GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE); 470 471 /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */ 472 wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5, 473 GEN9_CCS_TLB_PREFETCH_ENABLE); 474 475 /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */ 476 wa_masked_en(wal, HDC_CHICKEN0, 477 HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | 478 HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE); 479 480 /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are 481 * both tied to WaForceContextSaveRestoreNonCoherent 482 * in some hsds for skl. We keep the tie for all gen9. The 483 * documentation is a bit hazy and so we want to get common behaviour, 484 * even though there is no clear evidence we would need both on kbl/bxt. 485 * This area has been source of system hangs so we play it safe 486 * and mimic the skl regardless of what bspec says. 
487 * 488 * Use Force Non-Coherent whenever executing a 3D context. This 489 * is a workaround for a possible hang in the unlikely event 490 * a TLB invalidation occurs during a PSD flush. 491 */ 492 493 /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */ 494 wa_masked_en(wal, HDC_CHICKEN0, 495 HDC_FORCE_NON_COHERENT); 496 497 /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */ 498 if (IS_SKYLAKE(i915) || 499 IS_KABYLAKE(i915) || 500 IS_COFFEELAKE(i915) || 501 IS_COMETLAKE(i915)) 502 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3, 503 GEN8_SAMPLER_POWER_BYPASS_DIS); 504 505 /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */ 506 wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE); 507 508 /* 509 * Supporting preemption with fine-granularity requires changes in the 510 * batch buffer programming. Since we can't break old userspace, we 511 * need to set our default preemption level to safe value. Userspace is 512 * still able to use more fine-grained preemption levels, since in 513 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the 514 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are 515 * not real HW workarounds, but merely a way to start using preemption 516 * while maintaining old contract with userspace. 517 */ 518 519 /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */ 520 wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); 521 522 /* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */ 523 wa_masked_field_set(wal, GEN8_CS_CHICKEN1, 524 GEN9_PREEMPT_GPGPU_LEVEL_MASK, 525 GEN9_PREEMPT_GPGPU_COMMAND_LEVEL); 526 527 /* WaClearHIZ_WM_CHICKEN3:bxt,glk */ 528 if (IS_GEN9_LP(i915)) 529 wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ); 530 } 531 532 static void skl_tune_iz_hashing(struct intel_engine_cs *engine, 533 struct i915_wa_list *wal) 534 { 535 struct intel_gt *gt = engine->gt; 536 u8 vals[3] = { 0, 0, 0 }; 537 unsigned int i; 538 539 for (i = 0; i < 3; i++) { 540 u8 ss; 541 542 /* 543 * Only consider slices where one, and only one, subslice has 7 544 * EUs 545 */ 546 if (!is_power_of_2(gt->info.sseu.subslice_7eu[i])) 547 continue; 548 549 /* 550 * subslice_7eu[i] != 0 (because of the check above) and 551 * ss_max == 4 (maximum number of subslices possible per slice) 552 * 553 * -> 0 <= ss <= 3; 554 */ 555 ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1; 556 vals[i] = 3 - ss; 557 } 558 559 if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0) 560 return; 561 562 /* Tune IZ hashing. 
See intel_device_info_runtime_init() */ 563 wa_masked_field_set(wal, GEN7_GT_MODE, 564 GEN9_IZ_HASHING_MASK(2) | 565 GEN9_IZ_HASHING_MASK(1) | 566 GEN9_IZ_HASHING_MASK(0), 567 GEN9_IZ_HASHING(2, vals[2]) | 568 GEN9_IZ_HASHING(1, vals[1]) | 569 GEN9_IZ_HASHING(0, vals[0])); 570 } 571 572 static void skl_ctx_workarounds_init(struct intel_engine_cs *engine, 573 struct i915_wa_list *wal) 574 { 575 gen9_ctx_workarounds_init(engine, wal); 576 skl_tune_iz_hashing(engine, wal); 577 } 578 579 static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine, 580 struct i915_wa_list *wal) 581 { 582 gen9_ctx_workarounds_init(engine, wal); 583 584 /* WaDisableThreadStallDopClockGating:bxt */ 585 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, 586 STALL_DOP_GATING_DISABLE); 587 588 /* WaToEnableHwFixForPushConstHWBug:bxt */ 589 wa_masked_en(wal, COMMON_SLICE_CHICKEN2, 590 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); 591 } 592 593 static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine, 594 struct i915_wa_list *wal) 595 { 596 struct drm_i915_private *i915 = engine->i915; 597 598 gen9_ctx_workarounds_init(engine, wal); 599 600 /* WaToEnableHwFixForPushConstHWBug:kbl */ 601 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER)) 602 wa_masked_en(wal, COMMON_SLICE_CHICKEN2, 603 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); 604 605 /* WaDisableSbeCacheDispatchPortSharing:kbl */ 606 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1, 607 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); 608 } 609 610 static void glk_ctx_workarounds_init(struct intel_engine_cs *engine, 611 struct i915_wa_list *wal) 612 { 613 gen9_ctx_workarounds_init(engine, wal); 614 615 /* WaToEnableHwFixForPushConstHWBug:glk */ 616 wa_masked_en(wal, COMMON_SLICE_CHICKEN2, 617 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); 618 } 619 620 static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine, 621 struct i915_wa_list *wal) 622 { 623 gen9_ctx_workarounds_init(engine, wal); 624 625 /* WaToEnableHwFixForPushConstHWBug:cfl */ 626 wa_masked_en(wal, COMMON_SLICE_CHICKEN2, 627 GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); 628 629 /* WaDisableSbeCacheDispatchPortSharing:cfl */ 630 wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1, 631 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); 632 } 633 634 static void icl_ctx_workarounds_init(struct intel_engine_cs *engine, 635 struct i915_wa_list *wal) 636 { 637 struct drm_i915_private *i915 = engine->i915; 638 639 /* Wa_1406697149 (WaDisableBankHangMode:icl) */ 640 wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL); 641 642 /* WaForceEnableNonCoherent:icl 643 * This is not the same workaround as in early Gen9 platforms, where 644 * lacking this could cause system hangs, but coherency performance 645 * overhead is high and only a few compute workloads really need it 646 * (the register is whitelisted in hardware now, so UMDs can opt in 647 * for coherency if they have a good reason). 
648 */ 649 wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT); 650 651 /* WaEnableFloatBlendOptimization:icl */ 652 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0, 653 _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE), 654 0 /* write-only, so skip validation */, 655 true); 656 657 /* WaDisableGPGPUMidThreadPreemption:icl */ 658 wa_masked_field_set(wal, GEN8_CS_CHICKEN1, 659 GEN9_PREEMPT_GPGPU_LEVEL_MASK, 660 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); 661 662 /* allow headerless messages for preemptible GPGPU context */ 663 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE, 664 GEN11_SAMPLER_ENABLE_HEADLESS_MSG); 665 666 /* Wa_1604278689:icl,ehl */ 667 wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID); 668 wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER, 669 0, 670 0xFFFFFFFF); 671 672 /* Wa_1406306137:icl,ehl */ 673 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU); 674 675 if (IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) { 676 /* 677 * Disable Repacking for Compression (masked R/W access) 678 * before rendering compressed surfaces for display. 679 */ 680 wa_masked_en(wal, CACHE_MODE_0_GEN7, 681 DISABLE_REPACKING_FOR_COMPRESSION); 682 } 683 } 684 685 /* 686 * These settings aren't actually workarounds, but general tuning settings that 687 * need to be programmed on dg2 platform. 688 */ 689 static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine, 690 struct i915_wa_list *wal) 691 { 692 wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP); 693 wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK, 694 REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f)); 695 wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK, 696 FF_MODE2_TDS_TIMER_128); 697 } 698 699 static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine, 700 struct i915_wa_list *wal) 701 { 702 struct drm_i915_private *i915 = engine->i915; 703 704 /* 705 * Wa_1409142259:tgl,dg1,adl-p,adl-n 706 * Wa_1409347922:tgl,dg1,adl-p 707 * Wa_1409252684:tgl,dg1,adl-p 708 * Wa_1409217633:tgl,dg1,adl-p 709 * Wa_1409207793:tgl,dg1,adl-p 710 * Wa_1409178076:tgl,dg1,adl-p,adl-n 711 * Wa_1408979724:tgl,dg1,adl-p,adl-n 712 * Wa_14010443199:tgl,rkl,dg1,adl-p,adl-n 713 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p,adl-n 714 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p,adl-n 715 * Wa_22010465259:tgl,rkl,dg1,adl-s,adl-p,adl-n 716 */ 717 wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3, 718 GEN12_DISABLE_CPS_AWARE_COLOR_PIPE); 719 720 /* WaDisableGPGPUMidThreadPreemption:gen12 */ 721 wa_masked_field_set(wal, GEN8_CS_CHICKEN1, 722 GEN9_PREEMPT_GPGPU_LEVEL_MASK, 723 GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); 724 725 /* 726 * Wa_16011163337 - GS_TIMER 727 * 728 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we 729 * need to program it even on those that don't explicitly list that 730 * workaround. 731 * 732 * Note that the programming of GEN12_FF_MODE2 is further modified 733 * according to the FF_MODE2 guidance given by Wa_1608008084. 734 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong 735 * value when read from the CPU. 736 * 737 * The default value for this register is zero for all fields. 738 * So instead of doing a RMW we should just write the desired values 739 * for TDS and GS timers. Note that since the readback can't be trusted, 740 * the clear mask is just set to ~0 to make sure other bits are not 741 * inadvertently set. For the same reason read verification is ignored. 
742 */ 743 wa_add(wal, 744 GEN12_FF_MODE2, 745 ~0, 746 FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224, 747 0, false); 748 749 if (!IS_DG1(i915)) { 750 /* Wa_1806527549 */ 751 wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE); 752 753 /* Wa_1606376872 */ 754 wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC); 755 } 756 757 /* 758 * This bit must be set to enable performance optimization for fast 759 * clears. 760 */ 761 wa_mcr_write_or(wal, GEN8_WM_CHICKEN2, WAIT_ON_DEPTH_STALL_DONE_DISABLE); 762 } 763 764 static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine, 765 struct i915_wa_list *wal) 766 { 767 gen12_ctx_workarounds_init(engine, wal); 768 769 /* Wa_1409044764 */ 770 wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3, 771 DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN); 772 773 /* Wa_22010493298 */ 774 wa_masked_en(wal, HIZ_CHICKEN, 775 DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE); 776 } 777 778 static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine, 779 struct i915_wa_list *wal) 780 { 781 dg2_ctx_gt_tuning_init(engine, wal); 782 783 /* Wa_16013271637:dg2 */ 784 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1, 785 MSC_MSAA_REODER_BUF_BYPASS_DISABLE); 786 787 /* Wa_14014947963:dg2 */ 788 wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000); 789 790 /* Wa_18018764978:dg2 */ 791 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL); 792 793 /* Wa_18019271663:dg2 */ 794 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE); 795 796 /* Wa_14019877138:dg2 */ 797 wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT); 798 } 799 800 static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine, 801 struct i915_wa_list *wal) 802 { 803 struct intel_gt *gt = engine->gt; 804 805 dg2_ctx_gt_tuning_init(engine, wal); 806 807 /* 808 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in 809 * gen12_emit_indirect_ctx_rcs() rather than here on some early 810 * steppings. 811 */ 812 if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 813 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))) 814 wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false); 815 } 816 817 static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine, 818 struct i915_wa_list *wal) 819 { 820 struct intel_gt *gt = engine->gt; 821 822 xelpg_ctx_gt_tuning_init(engine, wal); 823 824 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 825 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { 826 /* Wa_14014947963 */ 827 wa_masked_field_set(wal, VF_PREEMPTION, 828 PREEMPTION_VERTEX_COUNT, 0x4000); 829 830 /* Wa_16013271637 */ 831 wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1, 832 MSC_MSAA_REODER_BUF_BYPASS_DISABLE); 833 834 /* Wa_18019627453 */ 835 wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS); 836 837 /* Wa_18018764978 */ 838 wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL); 839 } 840 841 /* Wa_18019271663 */ 842 wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE); 843 844 /* Wa_14019877138 */ 845 wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT); 846 } 847 848 static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine, 849 struct i915_wa_list *wal) 850 { 851 /* 852 * This is a "fake" workaround defined by software to ensure we 853 * maintain reliable, backward-compatible behavior for userspace with 854 * regards to how nested MI_BATCH_BUFFER_START commands are handled. 
855 * 856 * The per-context setting of MI_MODE[12] determines whether the bits 857 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted 858 * in the traditional manner or whether they should instead use a new 859 * tgl+ meaning that breaks backward compatibility, but allows nesting 860 * into 3rd-level batchbuffers. When this new capability was first 861 * added in TGL, it remained off by default unless a context 862 * intentionally opted in to the new behavior. However Xe_HPG now 863 * flips this on by default and requires that we explicitly opt out if 864 * we don't want the new behavior. 865 * 866 * From a SW perspective, we want to maintain the backward-compatible 867 * behavior for userspace, so we'll apply a fake workaround to set it 868 * back to the legacy behavior on platforms where the hardware default 869 * is to break compatibility. At the moment there is no Linux 870 * userspace that utilizes third-level batchbuffers, so this will avoid 871 * userspace from needing to make any changes. using the legacy 872 * meaning is the correct thing to do. If/when we have userspace 873 * consumers that want to utilize third-level batch nesting, we can 874 * provide a context parameter to allow them to opt-in. 875 */ 876 wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN); 877 } 878 879 static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine, 880 struct i915_wa_list *wal) 881 { 882 u8 mocs; 883 884 /* 885 * Some blitter commands do not have a field for MOCS, those 886 * commands will use MOCS index pointed by BLIT_CCTL. 887 * BLIT_CCTL registers are needed to be programmed to un-cached. 888 */ 889 if (engine->class == COPY_ENGINE_CLASS) { 890 mocs = engine->gt->mocs.uc_index; 891 wa_write_clr_set(wal, 892 BLIT_CCTL(engine->mmio_base), 893 BLIT_CCTL_MASK, 894 BLIT_CCTL_MOCS(mocs, mocs)); 895 } 896 } 897 898 /* 899 * gen12_ctx_gt_fake_wa_init() aren't programmingan official workaround 900 * defined by the hardware team, but it programming general context registers. 901 * Adding those context register programming in context workaround 902 * allow us to use the wa framework for proper application and validation. 903 */ 904 static void 905 gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine, 906 struct i915_wa_list *wal) 907 { 908 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) 909 fakewa_disable_nestedbb_mode(engine, wal); 910 911 gen12_ctx_gt_mocs_init(engine, wal); 912 } 913 914 static void 915 __intel_engine_init_ctx_wa(struct intel_engine_cs *engine, 916 struct i915_wa_list *wal, 917 const char *name) 918 { 919 struct drm_i915_private *i915 = engine->i915; 920 921 wa_init_start(wal, engine->gt, name, engine->name); 922 923 /* Applies to all engines */ 924 /* 925 * Fake workarounds are not the actual workaround but 926 * programming of context registers using workaround framework. 
927 */ 928 if (GRAPHICS_VER(i915) >= 12) 929 gen12_ctx_gt_fake_wa_init(engine, wal); 930 931 if (engine->class != RENDER_CLASS) 932 goto done; 933 934 if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74))) 935 xelpg_ctx_workarounds_init(engine, wal); 936 else if (IS_DG2(i915)) 937 dg2_ctx_workarounds_init(engine, wal); 938 else if (IS_DG1(i915)) 939 dg1_ctx_workarounds_init(engine, wal); 940 else if (GRAPHICS_VER(i915) == 12) 941 gen12_ctx_workarounds_init(engine, wal); 942 else if (GRAPHICS_VER(i915) == 11) 943 icl_ctx_workarounds_init(engine, wal); 944 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) 945 cfl_ctx_workarounds_init(engine, wal); 946 else if (IS_GEMINILAKE(i915)) 947 glk_ctx_workarounds_init(engine, wal); 948 else if (IS_KABYLAKE(i915)) 949 kbl_ctx_workarounds_init(engine, wal); 950 else if (IS_BROXTON(i915)) 951 bxt_ctx_workarounds_init(engine, wal); 952 else if (IS_SKYLAKE(i915)) 953 skl_ctx_workarounds_init(engine, wal); 954 else if (IS_CHERRYVIEW(i915)) 955 chv_ctx_workarounds_init(engine, wal); 956 else if (IS_BROADWELL(i915)) 957 bdw_ctx_workarounds_init(engine, wal); 958 else if (GRAPHICS_VER(i915) == 7) 959 gen7_ctx_workarounds_init(engine, wal); 960 else if (GRAPHICS_VER(i915) == 6) 961 gen6_ctx_workarounds_init(engine, wal); 962 else if (GRAPHICS_VER(i915) < 8) 963 ; 964 else 965 MISSING_CASE(GRAPHICS_VER(i915)); 966 967 done: 968 wa_init_finish(wal); 969 } 970 971 void intel_engine_init_ctx_wa(struct intel_engine_cs *engine) 972 { 973 __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context"); 974 } 975 976 int intel_engine_emit_ctx_wa(struct i915_request *rq) 977 { 978 struct i915_wa_list *wal = &rq->engine->ctx_wa_list; 979 struct intel_uncore *uncore = rq->engine->uncore; 980 enum forcewake_domains fw; 981 unsigned long flags; 982 struct i915_wa *wa; 983 unsigned int i; 984 u32 *cs; 985 int ret; 986 987 if (wal->count == 0) 988 return 0; 989 990 ret = rq->engine->emit_flush(rq, EMIT_BARRIER); 991 if (ret) 992 return ret; 993 994 if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) || 995 IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS) 996 cs = intel_ring_begin(rq, (wal->count * 2 + 6)); 997 else 998 cs = intel_ring_begin(rq, (wal->count * 2 + 2)); 999 1000 if (IS_ERR(cs)) 1001 return PTR_ERR(cs); 1002 1003 fw = wal_get_fw_for_rmw(uncore, wal); 1004 1005 intel_gt_mcr_lock(wal->gt, &flags); 1006 spin_lock(&uncore->lock); 1007 intel_uncore_forcewake_get__locked(uncore, fw); 1008 1009 *cs++ = MI_LOAD_REGISTER_IMM(wal->count); 1010 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { 1011 u32 val; 1012 1013 /* Skip reading the register if it's not really needed */ 1014 if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) { 1015 val = wa->set; 1016 } else { 1017 val = wa->is_mcr ? 
1018 intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) : 1019 intel_uncore_read_fw(uncore, wa->reg); 1020 val &= ~wa->clr; 1021 val |= wa->set; 1022 } 1023 1024 *cs++ = i915_mmio_reg_offset(wa->reg); 1025 *cs++ = val; 1026 } 1027 *cs++ = MI_NOOP; 1028 1029 /* Wa_14019789679 */ 1030 if ((IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) || 1031 IS_DG2(rq->i915)) && rq->engine->class == RENDER_CLASS) { 1032 *cs++ = CMD_3DSTATE_MESH_CONTROL; 1033 *cs++ = 0; 1034 *cs++ = 0; 1035 *cs++ = MI_NOOP; 1036 } 1037 1038 intel_uncore_forcewake_put__locked(uncore, fw); 1039 spin_unlock(&uncore->lock); 1040 intel_gt_mcr_unlock(wal->gt, flags); 1041 1042 intel_ring_advance(rq, cs); 1043 1044 ret = rq->engine->emit_flush(rq, EMIT_BARRIER); 1045 if (ret) 1046 return ret; 1047 1048 return 0; 1049 } 1050 1051 static void 1052 gen4_gt_workarounds_init(struct intel_gt *gt, 1053 struct i915_wa_list *wal) 1054 { 1055 /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */ 1056 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); 1057 } 1058 1059 static void 1060 g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1061 { 1062 gen4_gt_workarounds_init(gt, wal); 1063 1064 /* WaDisableRenderCachePipelinedFlush:g4x,ilk */ 1065 wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE); 1066 } 1067 1068 static void 1069 ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1070 { 1071 g4x_gt_workarounds_init(gt, wal); 1072 1073 wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED); 1074 } 1075 1076 static void 1077 snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1078 { 1079 } 1080 1081 static void 1082 ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1083 { 1084 /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */ 1085 wa_masked_dis(wal, 1086 GEN7_COMMON_SLICE_CHICKEN1, 1087 GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); 1088 1089 /* WaApplyL3ControlAndL3ChickenMode:ivb */ 1090 wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL); 1091 wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE); 1092 1093 /* WaForceL3Serialization:ivb */ 1094 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); 1095 } 1096 1097 static void 1098 vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1099 { 1100 /* WaForceL3Serialization:vlv */ 1101 wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); 1102 1103 /* 1104 * WaIncreaseL3CreditsForVLVB0:vlv 1105 * This is the hardware default actually. 1106 */ 1107 wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE); 1108 } 1109 1110 static void 1111 hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1112 { 1113 /* L3 caching of data atomics doesn't work -- disable it. */ 1114 wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE); 1115 1116 wa_add(wal, 1117 HSW_ROW_CHICKEN3, 0, 1118 _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE), 1119 0 /* XXX does this reg exist? 
*/, true); 1120 1121 /* WaVSRefCountFullforceMissDisable:hsw */ 1122 wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME); 1123 } 1124 1125 static void 1126 gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal) 1127 { 1128 const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu; 1129 unsigned int slice, subslice; 1130 u32 mcr, mcr_mask; 1131 1132 GEM_BUG_ON(GRAPHICS_VER(i915) != 9); 1133 1134 /* 1135 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml 1136 * Before any MMIO read into slice/subslice specific registers, MCR 1137 * packet control register needs to be programmed to point to any 1138 * enabled s/ss pair. Otherwise, incorrect values will be returned. 1139 * This means each subsequent MMIO read will be forwarded to an 1140 * specific s/ss combination, but this is OK since these registers 1141 * are consistent across s/ss in almost all cases. In the rare 1142 * occasions, such as INSTDONE, where this value is dependent 1143 * on s/ss combo, the read should be done with read_subslice_reg. 1144 */ 1145 slice = ffs(sseu->slice_mask) - 1; 1146 GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw)); 1147 subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice)); 1148 GEM_BUG_ON(!subslice); 1149 subslice--; 1150 1151 /* 1152 * We use GEN8_MCR..() macros to calculate the |mcr| value for 1153 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads 1154 */ 1155 mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice); 1156 mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK; 1157 1158 drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr); 1159 1160 wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr); 1161 } 1162 1163 static void 1164 gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1165 { 1166 struct drm_i915_private *i915 = gt->i915; 1167 1168 /* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */ 1169 gen9_wa_init_mcr(i915, wal); 1170 1171 /* WaDisableKillLogic:bxt,skl,kbl */ 1172 if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915)) 1173 wa_write_or(wal, 1174 GAM_ECOCHK, 1175 ECOCHK_DIS_TLB); 1176 1177 if (HAS_LLC(i915)) { 1178 /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl 1179 * 1180 * Must match Display Engine. See 1181 * WaCompressedResourceDisplayNewHashMode. 
1182 */ 1183 wa_write_or(wal, 1184 MMCD_MISC_CTRL, 1185 MMCD_PCLA | MMCD_HOTSPOT_EN); 1186 } 1187 1188 /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */ 1189 wa_write_or(wal, 1190 GAM_ECOCHK, 1191 BDW_DISABLE_HDC_INVALIDATION); 1192 } 1193 1194 static void 1195 skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1196 { 1197 gen9_gt_workarounds_init(gt, wal); 1198 1199 /* WaDisableGafsUnitClkGating:skl */ 1200 wa_write_or(wal, 1201 GEN7_UCGCTL4, 1202 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); 1203 1204 /* WaInPlaceDecompressionHang:skl */ 1205 if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0)) 1206 wa_write_or(wal, 1207 GEN9_GAMT_ECO_REG_RW_IA, 1208 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); 1209 } 1210 1211 static void 1212 kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1213 { 1214 gen9_gt_workarounds_init(gt, wal); 1215 1216 /* WaDisableDynamicCreditSharing:kbl */ 1217 if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0)) 1218 wa_write_or(wal, 1219 GAMT_CHKN_BIT_REG, 1220 GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING); 1221 1222 /* WaDisableGafsUnitClkGating:kbl */ 1223 wa_write_or(wal, 1224 GEN7_UCGCTL4, 1225 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); 1226 1227 /* WaInPlaceDecompressionHang:kbl */ 1228 wa_write_or(wal, 1229 GEN9_GAMT_ECO_REG_RW_IA, 1230 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); 1231 } 1232 1233 static void 1234 glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1235 { 1236 gen9_gt_workarounds_init(gt, wal); 1237 } 1238 1239 static void 1240 cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1241 { 1242 gen9_gt_workarounds_init(gt, wal); 1243 1244 /* WaDisableGafsUnitClkGating:cfl */ 1245 wa_write_or(wal, 1246 GEN7_UCGCTL4, 1247 GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); 1248 1249 /* WaInPlaceDecompressionHang:cfl */ 1250 wa_write_or(wal, 1251 GEN9_GAMT_ECO_REG_RW_IA, 1252 GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); 1253 } 1254 1255 static void __set_mcr_steering(struct i915_wa_list *wal, 1256 i915_reg_t steering_reg, 1257 unsigned int slice, unsigned int subslice) 1258 { 1259 u32 mcr, mcr_mask; 1260 1261 mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice); 1262 mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK; 1263 1264 wa_write_clr_set(wal, steering_reg, mcr_mask, mcr); 1265 } 1266 1267 static void debug_dump_steering(struct intel_gt *gt) 1268 { 1269 struct drm_printer p = drm_dbg_printer(>->i915->drm, DRM_UT_DRIVER, 1270 "MCR Steering:"); 1271 1272 if (drm_debug_enabled(DRM_UT_DRIVER)) 1273 intel_gt_mcr_report_steering(&p, gt, false); 1274 } 1275 1276 static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal, 1277 unsigned int slice, unsigned int subslice) 1278 { 1279 __set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice); 1280 1281 gt->default_steering.groupid = slice; 1282 gt->default_steering.instanceid = subslice; 1283 1284 debug_dump_steering(gt); 1285 } 1286 1287 static void 1288 icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal) 1289 { 1290 const struct sseu_dev_info *sseu = >->info.sseu; 1291 unsigned int subslice; 1292 1293 GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11); 1294 GEM_BUG_ON(hweight8(sseu->slice_mask) > 1); 1295 1296 /* 1297 * Although a platform may have subslices, we need to always steer 1298 * reads to the lowest instance that isn't fused off. 
When Render 1299 * Power Gating is enabled, grabbing forcewake will only power up a 1300 * single subslice (the "minconfig") if there isn't a real workload 1301 * that needs to be run; this means that if we steer register reads to 1302 * one of the higher subslices, we run the risk of reading back 0's or 1303 * random garbage. 1304 */ 1305 subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0)); 1306 1307 /* 1308 * If the subslice we picked above also steers us to a valid L3 bank, 1309 * then we can just rely on the default steering and won't need to 1310 * worry about explicitly re-steering L3BANK reads later. 1311 */ 1312 if (gt->info.l3bank_mask & BIT(subslice)) 1313 gt->steering_table[L3BANK] = NULL; 1314 1315 __add_mcr_wa(gt, wal, 0, subslice); 1316 } 1317 1318 static void 1319 xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal) 1320 { 1321 const struct sseu_dev_info *sseu = >->info.sseu; 1322 unsigned long slice, subslice = 0, slice_mask = 0; 1323 u32 lncf_mask = 0; 1324 int i; 1325 1326 /* 1327 * On Xe_HP the steering increases in complexity. There are now several 1328 * more units that require steering and we're not guaranteed to be able 1329 * to find a common setting for all of them. These are: 1330 * - GSLICE (fusable) 1331 * - DSS (sub-unit within gslice; fusable) 1332 * - L3 Bank (fusable) 1333 * - MSLICE (fusable) 1334 * - LNCF (sub-unit within mslice; always present if mslice is present) 1335 * 1336 * We'll do our default/implicit steering based on GSLICE (in the 1337 * sliceid field) and DSS (in the subsliceid field). If we can 1338 * find overlap between the valid MSLICE and/or LNCF values with 1339 * a suitable GSLICE, then we can just reuse the default value and 1340 * skip and explicit steering at runtime. 1341 * 1342 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find 1343 * a valid sliceid value. DSS steering is the only type of steering 1344 * that utilizes the 'subsliceid' bits. 1345 * 1346 * Also note that, even though the steering domain is called "GSlice" 1347 * and it is encoded in the register using the gslice format, the spec 1348 * says that the combined (geometry | compute) fuse should be used to 1349 * select the steering. 1350 */ 1351 1352 /* Find the potential gslice candidates */ 1353 slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask, 1354 GEN_DSS_PER_GSLICE); 1355 1356 /* 1357 * Find the potential LNCF candidates. Either LNCF within a valid 1358 * mslice is fine. 1359 */ 1360 for_each_set_bit(i, >->info.mslice_mask, GEN12_MAX_MSLICES) 1361 lncf_mask |= (0x3 << (i * 2)); 1362 1363 /* 1364 * Are there any sliceid values that work for both GSLICE and LNCF 1365 * steering? 1366 */ 1367 if (slice_mask & lncf_mask) { 1368 slice_mask &= lncf_mask; 1369 gt->steering_table[LNCF] = NULL; 1370 } 1371 1372 /* How about sliceid values that also work for MSLICE steering? */ 1373 if (slice_mask & gt->info.mslice_mask) { 1374 slice_mask &= gt->info.mslice_mask; 1375 gt->steering_table[MSLICE] = NULL; 1376 } 1377 1378 slice = __ffs(slice_mask); 1379 subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) % 1380 GEN_DSS_PER_GSLICE; 1381 1382 __add_mcr_wa(gt, wal, slice, subslice); 1383 1384 /* 1385 * SQIDI ranges are special because they use different steering 1386 * registers than everything else we work with. 
On XeHP SDV and 1387 * DG2-G10, any value in the steering registers will work fine since 1388 * all instances are present, but DG2-G11 only has SQIDI instances at 1389 * ID's 2 and 3, so we need to steer to one of those. For simplicity 1390 * we'll just steer to a hardcoded "2" since that value will work 1391 * everywhere. 1392 */ 1393 __set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2); 1394 __set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2); 1395 1396 /* 1397 * On DG2, GAM registers have a dedicated steering control register 1398 * and must always be programmed to a hardcoded groupid of "1." 1399 */ 1400 if (IS_DG2(gt->i915)) 1401 __set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0); 1402 } 1403 1404 static void 1405 icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1406 { 1407 struct drm_i915_private *i915 = gt->i915; 1408 1409 icl_wa_init_mcr(gt, wal); 1410 1411 /* WaModifyGamTlbPartitioning:icl */ 1412 wa_write_clr_set(wal, 1413 GEN11_GACB_PERF_CTRL, 1414 GEN11_HASH_CTRL_MASK, 1415 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4); 1416 1417 /* Wa_1405766107:icl 1418 * Formerly known as WaCL2SFHalfMaxAlloc 1419 */ 1420 wa_write_or(wal, 1421 GEN11_LSN_UNSLCVC, 1422 GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC | 1423 GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC); 1424 1425 /* Wa_220166154:icl 1426 * Formerly known as WaDisCtxReload 1427 */ 1428 wa_write_or(wal, 1429 GEN8_GAMW_ECO_DEV_RW_IA, 1430 GAMW_ECO_DEV_CTX_RELOAD_DISABLE); 1431 1432 /* Wa_1406463099:icl 1433 * Formerly known as WaGamTlbPendError 1434 */ 1435 wa_write_or(wal, 1436 GAMT_CHKN_BIT_REG, 1437 GAMT_CHKN_DISABLE_L3_COH_PIPE); 1438 1439 /* 1440 * Wa_1408615072:icl,ehl (vsunit) 1441 * Wa_1407596294:icl,ehl (hsunit) 1442 */ 1443 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, 1444 VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS); 1445 1446 /* Wa_1407352427:icl,ehl */ 1447 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, 1448 PSDUNIT_CLKGATE_DIS); 1449 1450 /* Wa_1406680159:icl,ehl */ 1451 wa_mcr_write_or(wal, 1452 GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE, 1453 GWUNIT_CLKGATE_DIS); 1454 1455 /* Wa_1607087056:icl,ehl,jsl */ 1456 if (IS_ICELAKE(i915) || 1457 ((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) && 1458 IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))) 1459 wa_write_or(wal, 1460 GEN11_SLICE_UNIT_LEVEL_CLKGATE, 1461 L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS); 1462 1463 /* 1464 * This is not a documented workaround, but rather an optimization 1465 * to reduce sampler power. 1466 */ 1467 wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE); 1468 } 1469 1470 /* 1471 * Though there are per-engine instances of these registers, 1472 * they retain their value through engine resets and should 1473 * only be provided on the GT workaround list rather than 1474 * the engine-specific workaround list. 
1475 */ 1476 static void 1477 wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal) 1478 { 1479 struct intel_engine_cs *engine; 1480 int id; 1481 1482 for_each_engine(engine, gt, id) { 1483 if (engine->class != VIDEO_DECODE_CLASS || 1484 (engine->instance % 2)) 1485 continue; 1486 1487 wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base), 1488 IECPUNIT_CLKGATE_DIS); 1489 } 1490 } 1491 1492 static void 1493 gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1494 { 1495 icl_wa_init_mcr(gt, wal); 1496 1497 /* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */ 1498 wa_14011060649(gt, wal); 1499 1500 /* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */ 1501 wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE); 1502 1503 /* 1504 * Wa_14015795083 1505 * 1506 * Firmware on some gen12 platforms locks the MISCCPCTL register, 1507 * preventing i915 from modifying it for this workaround. Skip the 1508 * readback verification for this workaround on debug builds; if the 1509 * workaround doesn't stick due to firmware behavior, it's not an error 1510 * that we want CI to flag. 1511 */ 1512 wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE, 1513 0, 0, false); 1514 } 1515 1516 static void 1517 dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1518 { 1519 gen12_gt_workarounds_init(gt, wal); 1520 1521 /* Wa_1409420604:dg1 */ 1522 wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2, 1523 CPSSUNIT_CLKGATE_DIS); 1524 1525 /* Wa_1408615072:dg1 */ 1526 /* Empirical testing shows this register is unaffected by engine reset. */ 1527 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL); 1528 } 1529 1530 static void 1531 dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1532 { 1533 xehp_init_mcr(gt, wal); 1534 1535 /* Wa_14011060649:dg2 */ 1536 wa_14011060649(gt, wal); 1537 1538 if (IS_DG2_G10(gt->i915)) { 1539 /* Wa_22010523718:dg2 */ 1540 wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, 1541 CG3DDISCFEG_CLKGATE_DIS); 1542 1543 /* Wa_14011006942:dg2 */ 1544 wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE, 1545 DSS_ROUTER_CLKGATE_DIS); 1546 } 1547 1548 /* Wa_14014830051:dg2 */ 1549 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN); 1550 1551 /* 1552 * Wa_14015795083 1553 * Skip verification for possibly locked register. 
1554 */ 1555 wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE, 1556 0, 0, false); 1557 1558 /* Wa_18018781329 */ 1559 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB); 1560 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB); 1561 wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB); 1562 wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB); 1563 1564 /* Wa_1509235366:dg2 */ 1565 wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL, 1566 INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE); 1567 1568 /* Wa_14010648519:dg2 */ 1569 wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE); 1570 } 1571 1572 static void 1573 xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1574 { 1575 /* Wa_14018575942 / Wa_18018781329 */ 1576 wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB); 1577 wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB); 1578 1579 /* Wa_22016670082 */ 1580 wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE); 1581 1582 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 1583 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { 1584 /* Wa_14014830051 */ 1585 wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN); 1586 1587 /* Wa_14015795083 */ 1588 wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE); 1589 } 1590 1591 /* 1592 * Unlike older platforms, we no longer setup implicit steering here; 1593 * all MCR accesses are explicitly steered. 1594 */ 1595 debug_dump_steering(gt); 1596 } 1597 1598 static void 1599 wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal) 1600 { 1601 struct intel_engine_cs *engine; 1602 int id; 1603 1604 for_each_engine(engine, gt, id) 1605 if (engine->class == VIDEO_DECODE_CLASS) 1606 wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base), 1607 MFXPIPE_CLKGATE_DIS); 1608 } 1609 1610 static void 1611 xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) 1612 { 1613 wa_16021867713(gt, wal); 1614 1615 /* 1616 * Wa_14018778641 1617 * Wa_18018781329 1618 * 1619 * Note that although these registers are MCR on the primary 1620 * GT, the media GT's versions are regular singleton registers. 1621 */ 1622 wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB); 1623 1624 /* 1625 * Wa_14018575942 1626 * 1627 * Issue is seen on media KPI test running on VDBOX engine 1628 * especially VP9 encoding WLs 1629 */ 1630 wa_write_or(wal, XELPMP_VDBX_MOD_CTRL, FORCE_MISS_FTLB); 1631 1632 /* Wa_22016670082 */ 1633 wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE); 1634 1635 debug_dump_steering(gt); 1636 } 1637 1638 /* 1639 * The bspec performance guide has recommended MMIO tuning settings. These 1640 * aren't truly "workarounds" but we want to program them through the 1641 * workaround infrastructure to make sure they're (re)applied at the proper 1642 * times. 1643 * 1644 * The programming in this function is for settings that persist through 1645 * engine resets and also are not part of any engine's register state context. 1646 * I.e., settings that only need to be re-applied in the event of a full GT 1647 * reset. 
1648 */ 1649 static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal) 1650 { 1651 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) { 1652 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS); 1653 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS); 1654 } 1655 1656 if (IS_DG2(gt->i915)) { 1657 wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS); 1658 wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS); 1659 } 1660 } 1661 1662 static void 1663 gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal) 1664 { 1665 struct drm_i915_private *i915 = gt->i915; 1666 1667 gt_tuning_settings(gt, wal); 1668 1669 if (gt->type == GT_MEDIA) { 1670 if (MEDIA_VER_FULL(i915) == IP_VER(13, 0)) 1671 xelpmp_gt_workarounds_init(gt, wal); 1672 else 1673 MISSING_CASE(MEDIA_VER_FULL(i915)); 1674 1675 return; 1676 } 1677 1678 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) 1679 xelpg_gt_workarounds_init(gt, wal); 1680 else if (IS_DG2(i915)) 1681 dg2_gt_workarounds_init(gt, wal); 1682 else if (IS_DG1(i915)) 1683 dg1_gt_workarounds_init(gt, wal); 1684 else if (GRAPHICS_VER(i915) == 12) 1685 gen12_gt_workarounds_init(gt, wal); 1686 else if (GRAPHICS_VER(i915) == 11) 1687 icl_gt_workarounds_init(gt, wal); 1688 else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) 1689 cfl_gt_workarounds_init(gt, wal); 1690 else if (IS_GEMINILAKE(i915)) 1691 glk_gt_workarounds_init(gt, wal); 1692 else if (IS_KABYLAKE(i915)) 1693 kbl_gt_workarounds_init(gt, wal); 1694 else if (IS_BROXTON(i915)) 1695 gen9_gt_workarounds_init(gt, wal); 1696 else if (IS_SKYLAKE(i915)) 1697 skl_gt_workarounds_init(gt, wal); 1698 else if (IS_HASWELL(i915)) 1699 hsw_gt_workarounds_init(gt, wal); 1700 else if (IS_VALLEYVIEW(i915)) 1701 vlv_gt_workarounds_init(gt, wal); 1702 else if (IS_IVYBRIDGE(i915)) 1703 ivb_gt_workarounds_init(gt, wal); 1704 else if (GRAPHICS_VER(i915) == 6) 1705 snb_gt_workarounds_init(gt, wal); 1706 else if (GRAPHICS_VER(i915) == 5) 1707 ilk_gt_workarounds_init(gt, wal); 1708 else if (IS_G4X(i915)) 1709 g4x_gt_workarounds_init(gt, wal); 1710 else if (GRAPHICS_VER(i915) == 4) 1711 gen4_gt_workarounds_init(gt, wal); 1712 else if (GRAPHICS_VER(i915) <= 8) 1713 ; 1714 else 1715 MISSING_CASE(GRAPHICS_VER(i915)); 1716 } 1717 1718 void intel_gt_init_workarounds(struct intel_gt *gt) 1719 { 1720 struct i915_wa_list *wal = >->wa_list; 1721 1722 wa_init_start(wal, gt, "GT", "global"); 1723 gt_init_workarounds(gt, wal); 1724 wa_init_finish(wal); 1725 } 1726 1727 static bool 1728 wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur, 1729 const char *name, const char *from) 1730 { 1731 if ((cur ^ wa->set) & wa->read) { 1732 gt_err(gt, 1733 "%s workaround lost on %s! 
(reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n", 1734 name, from, i915_mmio_reg_offset(wa->reg), 1735 cur, cur & wa->read, wa->set & wa->read); 1736 1737 return false; 1738 } 1739 1740 return true; 1741 } 1742 1743 static void wa_list_apply(const struct i915_wa_list *wal) 1744 { 1745 struct intel_gt *gt = wal->gt; 1746 struct intel_uncore *uncore = gt->uncore; 1747 enum forcewake_domains fw; 1748 unsigned long flags; 1749 struct i915_wa *wa; 1750 unsigned int i; 1751 1752 if (!wal->count) 1753 return; 1754 1755 fw = wal_get_fw_for_rmw(uncore, wal); 1756 1757 intel_gt_mcr_lock(gt, &flags); 1758 spin_lock(&uncore->lock); 1759 intel_uncore_forcewake_get__locked(uncore, fw); 1760 1761 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { 1762 u32 val, old = 0; 1763 1764 /* open-coded rmw due to steering */ 1765 if (wa->clr) 1766 old = wa->is_mcr ? 1767 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) : 1768 intel_uncore_read_fw(uncore, wa->reg); 1769 val = (old & ~wa->clr) | wa->set; 1770 if (val != old || !wa->clr) { 1771 if (wa->is_mcr) 1772 intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val); 1773 else 1774 intel_uncore_write_fw(uncore, wa->reg, val); 1775 } 1776 1777 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) { 1778 u32 val = wa->is_mcr ? 1779 intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) : 1780 intel_uncore_read_fw(uncore, wa->reg); 1781 1782 wa_verify(gt, wa, val, wal->name, "application"); 1783 } 1784 } 1785 1786 intel_uncore_forcewake_put__locked(uncore, fw); 1787 spin_unlock(&uncore->lock); 1788 intel_gt_mcr_unlock(gt, flags); 1789 } 1790 1791 void intel_gt_apply_workarounds(struct intel_gt *gt) 1792 { 1793 wa_list_apply(&gt->wa_list); 1794 } 1795 1796 static bool wa_list_verify(struct intel_gt *gt, 1797 const struct i915_wa_list *wal, 1798 const char *from) 1799 { 1800 struct intel_uncore *uncore = gt->uncore; 1801 struct i915_wa *wa; 1802 enum forcewake_domains fw; 1803 unsigned long flags; 1804 unsigned int i; 1805 bool ok = true; 1806 1807 fw = wal_get_fw_for_rmw(uncore, wal); 1808 1809 intel_gt_mcr_lock(gt, &flags); 1810 spin_lock(&uncore->lock); 1811 intel_uncore_forcewake_get__locked(uncore, fw); 1812 1813 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) 1814 ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) : 1816 intel_uncore_read_fw(uncore, wa->reg), 1817 wal->name, from); 1818 1819 intel_uncore_forcewake_put__locked(uncore, fw); 1820 spin_unlock(&uncore->lock); 1821 intel_gt_mcr_unlock(gt, flags); 1822 1823 return ok; 1824 } 1825 1826 bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from) 1827 { 1828 return wa_list_verify(gt, &gt->wa_list, from); 1829 } 1830 1831 __maybe_unused 1832 static bool is_nonpriv_flags_valid(u32 flags) 1833 { 1834 /* Check only valid flag bits are set */ 1835 if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID) 1836 return false; 1837 1838 /* NB: Only 3 out of 4 enum values are valid for access field */ 1839 if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) == 1840 RING_FORCE_TO_NONPRIV_ACCESS_INVALID) 1841 return false; 1842 1843 return true; 1844 } 1845 1846 static void 1847 whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags) 1848 { 1849 struct i915_wa wa = { 1850 .reg = reg 1851 }; 1852 1853 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS)) 1854 return; 1855 1856 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags))) 1857 return; 1858 1859 wa.reg.reg |= flags; 1860 _wa_add(wal, &wa); 1861 } 1862 1863 static void 1864 whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags) 1865 { 1866 struct i915_wa wa = { 1867 .mcr_reg = reg, 1868 .is_mcr = 1, 1869 }; 1870 1871 if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS)) 1872 return; 1873 1874 if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags))) 1875 return; 1876 1877 wa.mcr_reg.reg |= flags; 1878 _wa_add(wal, &wa); 1879 } 1880 1881 static void 1882 whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg) 1883 { 1884 whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW); 1885 } 1886 1887 static void 1888 whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg) 1889 { 1890 whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW); 1891 } 1892 1893 static void gen9_whitelist_build(struct i915_wa_list *w) 1894 { 1895 /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */ 1896 whitelist_reg(w, GEN9_CTX_PREEMPT_REG); 1897 1898 /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */ 1899 whitelist_reg(w, GEN8_CS_CHICKEN1); 1900 1901 /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */ 1902 whitelist_reg(w, GEN8_HDC_CHICKEN1); 1903 1904 /* WaSendPushConstantsFromMMIO:skl,bxt */ 1905 whitelist_reg(w, COMMON_SLICE_CHICKEN2); 1906 } 1907 1908 static void skl_whitelist_build(struct intel_engine_cs *engine) 1909 { 1910 struct i915_wa_list *w = &engine->whitelist; 1911 1912 if (engine->class != RENDER_CLASS) 1913 return; 1914 1915 gen9_whitelist_build(w); 1916 1917 /* WaDisableLSQCROPERFforOCL:skl */ 1918 whitelist_mcr_reg(w, GEN8_L3SQCREG4); 1919 } 1920 1921 static void bxt_whitelist_build(struct intel_engine_cs *engine) 1922 { 1923 if (engine->class != RENDER_CLASS) 1924 return; 1925 1926 gen9_whitelist_build(&engine->whitelist); 1927 } 1928 1929 static void kbl_whitelist_build(struct intel_engine_cs *engine) 1930 { 1931 struct i915_wa_list *w = &engine->whitelist; 1932 1933 if (engine->class != RENDER_CLASS) 1934 return; 1935 1936 gen9_whitelist_build(w); 1937 1938 /* WaDisableLSQCROPERFforOCL:kbl */ 1939 whitelist_mcr_reg(w, GEN8_L3SQCREG4); 1940 } 1941 1942 static void glk_whitelist_build(struct intel_engine_cs *engine) 1943 { 1944 struct i915_wa_list *w = &engine->whitelist; 1945 1946 if (engine->class != RENDER_CLASS) 1947 return; 1948 1949
gen9_whitelist_build(w); 1950 1951 /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */ 1952 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1); 1953 } 1954 1955 static void cfl_whitelist_build(struct intel_engine_cs *engine) 1956 { 1957 struct i915_wa_list *w = &engine->whitelist; 1958 1959 if (engine->class != RENDER_CLASS) 1960 return; 1961 1962 gen9_whitelist_build(w); 1963 1964 /* 1965 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml 1966 * 1967 * This covers 4 register which are next to one another : 1968 * - PS_INVOCATION_COUNT 1969 * - PS_INVOCATION_COUNT_UDW 1970 * - PS_DEPTH_COUNT 1971 * - PS_DEPTH_COUNT_UDW 1972 */ 1973 whitelist_reg_ext(w, PS_INVOCATION_COUNT, 1974 RING_FORCE_TO_NONPRIV_ACCESS_RD | 1975 RING_FORCE_TO_NONPRIV_RANGE_4); 1976 } 1977 1978 static void allow_read_ctx_timestamp(struct intel_engine_cs *engine) 1979 { 1980 struct i915_wa_list *w = &engine->whitelist; 1981 1982 if (engine->class != RENDER_CLASS) 1983 whitelist_reg_ext(w, 1984 RING_CTX_TIMESTAMP(engine->mmio_base), 1985 RING_FORCE_TO_NONPRIV_ACCESS_RD); 1986 } 1987 1988 static void cml_whitelist_build(struct intel_engine_cs *engine) 1989 { 1990 allow_read_ctx_timestamp(engine); 1991 1992 cfl_whitelist_build(engine); 1993 } 1994 1995 static void icl_whitelist_build(struct intel_engine_cs *engine) 1996 { 1997 struct i915_wa_list *w = &engine->whitelist; 1998 1999 allow_read_ctx_timestamp(engine); 2000 2001 switch (engine->class) { 2002 case RENDER_CLASS: 2003 /* WaAllowUMDToModifyHalfSliceChicken7:icl */ 2004 whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7); 2005 2006 /* WaAllowUMDToModifySamplerMode:icl */ 2007 whitelist_mcr_reg(w, GEN10_SAMPLER_MODE); 2008 2009 /* WaEnableStateCacheRedirectToCS:icl */ 2010 whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1); 2011 2012 /* 2013 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl 2014 * 2015 * This covers 4 register which are next to one another : 2016 * - PS_INVOCATION_COUNT 2017 * - PS_INVOCATION_COUNT_UDW 2018 * - PS_DEPTH_COUNT 2019 * - PS_DEPTH_COUNT_UDW 2020 */ 2021 whitelist_reg_ext(w, PS_INVOCATION_COUNT, 2022 RING_FORCE_TO_NONPRIV_ACCESS_RD | 2023 RING_FORCE_TO_NONPRIV_RANGE_4); 2024 break; 2025 2026 case VIDEO_DECODE_CLASS: 2027 /* hucStatusRegOffset */ 2028 whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base), 2029 RING_FORCE_TO_NONPRIV_ACCESS_RD); 2030 /* hucUKernelHdrInfoRegOffset */ 2031 whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base), 2032 RING_FORCE_TO_NONPRIV_ACCESS_RD); 2033 /* hucStatus2RegOffset */ 2034 whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base), 2035 RING_FORCE_TO_NONPRIV_ACCESS_RD); 2036 break; 2037 2038 default: 2039 break; 2040 } 2041 } 2042 2043 static void tgl_whitelist_build(struct intel_engine_cs *engine) 2044 { 2045 struct i915_wa_list *w = &engine->whitelist; 2046 2047 allow_read_ctx_timestamp(engine); 2048 2049 switch (engine->class) { 2050 case RENDER_CLASS: 2051 /* 2052 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl 2053 * Wa_1408556865:tgl 2054 * 2055 * This covers 4 registers which are next to one another : 2056 * - PS_INVOCATION_COUNT 2057 * - PS_INVOCATION_COUNT_UDW 2058 * - PS_DEPTH_COUNT 2059 * - PS_DEPTH_COUNT_UDW 2060 */ 2061 whitelist_reg_ext(w, PS_INVOCATION_COUNT, 2062 RING_FORCE_TO_NONPRIV_ACCESS_RD | 2063 RING_FORCE_TO_NONPRIV_RANGE_4); 2064 2065 /* 2066 * Wa_1808121037:tgl 2067 * Wa_14012131227:dg1 2068 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p 2069 */ 2070 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1); 2071 2072 /* Wa_1806527549:tgl */ 2073 whitelist_reg(w, 
HIZ_CHICKEN); 2074 2075 /* Required by recommended tuning setting (not a workaround) */ 2076 whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3); 2077 2078 break; 2079 default: 2080 break; 2081 } 2082 } 2083 2084 static void dg2_whitelist_build(struct intel_engine_cs *engine) 2085 { 2086 struct i915_wa_list *w = &engine->whitelist; 2087 2088 switch (engine->class) { 2089 case RENDER_CLASS: 2090 /* Required by recommended tuning setting (not a workaround) */ 2091 whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3); 2092 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1); 2093 break; 2094 default: 2095 break; 2096 } 2097 } 2098 2099 static void xelpg_whitelist_build(struct intel_engine_cs *engine) 2100 { 2101 struct i915_wa_list *w = &engine->whitelist; 2102 2103 switch (engine->class) { 2104 case RENDER_CLASS: 2105 /* Required by recommended tuning setting (not a workaround) */ 2106 whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3); 2107 whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1); 2108 break; 2109 default: 2110 break; 2111 } 2112 } 2113 2114 void intel_engine_init_whitelist(struct intel_engine_cs *engine) 2115 { 2116 struct drm_i915_private *i915 = engine->i915; 2117 struct i915_wa_list *w = &engine->whitelist; 2118 2119 wa_init_start(w, engine->gt, "whitelist", engine->name); 2120 2121 if (engine->gt->type == GT_MEDIA) 2122 ; /* none yet */ 2123 else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74))) 2124 xelpg_whitelist_build(engine); 2125 else if (IS_DG2(i915)) 2126 dg2_whitelist_build(engine); 2127 else if (GRAPHICS_VER(i915) == 12) 2128 tgl_whitelist_build(engine); 2129 else if (GRAPHICS_VER(i915) == 11) 2130 icl_whitelist_build(engine); 2131 else if (IS_COMETLAKE(i915)) 2132 cml_whitelist_build(engine); 2133 else if (IS_COFFEELAKE(i915)) 2134 cfl_whitelist_build(engine); 2135 else if (IS_GEMINILAKE(i915)) 2136 glk_whitelist_build(engine); 2137 else if (IS_KABYLAKE(i915)) 2138 kbl_whitelist_build(engine); 2139 else if (IS_BROXTON(i915)) 2140 bxt_whitelist_build(engine); 2141 else if (IS_SKYLAKE(i915)) 2142 skl_whitelist_build(engine); 2143 else if (GRAPHICS_VER(i915) <= 8) 2144 ; 2145 else 2146 MISSING_CASE(GRAPHICS_VER(i915)); 2147 2148 wa_init_finish(w); 2149 } 2150 2151 void intel_engine_apply_whitelist(struct intel_engine_cs *engine) 2152 { 2153 const struct i915_wa_list *wal = &engine->whitelist; 2154 struct intel_uncore *uncore = engine->uncore; 2155 const u32 base = engine->mmio_base; 2156 struct i915_wa *wa; 2157 unsigned int i; 2158 2159 if (!wal->count) 2160 return; 2161 2162 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) 2163 intel_uncore_write(uncore, 2164 RING_FORCE_TO_NONPRIV(base, i), 2165 i915_mmio_reg_offset(wa->reg)); 2166 2167 /* And clear the rest just in case of garbage */ 2168 for (; i < RING_MAX_NONPRIV_SLOTS; i++) 2169 intel_uncore_write(uncore, 2170 RING_FORCE_TO_NONPRIV(base, i), 2171 i915_mmio_reg_offset(RING_NOPID(base))); 2172 } 2173 2174 /* 2175 * engine_fake_wa_init(), a place holder to program the registers 2176 * which are not part of an official workaround defined by the 2177 * hardware team. 2178 * Adding programming of those register inside workaround will 2179 * allow utilizing wa framework to proper application and verification. 
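 *
 * For example, the default MOCS selection for the command streamer is
 * registered below as a masked-field entry (shown here only as a sketch
 * of the call that follows):
 *
 *	wa_masked_field_set(wal, RING_CMD_CCTL(engine->mmio_base),
 *			    CMD_CCTL_MOCS_MASK,
 *			    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
 *
 * so it is applied and verified through the same wa_list machinery as a
 * real workaround.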
2180 */ 2181 static void 2182 engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2183 { 2184 u8 mocs_w, mocs_r; 2185 2186 /* 2187 * RING_CMD_CCTL specifies the default MOCS entry that will be used 2188 * by the command streamer when executing commands that don't have 2189 * a way to explicitly specify a MOCS setting. The default should 2190 * usually reference whichever MOCS entry corresponds to uncached 2191 * behavior, although use of a WB cached entry is recommended by the 2192 * spec in certain circumstances on specific platforms. 2193 */ 2194 if (GRAPHICS_VER(engine->i915) >= 12) { 2195 mocs_r = engine->gt->mocs.uc_index; 2196 mocs_w = engine->gt->mocs.uc_index; 2197 2198 if (HAS_L3_CCS_READ(engine->i915) && 2199 engine->class == COMPUTE_CLASS) { 2200 mocs_r = engine->gt->mocs.wb_index; 2201 2202 /* 2203 * Even on the few platforms where MOCS 0 is a 2204 * legitimate table entry, it's never the correct 2205 * setting to use here; we can assume the MOCS init 2206 * just forgot to initialize wb_index. 2207 */ 2208 drm_WARN_ON(&engine->i915->drm, mocs_r == 0); 2209 } 2210 2211 wa_masked_field_set(wal, 2212 RING_CMD_CCTL(engine->mmio_base), 2213 CMD_CCTL_MOCS_MASK, 2214 CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r)); 2215 } 2216 } 2217 2218 static void 2219 rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2220 { 2221 struct drm_i915_private *i915 = engine->i915; 2222 struct intel_gt *gt = engine->gt; 2223 2224 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 2225 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { 2226 /* Wa_22014600077 */ 2227 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, 2228 ENABLE_EU_COUNT_FOR_TDL_FLUSH); 2229 } 2230 2231 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 2232 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) || 2233 IS_DG2(i915)) { 2234 /* Wa_1509727124 */ 2235 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE, 2236 SC_DISABLE_POWER_OPTIMIZATION_EBB); 2237 } 2238 2239 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 2240 IS_DG2(i915)) { 2241 /* Wa_22012856258 */ 2242 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, 2243 GEN12_DISABLE_READ_SUPPRESSION); 2244 } 2245 2246 if (IS_DG2(i915)) { 2247 /* 2248 * Wa_22010960976:dg2 2249 * Wa_14013347512:dg2 2250 */ 2251 wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0, 2252 LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK); 2253 } 2254 2255 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || 2256 IS_DG2(i915)) { 2257 /* Wa_14015150844 */ 2258 wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0, 2259 _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES), 2260 0, true); 2261 } 2262 2263 if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || 2264 IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { 2265 /* 2266 * Wa_1606700617:tgl,dg1,adl-p 2267 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p 2268 * Wa_14010826681:tgl,dg1,rkl,adl-p 2269 * Wa_18019627453:dg2 2270 */ 2271 wa_masked_en(wal, 2272 GEN9_CS_DEBUG_MODE1, 2273 FF_DOP_CLOCK_GATE_DISABLE); 2274 } 2275 2276 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) || 2277 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { 2278 /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */ 2279 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ); 2280 2281 /* 2282 * Wa_1407928979:tgl A* 2283 * Wa_18011464164:tgl[B0+],dg1[B0+] 2284 * Wa_22010931296:tgl[B0+],dg1[B0+] 2285 * Wa_14010919138:rkl,dg1,adl-s,adl-p 2286 */ 2287 wa_write_or(wal, GEN7_FF_THREAD_MODE, 2288 
GEN12_FF_TESSELATION_DOP_GATE_DISABLE); 2289 2290 /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */ 2291 wa_mcr_masked_en(wal, 2292 GEN10_SAMPLER_MODE, 2293 ENABLE_SMALLPL); 2294 } 2295 2296 if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || 2297 IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { 2298 /* Wa_1409804808 */ 2299 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, 2300 GEN12_PUSH_CONST_DEREF_HOLD_DIS); 2301 2302 /* Wa_14010229206 */ 2303 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH); 2304 } 2305 2306 if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) { 2307 /* 2308 * Wa_1607297627 2309 * 2310 * On TGL and RKL there are multiple entries for this WA in the 2311 * BSpec; some indicate this is an A0-only WA, others indicate 2312 * it applies to all steppings so we trust the "all steppings." 2313 */ 2314 wa_masked_en(wal, 2315 RING_PSMI_CTL(RENDER_RING_BASE), 2316 GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE | 2317 GEN8_RC_SEMA_IDLE_MSG_DISABLE); 2318 } 2319 2320 if (GRAPHICS_VER(i915) == 11) { 2321 /* This is not an Wa. Enable for better image quality */ 2322 wa_masked_en(wal, 2323 _3D_CHICKEN3, 2324 _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE); 2325 2326 /* 2327 * Wa_1405543622:icl 2328 * Formerly known as WaGAPZPriorityScheme 2329 */ 2330 wa_write_or(wal, 2331 GEN8_GARBCNTL, 2332 GEN11_ARBITRATION_PRIO_ORDER_MASK); 2333 2334 /* 2335 * Wa_1604223664:icl 2336 * Formerly known as WaL3BankAddressHashing 2337 */ 2338 wa_write_clr_set(wal, 2339 GEN8_GARBCNTL, 2340 GEN11_HASH_CTRL_EXCL_MASK, 2341 GEN11_HASH_CTRL_EXCL_BIT0); 2342 wa_write_clr_set(wal, 2343 GEN11_GLBLINVL, 2344 GEN11_BANK_HASH_ADDR_EXCL_MASK, 2345 GEN11_BANK_HASH_ADDR_EXCL_BIT0); 2346 2347 /* 2348 * Wa_1405733216:icl 2349 * Formerly known as WaDisableCleanEvicts 2350 */ 2351 wa_mcr_write_or(wal, 2352 GEN8_L3SQCREG4, 2353 GEN11_LQSC_CLEAN_EVICT_DISABLE); 2354 2355 /* Wa_1606682166:icl */ 2356 wa_write_or(wal, 2357 GEN7_SARCHKMD, 2358 GEN7_DISABLE_SAMPLER_PREFETCH); 2359 2360 /* Wa_1409178092:icl */ 2361 wa_mcr_write_clr_set(wal, 2362 GEN11_SCRATCH2, 2363 GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE, 2364 0); 2365 2366 /* WaEnable32PlaneMode:icl */ 2367 wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS, 2368 GEN11_ENABLE_32_PLANE_MODE); 2369 2370 /* 2371 * Wa_1408767742:icl[a2..forever],ehl[all] 2372 * Wa_1605460711:icl[a0..c0] 2373 */ 2374 wa_write_or(wal, 2375 GEN7_FF_THREAD_MODE, 2376 GEN12_FF_TESSELATION_DOP_GATE_DISABLE); 2377 2378 /* Wa_22010271021 */ 2379 wa_masked_en(wal, 2380 GEN9_CS_DEBUG_MODE1, 2381 FF_DOP_CLOCK_GATE_DISABLE); 2382 } 2383 2384 /* 2385 * Intel platforms that support fine-grained preemption (i.e., gen9 and 2386 * beyond) allow the kernel-mode driver to choose between two different 2387 * options for controlling preemption granularity and behavior. 2388 * 2389 * Option 1 (hardware default): 2390 * Preemption settings are controlled in a global manner via 2391 * kernel-only register CS_DEBUG_MODE1 (0x20EC). Any granularity 2392 * and settings chosen by the kernel-mode driver will apply to all 2393 * userspace clients. 2394 * 2395 * Option 2: 2396 * Preemption settings are controlled on a per-context basis via 2397 * register CS_CHICKEN1 (0x2580). CS_CHICKEN1 is saved/restored on 2398 * context switch and is writable by userspace (e.g., via 2399 * MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer) 2400 * which allows different userspace drivers/clients to select 2401 * different settings, or to change those settings on the fly in 2402 * response to runtime needs. 
This option was known by name 2403 * "FtrPerCtxtPreemptionGranularityControl" at one time, although 2404 * that name is somewhat misleading as other non-granularity 2405 * preemption settings are also impacted by this decision. 2406 * 2407 * On Linux, our policy has always been to let userspace drivers 2408 * control preemption granularity/settings (Option 2). This was 2409 * originally mandatory on gen9 to prevent ABI breakage (old gen9 2410 * userspace developed before object-level preemption was enabled would 2411 * not behave well if i915 were to go with Option 1 and enable that 2412 * preemption in a global manner). On gen9 each context would have 2413 * object-level preemption disabled by default (see 2414 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but 2415 * userspace drivers could opt-in to object-level preemption as they 2416 * saw fit. For post-gen9 platforms, we continue to utilize Option 2; 2417 * even though it is no longer necessary for ABI compatibility when 2418 * enabling a new platform, it does ensure that userspace will be able 2419 * to implement any workarounds that show up requiring temporary 2420 * adjustments to preemption behavior at runtime. 2421 * 2422 * Notes/Workarounds: 2423 * - Wa_14015141709: On DG2 and early steppings of MTL, 2424 * CS_CHICKEN1[0] does not disable object-level preemption as 2425 * it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been 2426 * using Option 1). Effectively this means userspace is unable 2427 * to disable object-level preemption on these platforms/steppings 2428 * despite the setting here. 2429 * 2430 * - Wa_16013994831: May require that userspace program 2431 * CS_CHICKEN1[10] when certain runtime conditions are true. 2432 * Userspace requires Option 2 to be in effect for their update of 2433 * CS_CHICKEN1[10] to be effective. 2434 * 2435 * Other workarounds may appear in the future that will also require 2436 * Option 2 behavior to allow proper userspace implementation. 
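 *
 * Purely as an illustrative sketch of Option 2 (not driver code; the
 * exact encoding shown is an assumption), a userspace batch can flip a
 * per-context preemption bit with a masked LRI write:
 *
 *	MI_LOAD_REGISTER_IMM(1)
 *	0x2580			<- CS_CHICKEN1, saved/restored with the context
 *	(bit << 16) | bit	<- masked-register style: enable "bit"
 *
 * The kernel's part is simply to keep GEN9_FFSC_PERCTX_PREEMPT_CTRL set
 * (done right below) so that such per-context updates take effect.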
2437 */ 2438 if (GRAPHICS_VER(i915) >= 9) 2439 wa_masked_en(wal, 2440 GEN7_FF_SLICE_CS_CHICKEN1, 2441 GEN9_FFSC_PERCTX_PREEMPT_CTRL); 2442 2443 if (IS_SKYLAKE(i915) || 2444 IS_KABYLAKE(i915) || 2445 IS_COFFEELAKE(i915) || 2446 IS_COMETLAKE(i915)) { 2447 /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */ 2448 wa_write_or(wal, 2449 GEN8_GARBCNTL, 2450 GEN9_GAPS_TSV_CREDIT_DISABLE); 2451 } 2452 2453 if (IS_BROXTON(i915)) { 2454 /* WaDisablePooledEuLoadBalancingFix:bxt */ 2455 wa_masked_en(wal, 2456 FF_SLICE_CS_CHICKEN2, 2457 GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE); 2458 } 2459 2460 if (GRAPHICS_VER(i915) == 9) { 2461 /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */ 2462 wa_masked_en(wal, 2463 GEN9_CSFE_CHICKEN1_RCS, 2464 GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE); 2465 2466 /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */ 2467 wa_mcr_write_or(wal, 2468 BDW_SCRATCH1, 2469 GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE); 2470 2471 /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */ 2472 if (IS_GEN9_LP(i915)) 2473 wa_mcr_write_clr_set(wal, 2474 GEN8_L3SQCREG1, 2475 L3_PRIO_CREDITS_MASK, 2476 L3_GENERAL_PRIO_CREDITS(62) | 2477 L3_HIGH_PRIO_CREDITS(2)); 2478 2479 /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */ 2480 wa_mcr_write_or(wal, 2481 GEN8_L3SQCREG4, 2482 GEN8_LQSC_FLUSH_COHERENT_LINES); 2483 2484 /* Disable atomics in L3 to prevent unrecoverable hangs */ 2485 wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1, 2486 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0); 2487 wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4, 2488 GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0); 2489 wa_mcr_write_clr_set(wal, GEN9_SCRATCH1, 2490 EVICTION_PERF_FIX_ENABLE, 0); 2491 } 2492 2493 if (IS_HASWELL(i915)) { 2494 /* WaSampleCChickenBitEnable:hsw */ 2495 wa_masked_en(wal, 2496 HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE); 2497 2498 wa_masked_dis(wal, 2499 CACHE_MODE_0_GEN7, 2500 /* enable HiZ Raw Stall Optimization */ 2501 HIZ_RAW_STALL_OPT_DISABLE); 2502 } 2503 2504 if (IS_VALLEYVIEW(i915)) { 2505 /* WaDisableEarlyCull:vlv */ 2506 wa_masked_en(wal, 2507 _3D_CHICKEN3, 2508 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); 2509 2510 /* 2511 * WaVSThreadDispatchOverride:ivb,vlv 2512 * 2513 * This actually overrides the dispatch 2514 * mode for all thread types. 2515 */ 2516 wa_write_clr_set(wal, 2517 GEN7_FF_THREAD_MODE, 2518 GEN7_FF_SCHED_MASK, 2519 GEN7_FF_TS_SCHED_HW | 2520 GEN7_FF_VS_SCHED_HW | 2521 GEN7_FF_DS_SCHED_HW); 2522 2523 /* WaPsdDispatchEnable:vlv */ 2524 /* WaDisablePSDDualDispatchEnable:vlv */ 2525 wa_masked_en(wal, 2526 GEN7_HALF_SLICE_CHICKEN1, 2527 GEN7_MAX_PS_THREAD_DEP | 2528 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); 2529 } 2530 2531 if (IS_IVYBRIDGE(i915)) { 2532 /* WaDisableEarlyCull:ivb */ 2533 wa_masked_en(wal, 2534 _3D_CHICKEN3, 2535 _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); 2536 2537 if (0) { /* causes HiZ corruption on ivb:gt1 */ 2538 /* enable HiZ Raw Stall Optimization */ 2539 wa_masked_dis(wal, 2540 CACHE_MODE_0_GEN7, 2541 HIZ_RAW_STALL_OPT_DISABLE); 2542 } 2543 2544 /* 2545 * WaVSThreadDispatchOverride:ivb,vlv 2546 * 2547 * This actually overrides the dispatch 2548 * mode for all thread types. 
2549 */ 2550 wa_write_clr_set(wal, 2551 GEN7_FF_THREAD_MODE, 2552 GEN7_FF_SCHED_MASK, 2553 GEN7_FF_TS_SCHED_HW | 2554 GEN7_FF_VS_SCHED_HW | 2555 GEN7_FF_DS_SCHED_HW); 2556 2557 /* WaDisablePSDDualDispatchEnable:ivb */ 2558 if (INTEL_INFO(i915)->gt == 1) 2559 wa_masked_en(wal, 2560 GEN7_HALF_SLICE_CHICKEN1, 2561 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); 2562 } 2563 2564 if (GRAPHICS_VER(i915) == 7) { 2565 /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */ 2566 wa_masked_en(wal, 2567 RING_MODE_GEN7(RENDER_RING_BASE), 2568 GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE); 2569 2570 /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */ 2571 wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); 2572 2573 /* 2574 * BSpec says this must be set, even though 2575 * WaDisable4x2SubspanOptimization:ivb,hsw 2576 * WaDisable4x2SubspanOptimization isn't listed for VLV. 2577 */ 2578 wa_masked_en(wal, 2579 CACHE_MODE_1, 2580 PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); 2581 2582 /* 2583 * BSpec recommends 8x4 when MSAA is used, 2584 * however in practice 16x4 seems fastest. 2585 * 2586 * Note that PS/WM thread counts depend on the WIZ hashing 2587 * disable bit, which we don't touch here, but it's good 2588 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). 2589 */ 2590 wa_masked_field_set(wal, 2591 GEN7_GT_MODE, 2592 GEN6_WIZ_HASHING_MASK, 2593 GEN6_WIZ_HASHING_16x4); 2594 } 2595 2596 if (IS_GRAPHICS_VER(i915, 6, 7)) 2597 /* 2598 * We need to disable the AsyncFlip performance optimisations in 2599 * order to use MI_WAIT_FOR_EVENT within the CS. It should 2600 * already be programmed to '1' on all products. 2601 * 2602 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv 2603 */ 2604 wa_masked_en(wal, 2605 RING_MI_MODE(RENDER_RING_BASE), 2606 ASYNC_FLIP_PERF_DISABLE); 2607 2608 if (GRAPHICS_VER(i915) == 6) { 2609 /* 2610 * Required for the hardware to program scanline values for 2611 * waiting 2612 * WaEnableFlushTlbInvalidationMode:snb 2613 */ 2614 wa_masked_en(wal, 2615 GFX_MODE, 2616 GFX_TLB_INVALIDATE_EXPLICIT); 2617 2618 /* WaDisableHiZPlanesWhenMSAAEnabled:snb */ 2619 wa_masked_en(wal, 2620 _3D_CHICKEN, 2621 _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB); 2622 2623 wa_masked_en(wal, 2624 _3D_CHICKEN3, 2625 /* WaStripsFansDisableFastClipPerformanceFix:snb */ 2626 _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL | 2627 /* 2628 * Bspec says: 2629 * "This bit must be set if 3DSTATE_CLIP clip mode is set 2630 * to normal and 3DSTATE_SF number of SF output attributes 2631 * is more than 16." 2632 */ 2633 _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH); 2634 2635 /* 2636 * BSpec recommends 8x4 when MSAA is used, 2637 * however in practice 16x4 seems fastest. 2638 * 2639 * Note that PS/WM thread counts depend on the WIZ hashing 2640 * disable bit, which we don't touch here, but it's good 2641 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). 2642 */ 2643 wa_masked_field_set(wal, 2644 GEN6_GT_MODE, 2645 GEN6_WIZ_HASHING_MASK, 2646 GEN6_WIZ_HASHING_16x4); 2647 2648 /* WaDisable_RenderCache_OperationalFlush:snb */ 2649 wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); 2650 2651 /* 2652 * From the Sandybridge PRM, volume 1 part 3, page 24: 2653 * "If this bit is set, STCunit will have LRA as replacement 2654 * policy. [...] This bit must be reset. LRA replacement 2655 * policy is not supported." 
2656 */ 2657 wa_masked_dis(wal, 2658 CACHE_MODE_0, 2659 CM0_STC_EVICT_DISABLE_LRA_SNB); 2660 } 2661 2662 if (IS_GRAPHICS_VER(i915, 4, 6)) 2663 /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */ 2664 wa_add(wal, RING_MI_MODE(RENDER_RING_BASE), 2665 0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH), 2666 /* XXX bit doesn't stick on Broadwater */ 2667 IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true); 2668 2669 if (GRAPHICS_VER(i915) == 4) 2670 /* 2671 * Disable CONSTANT_BUFFER before it is loaded from the context 2672 * image. For as it is loaded, it is executed and the stored 2673 * address may no longer be valid, leading to a GPU hang. 2674 * 2675 * This imposes the requirement that userspace reload their 2676 * CONSTANT_BUFFER on every batch, fortunately a requirement 2677 * they are already accustomed to from before contexts were 2678 * enabled. 2679 */ 2680 wa_add(wal, ECOSKPD(RENDER_RING_BASE), 2681 0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE), 2682 0 /* XXX bit doesn't stick on Broadwater */, 2683 true); 2684 } 2685 2686 static void 2687 xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2688 { 2689 struct drm_i915_private *i915 = engine->i915; 2690 2691 /* WaKBLVECSSemaphoreWaitPoll:kbl */ 2692 if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) { 2693 wa_write(wal, 2694 RING_SEMA_WAIT_POLL(engine->mmio_base), 2695 1); 2696 } 2697 /* Wa_16018031267, Wa_16018063123 */ 2698 if (NEEDS_FASTCOLOR_BLT_WABB(engine)) 2699 wa_masked_field_set(wal, ECOSKPD(engine->mmio_base), 2700 XEHP_BLITTER_SCHEDULING_MODE_MASK, 2701 XEHP_BLITTER_ROUND_ROBIN_MODE); 2702 } 2703 2704 static void 2705 ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2706 { 2707 /* boilerplate for any CCS engine workaround */ 2708 } 2709 2710 /* 2711 * The bspec performance guide has recommended MMIO tuning settings. These 2712 * aren't truly "workarounds" but we want to program them with the same 2713 * workaround infrastructure to ensure that they're automatically added to 2714 * the GuC save/restore lists, re-applied at the right times, and checked for 2715 * any conflicting programming requested by real workarounds. 2716 * 2717 * Programming settings should be added here only if their registers are not 2718 * part of an engine's register state context. If a register is part of a 2719 * context, then any tuning settings should be programmed in an appropriate 2720 * function invoked by __intel_engine_init_ctx_wa(). 2721 */ 2722 static void 2723 add_render_compute_tuning_settings(struct intel_gt *gt, 2724 struct i915_wa_list *wal) 2725 { 2726 struct drm_i915_private *i915 = gt->i915; 2727 2728 if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915)) 2729 wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512); 2730 2731 /* 2732 * This tuning setting proves beneficial only on ATS-M designs; the 2733 * default "age based" setting is optimal on regular DG2 and other 2734 * platforms. 
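 *
 * Note that the selection below is keyed off the per-device
 * tuning_thread_rr_after_dep flag rather than an explicit platform
 * check, so only SKUs that declare it pick up the
 * round-robin-after-dependency thread arbitration mode.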
2735 */ 2736 if (INTEL_INFO(i915)->tuning_thread_rr_after_dep) 2737 wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE, 2738 THREAD_EX_ARB_MODE_RR_AFTER_DEP); 2739 2740 if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 55)) 2741 wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC); 2742 } 2743 2744 static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2745 { 2746 struct intel_gt *gt = engine->gt; 2747 u32 mode; 2748 2749 if (!IS_DG2(gt->i915)) 2750 return; 2751 2752 /* 2753 * Wa_14019159160: This workaround, along with others, leads to 2754 * significant challenges in utilizing load balancing among the 2755 * CCS slices. Consequently, an architectural decision has been 2756 * made to completely disable automatic CCS load balancing. 2757 */ 2758 wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE); 2759 2760 /* 2761 * After having disabled automatic load balancing we need to 2762 * assign all slices to a single CCS. We will call it CCS mode 1 2763 */ 2764 mode = intel_gt_apply_ccs_mode(gt); 2765 wa_masked_en(wal, XEHP_CCS_MODE, mode); 2766 } 2767 2768 /* 2769 * The workarounds in this function apply to shared registers in 2770 * the general render reset domain that aren't tied to a 2771 * specific engine. Since all render+compute engines get reset 2772 * together, and the contents of these registers are lost during 2773 * the shared render domain reset, we'll define such workarounds 2774 * here and then add them to just a single RCS or CCS engine's 2775 * workaround list (whichever engine has the XXXX flag). 2776 */ 2777 static void 2778 general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2779 { 2780 struct drm_i915_private *i915 = engine->i915; 2781 struct intel_gt *gt = engine->gt; 2782 2783 add_render_compute_tuning_settings(gt, wal); 2784 2785 if (GRAPHICS_VER(i915) >= 11) { 2786 /* This is not a Wa (although referred to as 2787 * WaSetInidrectStateOverride in places), this allows 2788 * applications that reference sampler states through 2789 * the BindlessSamplerStateBaseAddress to have their 2790 * border color relative to DynamicStateBaseAddress 2791 * rather than BindlessSamplerStateBaseAddress. 
2792 * 2793 * Otherwise SAMPLER_STATE border colors have to be 2794 * copied in multiple heaps (DynamicStateBaseAddress & 2795 * BindlessSamplerStateBaseAddress) 2796 * 2797 * BSpec: 46052 2798 */ 2799 wa_mcr_masked_en(wal, 2800 GEN10_SAMPLER_MODE, 2801 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE); 2802 } 2803 2804 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) || 2805 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) || 2806 IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) { 2807 /* Wa_14017856879 */ 2808 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH); 2809 2810 /* Wa_14020495402 */ 2811 wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING); 2812 } 2813 2814 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 2815 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) 2816 /* 2817 * Wa_14017066071 2818 * Wa_14017654203 2819 */ 2820 wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE, 2821 MTL_DISABLE_SAMPLER_SC_OOO); 2822 2823 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) 2824 /* Wa_22015279794 */ 2825 wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, 2826 DISABLE_PREFETCH_INTO_IC); 2827 2828 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 2829 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) || 2830 IS_DG2(i915)) { 2831 /* Wa_22013037850 */ 2832 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, 2833 DISABLE_128B_EVICTION_COMMAND_UDW); 2834 2835 /* Wa_18017747507 */ 2836 wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE); 2837 } 2838 2839 if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || 2840 IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) || 2841 IS_DG2(i915)) { 2842 /* Wa_22014226127 */ 2843 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE); 2844 } 2845 2846 if (IS_DG2(i915)) { 2847 /* Wa_14015227452:dg2,pvc */ 2848 wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE); 2849 2850 /* 2851 * Wa_16011620976:dg2_g11 2852 * Wa_22015475538:dg2 2853 */ 2854 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8); 2855 2856 /* Wa_18028616096 */ 2857 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3); 2858 } 2859 2860 if (IS_DG2_G11(i915)) { 2861 /* 2862 * Wa_22012826095:dg2 2863 * Wa_22013059131:dg2 2864 */ 2865 wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW, 2866 MAXREQS_PER_BANK, 2867 REG_FIELD_PREP(MAXREQS_PER_BANK, 2)); 2868 2869 /* Wa_22013059131:dg2 */ 2870 wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, 2871 FORCE_1_SUB_MESSAGE_PER_FRAGMENT); 2872 2873 /* 2874 * Wa_22012654132 2875 * 2876 * Note that register 0xE420 is write-only and cannot be read 2877 * back for verification on DG2 (due to Wa_14012342262), so 2878 * we need to explicitly skip the readback. 2879 */ 2880 wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0, 2881 _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC), 2882 0 /* write-only, so skip validation */, 2883 true); 2884 } 2885 } 2886 2887 static void 2888 engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal) 2889 { 2890 if (GRAPHICS_VER(engine->i915) < 4) 2891 return; 2892 2893 engine_fake_wa_init(engine, wal); 2894 2895 /* 2896 * These are common workarounds that just need to applied 2897 * to a single RCS/CCS engine's workaround list since 2898 * they're reset as part of the general render domain reset. 
2899 */ 2900 if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) { 2901 general_render_compute_wa_init(engine, wal); 2902 ccs_engine_wa_mode(engine, wal); 2903 } 2904 2905 if (engine->class == COMPUTE_CLASS) 2906 ccs_engine_wa_init(engine, wal); 2907 else if (engine->class == RENDER_CLASS) 2908 rcs_engine_wa_init(engine, wal); 2909 else 2910 xcs_engine_wa_init(engine, wal); 2911 } 2912 2913 void intel_engine_init_workarounds(struct intel_engine_cs *engine) 2914 { 2915 struct i915_wa_list *wal = &engine->wa_list; 2916 2917 wa_init_start(wal, engine->gt, "engine", engine->name); 2918 engine_init_workarounds(engine, wal); 2919 wa_init_finish(wal); 2920 } 2921 2922 void intel_engine_apply_workarounds(struct intel_engine_cs *engine) 2923 { 2924 wa_list_apply(&engine->wa_list); 2925 } 2926 2927 static const struct i915_range mcr_ranges_gen8[] = { 2928 { .start = 0x5500, .end = 0x55ff }, 2929 { .start = 0x7000, .end = 0x7fff }, 2930 { .start = 0x9400, .end = 0x97ff }, 2931 { .start = 0xb000, .end = 0xb3ff }, 2932 { .start = 0xe000, .end = 0xe7ff }, 2933 {}, 2934 }; 2935 2936 static const struct i915_range mcr_ranges_gen12[] = { 2937 { .start = 0x8150, .end = 0x815f }, 2938 { .start = 0x9520, .end = 0x955f }, 2939 { .start = 0xb100, .end = 0xb3ff }, 2940 { .start = 0xde80, .end = 0xe8ff }, 2941 { .start = 0x24a00, .end = 0x24a7f }, 2942 {}, 2943 }; 2944 2945 static const struct i915_range mcr_ranges_xehp[] = { 2946 { .start = 0x4000, .end = 0x4aff }, 2947 { .start = 0x5200, .end = 0x52ff }, 2948 { .start = 0x5400, .end = 0x7fff }, 2949 { .start = 0x8140, .end = 0x815f }, 2950 { .start = 0x8c80, .end = 0x8dff }, 2951 { .start = 0x94d0, .end = 0x955f }, 2952 { .start = 0x9680, .end = 0x96ff }, 2953 { .start = 0xb000, .end = 0xb3ff }, 2954 { .start = 0xc800, .end = 0xcfff }, 2955 { .start = 0xd800, .end = 0xd8ff }, 2956 { .start = 0xdc00, .end = 0xffff }, 2957 { .start = 0x17000, .end = 0x17fff }, 2958 { .start = 0x24a00, .end = 0x24a7f }, 2959 {}, 2960 }; 2961 2962 static bool mcr_range(struct drm_i915_private *i915, u32 offset) 2963 { 2964 const struct i915_range *mcr_ranges; 2965 int i; 2966 2967 if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55)) 2968 mcr_ranges = mcr_ranges_xehp; 2969 else if (GRAPHICS_VER(i915) >= 12) 2970 mcr_ranges = mcr_ranges_gen12; 2971 else if (GRAPHICS_VER(i915) >= 8) 2972 mcr_ranges = mcr_ranges_gen8; 2973 else 2974 return false; 2975 2976 /* 2977 * Registers in these ranges are affected by the MCR selector 2978 * which only controls CPU initiated MMIO. Routing does not 2979 * work for CS access so we cannot verify them on this path. 
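 *
 * Both wa_list_srm() and the result check in engine_wa_list_verify()
 * below therefore skip any register for which mcr_range() returns true.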
2980 */ 2981 for (i = 0; mcr_ranges[i].start; i++) 2982 if (offset >= mcr_ranges[i].start && 2983 offset <= mcr_ranges[i].end) 2984 return true; 2985 2986 return false; 2987 } 2988 2989 static int 2990 wa_list_srm(struct i915_request *rq, 2991 const struct i915_wa_list *wal, 2992 struct i915_vma *vma) 2993 { 2994 struct drm_i915_private *i915 = rq->i915; 2995 unsigned int i, count = 0; 2996 const struct i915_wa *wa; 2997 u32 srm, *cs; 2998 2999 srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; 3000 if (GRAPHICS_VER(i915) >= 8) 3001 srm++; 3002 3003 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { 3004 if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg))) 3005 count++; 3006 } 3007 3008 cs = intel_ring_begin(rq, 4 * count); 3009 if (IS_ERR(cs)) 3010 return PTR_ERR(cs); 3011 3012 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { 3013 u32 offset = i915_mmio_reg_offset(wa->reg); 3014 3015 if (mcr_range(i915, offset)) 3016 continue; 3017 3018 *cs++ = srm; 3019 *cs++ = offset; 3020 *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i; 3021 *cs++ = 0; 3022 } 3023 intel_ring_advance(rq, cs); 3024 3025 return 0; 3026 } 3027 3028 static int engine_wa_list_verify(struct intel_context *ce, 3029 const struct i915_wa_list * const wal, 3030 const char *from) 3031 { 3032 const struct i915_wa *wa; 3033 struct i915_request *rq; 3034 struct i915_vma *vma; 3035 struct i915_gem_ww_ctx ww; 3036 unsigned int i; 3037 u32 *results; 3038 int err; 3039 3040 if (!wal->count) 3041 return 0; 3042 3043 vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm, 3044 wal->count * sizeof(u32)); 3045 if (IS_ERR(vma)) 3046 return PTR_ERR(vma); 3047 3048 intel_engine_pm_get(ce->engine); 3049 i915_gem_ww_ctx_init(&ww, false); 3050 retry: 3051 err = i915_gem_object_lock(vma->obj, &ww); 3052 if (err == 0) 3053 err = intel_context_pin_ww(ce, &ww); 3054 if (err) 3055 goto err_pm; 3056 3057 err = i915_vma_pin_ww(vma, &ww, 0, 0, 3058 i915_vma_is_ggtt(vma) ? 
PIN_GLOBAL : PIN_USER); 3059 if (err) 3060 goto err_unpin; 3061 3062 rq = i915_request_create(ce); 3063 if (IS_ERR(rq)) { 3064 err = PTR_ERR(rq); 3065 goto err_vma; 3066 } 3067 3068 err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); 3069 if (err == 0) 3070 err = wa_list_srm(rq, wal, vma); 3071 3072 i915_request_get(rq); 3073 if (err) 3074 i915_request_set_error_once(rq, err); 3075 i915_request_add(rq); 3076 3077 if (err) 3078 goto err_rq; 3079 3080 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 3081 err = -ETIME; 3082 goto err_rq; 3083 } 3084 3085 results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); 3086 if (IS_ERR(results)) { 3087 err = PTR_ERR(results); 3088 goto err_rq; 3089 } 3090 3091 err = 0; 3092 for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { 3093 if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg))) 3094 continue; 3095 3096 if (!wa_verify(wal->gt, wa, results[i], wal->name, from)) 3097 err = -ENXIO; 3098 } 3099 3100 i915_gem_object_unpin_map(vma->obj); 3101 3102 err_rq: 3103 i915_request_put(rq); 3104 err_vma: 3105 i915_vma_unpin(vma); 3106 err_unpin: 3107 intel_context_unpin(ce); 3108 err_pm: 3109 if (err == -EDEADLK) { 3110 err = i915_gem_ww_ctx_backoff(&ww); 3111 if (!err) 3112 goto retry; 3113 } 3114 i915_gem_ww_ctx_fini(&ww); 3115 intel_engine_pm_put(ce->engine); 3116 i915_vma_put(vma); 3117 return err; 3118 } 3119 3120 int intel_engine_verify_workarounds(struct intel_engine_cs *engine, 3121 const char *from) 3122 { 3123 return engine_wa_list_verify(engine->kernel_context, 3124 &engine->wa_list, 3125 from); 3126 } 3127 3128 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 3129 #include "selftest_workarounds.c" 3130 #endif 3131