1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2008-2018 Intel Corporation 4 */ 5 6 #include <linux/sched/mm.h> 7 #include <linux/stop_machine.h> 8 9 #include "display/intel_display.h" 10 #include "display/intel_overlay.h" 11 12 #include "gem/i915_gem_context.h" 13 14 #include "gt/intel_gt_regs.h" 15 16 #include "i915_drv.h" 17 #include "i915_gpu_error.h" 18 #include "i915_irq.h" 19 #include "intel_breadcrumbs.h" 20 #include "intel_engine_pm.h" 21 #include "intel_engine_regs.h" 22 #include "intel_gt.h" 23 #include "intel_gt_pm.h" 24 #include "intel_gt_requests.h" 25 #include "intel_pci_config.h" 26 #include "intel_reset.h" 27 28 #include "uc/intel_guc.h" 29 30 #define RESET_MAX_RETRIES 3 31 32 /* XXX How to handle concurrent GGTT updates using tiling registers? */ 33 #define RESET_UNDER_STOP_MACHINE 0 34 35 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) 36 { 37 intel_uncore_rmw_fw(uncore, reg, 0, set); 38 } 39 40 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) 41 { 42 intel_uncore_rmw_fw(uncore, reg, clr, 0); 43 } 44 45 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned) 46 { 47 struct drm_i915_file_private *file_priv = ctx->file_priv; 48 unsigned long prev_hang; 49 unsigned int score; 50 51 if (IS_ERR_OR_NULL(file_priv)) 52 return; 53 54 score = 0; 55 if (banned) 56 score = I915_CLIENT_SCORE_CONTEXT_BAN; 57 58 prev_hang = xchg(&file_priv->hang_timestamp, jiffies); 59 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) 60 score += I915_CLIENT_SCORE_HANG_FAST; 61 62 if (score) { 63 atomic_add(score, &file_priv->ban_score); 64 65 drm_dbg(&ctx->i915->drm, 66 "client %s: gained %u ban score, now %u\n", 67 ctx->name, score, 68 atomic_read(&file_priv->ban_score)); 69 } 70 } 71 72 static bool mark_guilty(struct i915_request *rq) 73 { 74 struct i915_gem_context *ctx; 75 unsigned long prev_hang; 76 bool banned; 77 int i; 78 79 if (intel_context_is_closed(rq->context)) 80 return true; 81 82 rcu_read_lock(); 83 ctx = rcu_dereference(rq->context->gem_context); 84 if (ctx && !kref_get_unless_zero(&ctx->ref)) 85 ctx = NULL; 86 rcu_read_unlock(); 87 if (!ctx) 88 return intel_context_is_banned(rq->context); 89 90 atomic_inc(&ctx->guilty_count); 91 92 /* Cool contexts are too cool to be banned! (Used for reset testing.) */ 93 if (!i915_gem_context_is_bannable(ctx)) { 94 banned = false; 95 goto out; 96 } 97 98 drm_notice(&ctx->i915->drm, 99 "%s context reset due to GPU hang\n", 100 ctx->name); 101 102 /* Record the timestamp for the last N hangs */ 103 prev_hang = ctx->hang_timestamp[0]; 104 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) 105 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; 106 ctx->hang_timestamp[i] = jiffies; 107 108 /* If we have hung N+1 times in rapid succession, we ban the context! */ 109 banned = !i915_gem_context_is_recoverable(ctx); 110 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) 111 banned = true; 112 if (banned) 113 drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n", 114 ctx->name, atomic_read(&ctx->guilty_count)); 115 116 client_mark_guilty(ctx, banned); 117 118 out: 119 i915_gem_context_put(ctx); 120 return banned; 121 } 122 123 static void mark_innocent(struct i915_request *rq) 124 { 125 struct i915_gem_context *ctx; 126 127 rcu_read_lock(); 128 ctx = rcu_dereference(rq->context->gem_context); 129 if (ctx) 130 atomic_inc(&ctx->active_count); 131 rcu_read_unlock(); 132 } 133 134 void __i915_request_reset(struct i915_request *rq, bool guilty) 135 { 136 bool banned = false; 137 138 RQ_TRACE(rq, "guilty? %s\n", yesno(guilty)); 139 GEM_BUG_ON(__i915_request_is_complete(rq)); 140 141 rcu_read_lock(); /* protect the GEM context */ 142 if (guilty) { 143 i915_request_set_error_once(rq, -EIO); 144 __i915_request_skip(rq); 145 banned = mark_guilty(rq); 146 } else { 147 i915_request_set_error_once(rq, -EAGAIN); 148 mark_innocent(rq); 149 } 150 rcu_read_unlock(); 151 152 if (banned) 153 intel_context_ban(rq->context, rq); 154 } 155 156 static bool i915_in_reset(struct pci_dev *pdev) 157 { 158 u8 gdrst; 159 160 pci_read_config_byte(pdev, I915_GDRST, &gdrst); 161 return gdrst & GRDOM_RESET_STATUS; 162 } 163 164 static int i915_do_reset(struct intel_gt *gt, 165 intel_engine_mask_t engine_mask, 166 unsigned int retry) 167 { 168 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 169 int err; 170 171 /* Assert reset for at least 20 usec, and wait for acknowledgement. */ 172 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 173 udelay(50); 174 err = wait_for_atomic(i915_in_reset(pdev), 50); 175 176 /* Clear the reset request. */ 177 pci_write_config_byte(pdev, I915_GDRST, 0); 178 udelay(50); 179 if (!err) 180 err = wait_for_atomic(!i915_in_reset(pdev), 50); 181 182 return err; 183 } 184 185 static bool g4x_reset_complete(struct pci_dev *pdev) 186 { 187 u8 gdrst; 188 189 pci_read_config_byte(pdev, I915_GDRST, &gdrst); 190 return (gdrst & GRDOM_RESET_ENABLE) == 0; 191 } 192 193 static int g33_do_reset(struct intel_gt *gt, 194 intel_engine_mask_t engine_mask, 195 unsigned int retry) 196 { 197 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 198 199 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); 200 return wait_for_atomic(g4x_reset_complete(pdev), 50); 201 } 202 203 static int g4x_do_reset(struct intel_gt *gt, 204 intel_engine_mask_t engine_mask, 205 unsigned int retry) 206 { 207 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev); 208 struct intel_uncore *uncore = gt->uncore; 209 int ret; 210 211 /* WaVcpClkGateDisableForMediaReset:ctg,elk */ 212 rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 213 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 214 215 pci_write_config_byte(pdev, I915_GDRST, 216 GRDOM_MEDIA | GRDOM_RESET_ENABLE); 217 ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 218 if (ret) { 219 GT_TRACE(gt, "Wait for media reset failed\n"); 220 goto out; 221 } 222 223 pci_write_config_byte(pdev, I915_GDRST, 224 GRDOM_RENDER | GRDOM_RESET_ENABLE); 225 ret = wait_for_atomic(g4x_reset_complete(pdev), 50); 226 if (ret) { 227 GT_TRACE(gt, "Wait for render reset failed\n"); 228 goto out; 229 } 230 231 out: 232 pci_write_config_byte(pdev, I915_GDRST, 0); 233 234 rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); 235 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); 236 237 return ret; 238 } 239 240 static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask, 241 unsigned int retry) 242 { 243 struct intel_uncore *uncore = gt->uncore; 244 int ret; 245 246 intel_uncore_write_fw(uncore, ILK_GDSR, 247 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); 248 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 249 ILK_GRDOM_RESET_ENABLE, 0, 250 5000, 0, 251 NULL); 252 if (ret) { 253 GT_TRACE(gt, "Wait for render reset failed\n"); 254 goto out; 255 } 256 257 intel_uncore_write_fw(uncore, ILK_GDSR, 258 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); 259 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, 260 ILK_GRDOM_RESET_ENABLE, 0, 261 5000, 0, 262 NULL); 263 if (ret) { 264 GT_TRACE(gt, "Wait for media reset failed\n"); 265 goto out; 266 } 267 268 out: 269 intel_uncore_write_fw(uncore, ILK_GDSR, 0); 270 intel_uncore_posting_read_fw(uncore, ILK_GDSR); 271 return ret; 272 } 273 274 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ 275 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask) 276 { 277 struct intel_uncore *uncore = gt->uncore; 278 int err; 279 280 /* 281 * GEN6_GDRST is not in the gt power well, no need to check 282 * for fifo space for the write or forcewake the chip for 283 * the read 284 */ 285 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); 286 287 /* Wait for the device to ack the reset requests */ 288 err = __intel_wait_for_register_fw(uncore, 289 GEN6_GDRST, hw_domain_mask, 0, 290 500, 0, 291 NULL); 292 if (err) 293 GT_TRACE(gt, 294 "Wait for 0x%08x engines reset failed\n", 295 hw_domain_mask); 296 297 return err; 298 } 299 300 static int gen6_reset_engines(struct intel_gt *gt, 301 intel_engine_mask_t engine_mask, 302 unsigned int retry) 303 { 304 struct intel_engine_cs *engine; 305 u32 hw_mask; 306 307 if (engine_mask == ALL_ENGINES) { 308 hw_mask = GEN6_GRDOM_FULL; 309 } else { 310 intel_engine_mask_t tmp; 311 312 hw_mask = 0; 313 for_each_engine_masked(engine, gt, engine_mask, tmp) { 314 hw_mask |= engine->reset_domain; 315 } 316 } 317 318 return gen6_hw_domain_reset(gt, hw_mask); 319 } 320 321 static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine) 322 { 323 int vecs_id; 324 325 GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS); 326 327 vecs_id = _VECS((engine->instance) / 2); 328 329 return engine->gt->engine[vecs_id]; 330 } 331 332 struct sfc_lock_data { 333 i915_reg_t lock_reg; 334 i915_reg_t ack_reg; 335 i915_reg_t usage_reg; 336 u32 lock_bit; 337 u32 ack_bit; 338 u32 usage_bit; 339 u32 reset_bit; 340 }; 341 342 static void get_sfc_forced_lock_data(struct intel_engine_cs *engine, 343 struct sfc_lock_data *sfc_lock) 344 { 345 switch (engine->class) { 346 default: 347 MISSING_CASE(engine->class); 348 fallthrough; 349 case VIDEO_DECODE_CLASS: 350 sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine); 351 sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; 352 353 sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine); 354 sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; 355 356 sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine); 357 sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT; 358 sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); 359 360 break; 361 case VIDEO_ENHANCEMENT_CLASS: 362 sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine); 363 sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; 364 365 sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine); 366 sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; 367 368 sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine); 369 sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT; 370 sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); 371 372 break; 373 } 374 } 375 376 static int gen11_lock_sfc(struct intel_engine_cs *engine, 377 u32 *reset_mask, 378 u32 *unlock_mask) 379 { 380 struct intel_uncore *uncore = engine->uncore; 381 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 382 struct sfc_lock_data sfc_lock; 383 bool lock_obtained, lock_to_other = false; 384 int ret; 385 386 switch (engine->class) { 387 case VIDEO_DECODE_CLASS: 388 if ((BIT(engine->instance) & vdbox_sfc_access) == 0) 389 return 0; 390 391 fallthrough; 392 case VIDEO_ENHANCEMENT_CLASS: 393 get_sfc_forced_lock_data(engine, &sfc_lock); 394 395 break; 396 default: 397 return 0; 398 } 399 400 if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) { 401 struct intel_engine_cs *paired_vecs; 402 403 if (engine->class != VIDEO_DECODE_CLASS || 404 GRAPHICS_VER(engine->i915) != 12) 405 return 0; 406 407 /* 408 * Wa_14010733141 409 * 410 * If the VCS-MFX isn't using the SFC, we also need to check 411 * whether VCS-HCP is using it. If so, we need to issue a *VE* 412 * forced lock on the VE engine that shares the same SFC. 413 */ 414 if (!(intel_uncore_read_fw(uncore, 415 GEN12_HCP_SFC_LOCK_STATUS(engine)) & 416 GEN12_HCP_SFC_USAGE_BIT)) 417 return 0; 418 419 paired_vecs = find_sfc_paired_vecs_engine(engine); 420 get_sfc_forced_lock_data(paired_vecs, &sfc_lock); 421 lock_to_other = true; 422 *unlock_mask |= paired_vecs->mask; 423 } else { 424 *unlock_mask |= engine->mask; 425 } 426 427 /* 428 * If the engine is using an SFC, tell the engine that a software reset 429 * is going to happen. The engine will then try to force lock the SFC. 430 * If SFC ends up being locked to the engine we want to reset, we have 431 * to reset it as well (we will unlock it once the reset sequence is 432 * completed). 433 */ 434 rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit); 435 436 ret = __intel_wait_for_register_fw(uncore, 437 sfc_lock.ack_reg, 438 sfc_lock.ack_bit, 439 sfc_lock.ack_bit, 440 1000, 0, NULL); 441 442 /* 443 * Was the SFC released while we were trying to lock it? 444 * 445 * We should reset both the engine and the SFC if: 446 * - We were locking the SFC to this engine and the lock succeeded 447 * OR 448 * - We were locking the SFC to a different engine (Wa_14010733141) 449 * but the SFC was released before the lock was obtained. 450 * 451 * Otherwise we need only reset the engine by itself and we can 452 * leave the SFC alone. 453 */ 454 lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & 455 sfc_lock.usage_bit) != 0; 456 if (lock_obtained == lock_to_other) 457 return 0; 458 459 if (ret) { 460 ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n"); 461 return ret; 462 } 463 464 *reset_mask |= sfc_lock.reset_bit; 465 return 0; 466 } 467 468 static void gen11_unlock_sfc(struct intel_engine_cs *engine) 469 { 470 struct intel_uncore *uncore = engine->uncore; 471 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access; 472 struct sfc_lock_data sfc_lock = {}; 473 474 if (engine->class != VIDEO_DECODE_CLASS && 475 engine->class != VIDEO_ENHANCEMENT_CLASS) 476 return; 477 478 if (engine->class == VIDEO_DECODE_CLASS && 479 (BIT(engine->instance) & vdbox_sfc_access) == 0) 480 return; 481 482 get_sfc_forced_lock_data(engine, &sfc_lock); 483 484 rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit); 485 } 486 487 static int gen11_reset_engines(struct intel_gt *gt, 488 intel_engine_mask_t engine_mask, 489 unsigned int retry) 490 { 491 struct intel_engine_cs *engine; 492 intel_engine_mask_t tmp; 493 u32 reset_mask, unlock_mask = 0; 494 int ret; 495 496 if (engine_mask == ALL_ENGINES) { 497 reset_mask = GEN11_GRDOM_FULL; 498 } else { 499 reset_mask = 0; 500 for_each_engine_masked(engine, gt, engine_mask, tmp) { 501 reset_mask |= engine->reset_domain; 502 ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask); 503 if (ret) 504 goto sfc_unlock; 505 } 506 } 507 508 ret = gen6_hw_domain_reset(gt, reset_mask); 509 510 sfc_unlock: 511 /* 512 * We unlock the SFC based on the lock status and not the result of 513 * gen11_lock_sfc to make sure that we clean properly if something 514 * wrong happened during the lock (e.g. lock acquired after timeout 515 * expiration). 516 * 517 * Due to Wa_14010733141, we may have locked an SFC to an engine that 518 * wasn't being reset. So instead of calling gen11_unlock_sfc() 519 * on engine_mask, we instead call it on the mask of engines that our 520 * gen11_lock_sfc() calls told us actually had locks attempted. 521 */ 522 for_each_engine_masked(engine, gt, unlock_mask, tmp) 523 gen11_unlock_sfc(engine); 524 525 return ret; 526 } 527 528 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) 529 { 530 struct intel_uncore *uncore = engine->uncore; 531 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); 532 u32 request, mask, ack; 533 int ret; 534 535 if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1))) 536 return -ETIMEDOUT; 537 538 ack = intel_uncore_read_fw(uncore, reg); 539 if (ack & RESET_CTL_CAT_ERROR) { 540 /* 541 * For catastrophic errors, ready-for-reset sequence 542 * needs to be bypassed: HAS#396813 543 */ 544 request = RESET_CTL_CAT_ERROR; 545 mask = RESET_CTL_CAT_ERROR; 546 547 /* Catastrophic errors need to be cleared by HW */ 548 ack = 0; 549 } else if (!(ack & RESET_CTL_READY_TO_RESET)) { 550 request = RESET_CTL_REQUEST_RESET; 551 mask = RESET_CTL_READY_TO_RESET; 552 ack = RESET_CTL_READY_TO_RESET; 553 } else { 554 return 0; 555 } 556 557 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); 558 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, 559 700, 0, NULL); 560 if (ret) 561 drm_err(&engine->i915->drm, 562 "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", 563 engine->name, request, 564 intel_uncore_read_fw(uncore, reg)); 565 566 return ret; 567 } 568 569 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) 570 { 571 intel_uncore_write_fw(engine->uncore, 572 RING_RESET_CTL(engine->mmio_base), 573 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); 574 } 575 576 static int gen8_reset_engines(struct intel_gt *gt, 577 intel_engine_mask_t engine_mask, 578 unsigned int retry) 579 { 580 struct intel_engine_cs *engine; 581 const bool reset_non_ready = retry >= 1; 582 intel_engine_mask_t tmp; 583 int ret; 584 585 for_each_engine_masked(engine, gt, engine_mask, tmp) { 586 ret = gen8_engine_reset_prepare(engine); 587 if (ret && !reset_non_ready) 588 goto skip_reset; 589 590 /* 591 * If this is not the first failed attempt to prepare, 592 * we decide to proceed anyway. 593 * 594 * By doing so we risk context corruption and with 595 * some gens (kbl), possible system hang if reset 596 * happens during active bb execution. 597 * 598 * We rather take context corruption instead of 599 * failed reset with a wedged driver/gpu. And 600 * active bb execution case should be covered by 601 * stop_engines() we have before the reset. 602 */ 603 } 604 605 if (GRAPHICS_VER(gt->i915) >= 11) 606 ret = gen11_reset_engines(gt, engine_mask, retry); 607 else 608 ret = gen6_reset_engines(gt, engine_mask, retry); 609 610 skip_reset: 611 for_each_engine_masked(engine, gt, engine_mask, tmp) 612 gen8_engine_reset_cancel(engine); 613 614 return ret; 615 } 616 617 static int mock_reset(struct intel_gt *gt, 618 intel_engine_mask_t mask, 619 unsigned int retry) 620 { 621 return 0; 622 } 623 624 typedef int (*reset_func)(struct intel_gt *, 625 intel_engine_mask_t engine_mask, 626 unsigned int retry); 627 628 static reset_func intel_get_gpu_reset(const struct intel_gt *gt) 629 { 630 struct drm_i915_private *i915 = gt->i915; 631 632 if (is_mock_gt(gt)) 633 return mock_reset; 634 else if (GRAPHICS_VER(i915) >= 8) 635 return gen8_reset_engines; 636 else if (GRAPHICS_VER(i915) >= 6) 637 return gen6_reset_engines; 638 else if (GRAPHICS_VER(i915) >= 5) 639 return ilk_do_reset; 640 else if (IS_G4X(i915)) 641 return g4x_do_reset; 642 else if (IS_G33(i915) || IS_PINEVIEW(i915)) 643 return g33_do_reset; 644 else if (GRAPHICS_VER(i915) >= 3) 645 return i915_do_reset; 646 else 647 return NULL; 648 } 649 650 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) 651 { 652 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; 653 reset_func reset; 654 int ret = -ETIMEDOUT; 655 int retry; 656 657 reset = intel_get_gpu_reset(gt); 658 if (!reset) 659 return -ENODEV; 660 661 /* 662 * If the power well sleeps during the reset, the reset 663 * request may be dropped and never completes (causing -EIO). 664 */ 665 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 666 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { 667 GT_TRACE(gt, "engine_mask=%x\n", engine_mask); 668 preempt_disable(); 669 ret = reset(gt, engine_mask, retry); 670 preempt_enable(); 671 } 672 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 673 674 return ret; 675 } 676 677 bool intel_has_gpu_reset(const struct intel_gt *gt) 678 { 679 if (!gt->i915->params.reset) 680 return NULL; 681 682 return intel_get_gpu_reset(gt); 683 } 684 685 bool intel_has_reset_engine(const struct intel_gt *gt) 686 { 687 if (gt->i915->params.reset < 2) 688 return false; 689 690 return INTEL_INFO(gt->i915)->has_reset_engine; 691 } 692 693 int intel_reset_guc(struct intel_gt *gt) 694 { 695 u32 guc_domain = 696 GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; 697 int ret; 698 699 GEM_BUG_ON(!HAS_GT_UC(gt->i915)); 700 701 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 702 ret = gen6_hw_domain_reset(gt, guc_domain); 703 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 704 705 return ret; 706 } 707 708 /* 709 * Ensure irq handler finishes, and not run again. 710 * Also return the active request so that we only search for it once. 711 */ 712 static void reset_prepare_engine(struct intel_engine_cs *engine) 713 { 714 /* 715 * During the reset sequence, we must prevent the engine from 716 * entering RC6. As the context state is undefined until we restart 717 * the engine, if it does enter RC6 during the reset, the state 718 * written to the powercontext is undefined and so we may lose 719 * GPU state upon resume, i.e. fail to restart after a reset. 720 */ 721 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); 722 if (engine->reset.prepare) 723 engine->reset.prepare(engine); 724 } 725 726 static void revoke_mmaps(struct intel_gt *gt) 727 { 728 int i; 729 730 for (i = 0; i < gt->ggtt->num_fences; i++) { 731 struct drm_vma_offset_node *node; 732 struct i915_vma *vma; 733 u64 vma_offset; 734 735 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma); 736 if (!vma) 737 continue; 738 739 if (!i915_vma_has_userfault(vma)) 740 continue; 741 742 GEM_BUG_ON(vma->fence != >->ggtt->fence_regs[i]); 743 744 if (!vma->mmo) 745 continue; 746 747 node = &vma->mmo->vma_node; 748 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; 749 750 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping, 751 drm_vma_node_offset_addr(node) + vma_offset, 752 vma->size, 753 1); 754 } 755 } 756 757 static intel_engine_mask_t reset_prepare(struct intel_gt *gt) 758 { 759 struct intel_engine_cs *engine; 760 intel_engine_mask_t awake = 0; 761 enum intel_engine_id id; 762 763 for_each_engine(engine, gt, id) { 764 if (intel_engine_pm_get_if_awake(engine)) 765 awake |= engine->mask; 766 reset_prepare_engine(engine); 767 } 768 769 intel_uc_reset_prepare(>->uc); 770 771 return awake; 772 } 773 774 static void gt_revoke(struct intel_gt *gt) 775 { 776 revoke_mmaps(gt); 777 } 778 779 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 780 { 781 struct intel_engine_cs *engine; 782 enum intel_engine_id id; 783 int err; 784 785 /* 786 * Everything depends on having the GTT running, so we need to start 787 * there. 788 */ 789 err = i915_ggtt_enable_hw(gt->i915); 790 if (err) 791 return err; 792 793 local_bh_disable(); 794 for_each_engine(engine, gt, id) 795 __intel_engine_reset(engine, stalled_mask & engine->mask); 796 local_bh_enable(); 797 798 intel_uc_reset(>->uc, true); 799 800 intel_ggtt_restore_fences(gt->ggtt); 801 802 return err; 803 } 804 805 static void reset_finish_engine(struct intel_engine_cs *engine) 806 { 807 if (engine->reset.finish) 808 engine->reset.finish(engine); 809 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); 810 811 intel_engine_signal_breadcrumbs(engine); 812 } 813 814 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake) 815 { 816 struct intel_engine_cs *engine; 817 enum intel_engine_id id; 818 819 for_each_engine(engine, gt, id) { 820 reset_finish_engine(engine); 821 if (awake & engine->mask) 822 intel_engine_pm_put(engine); 823 } 824 825 intel_uc_reset_finish(>->uc); 826 } 827 828 static void nop_submit_request(struct i915_request *request) 829 { 830 RQ_TRACE(request, "-EIO\n"); 831 832 request = i915_request_mark_eio(request); 833 if (request) { 834 i915_request_submit(request); 835 intel_engine_signal_breadcrumbs(request->engine); 836 837 i915_request_put(request); 838 } 839 } 840 841 static void __intel_gt_set_wedged(struct intel_gt *gt) 842 { 843 struct intel_engine_cs *engine; 844 intel_engine_mask_t awake; 845 enum intel_engine_id id; 846 847 if (test_bit(I915_WEDGED, >->reset.flags)) 848 return; 849 850 GT_TRACE(gt, "start\n"); 851 852 /* 853 * First, stop submission to hw, but do not yet complete requests by 854 * rolling the global seqno forward (since this would complete requests 855 * for which we haven't set the fence error to EIO yet). 856 */ 857 awake = reset_prepare(gt); 858 859 /* Even if the GPU reset fails, it should still stop the engines */ 860 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 861 __intel_gt_reset(gt, ALL_ENGINES); 862 863 for_each_engine(engine, gt, id) 864 engine->submit_request = nop_submit_request; 865 866 /* 867 * Make sure no request can slip through without getting completed by 868 * either this call here to intel_engine_write_global_seqno, or the one 869 * in nop_submit_request. 870 */ 871 synchronize_rcu_expedited(); 872 set_bit(I915_WEDGED, >->reset.flags); 873 874 /* Mark all executing requests as skipped */ 875 local_bh_disable(); 876 for_each_engine(engine, gt, id) 877 if (engine->reset.cancel) 878 engine->reset.cancel(engine); 879 intel_uc_cancel_requests(>->uc); 880 local_bh_enable(); 881 882 reset_finish(gt, awake); 883 884 GT_TRACE(gt, "end\n"); 885 } 886 887 void intel_gt_set_wedged(struct intel_gt *gt) 888 { 889 intel_wakeref_t wakeref; 890 891 if (test_bit(I915_WEDGED, >->reset.flags)) 892 return; 893 894 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 895 mutex_lock(>->reset.mutex); 896 897 if (GEM_SHOW_DEBUG()) { 898 struct drm_printer p = drm_debug_printer(__func__); 899 struct intel_engine_cs *engine; 900 enum intel_engine_id id; 901 902 drm_printf(&p, "called from %pS\n", (void *)_RET_IP_); 903 for_each_engine(engine, gt, id) { 904 if (intel_engine_is_idle(engine)) 905 continue; 906 907 intel_engine_dump(engine, &p, "%s\n", engine->name); 908 } 909 } 910 911 __intel_gt_set_wedged(gt); 912 913 mutex_unlock(>->reset.mutex); 914 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 915 } 916 917 static bool __intel_gt_unset_wedged(struct intel_gt *gt) 918 { 919 struct intel_gt_timelines *timelines = >->timelines; 920 struct intel_timeline *tl; 921 bool ok; 922 923 if (!test_bit(I915_WEDGED, >->reset.flags)) 924 return true; 925 926 /* Never fully initialised, recovery impossible */ 927 if (intel_gt_has_unrecoverable_error(gt)) 928 return false; 929 930 GT_TRACE(gt, "start\n"); 931 932 /* 933 * Before unwedging, make sure that all pending operations 934 * are flushed and errored out - we may have requests waiting upon 935 * third party fences. We marked all inflight requests as EIO, and 936 * every execbuf since returned EIO, for consistency we want all 937 * the currently pending requests to also be marked as EIO, which 938 * is done inside our nop_submit_request - and so we must wait. 939 * 940 * No more can be submitted until we reset the wedged bit. 941 */ 942 spin_lock(&timelines->lock); 943 list_for_each_entry(tl, &timelines->active_list, link) { 944 struct dma_fence *fence; 945 946 fence = i915_active_fence_get(&tl->last_request); 947 if (!fence) 948 continue; 949 950 spin_unlock(&timelines->lock); 951 952 /* 953 * All internal dependencies (i915_requests) will have 954 * been flushed by the set-wedge, but we may be stuck waiting 955 * for external fences. These should all be capped to 10s 956 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded 957 * in the worst case. 958 */ 959 dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT); 960 dma_fence_put(fence); 961 962 /* Restart iteration after droping lock */ 963 spin_lock(&timelines->lock); 964 tl = list_entry(&timelines->active_list, typeof(*tl), link); 965 } 966 spin_unlock(&timelines->lock); 967 968 /* We must reset pending GPU events before restoring our submission */ 969 ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */ 970 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 971 ok = __intel_gt_reset(gt, ALL_ENGINES) == 0; 972 if (!ok) { 973 /* 974 * Warn CI about the unrecoverable wedged condition. 975 * Time for a reboot. 976 */ 977 add_taint_for_CI(gt->i915, TAINT_WARN); 978 return false; 979 } 980 981 /* 982 * Undo nop_submit_request. We prevent all new i915 requests from 983 * being queued (by disallowing execbuf whilst wedged) so having 984 * waited for all active requests above, we know the system is idle 985 * and do not have to worry about a thread being inside 986 * engine->submit_request() as we swap over. So unlike installing 987 * the nop_submit_request on reset, we can do this from normal 988 * context and do not require stop_machine(). 989 */ 990 intel_engines_reset_default_submission(gt); 991 992 GT_TRACE(gt, "end\n"); 993 994 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ 995 clear_bit(I915_WEDGED, >->reset.flags); 996 997 return true; 998 } 999 1000 bool intel_gt_unset_wedged(struct intel_gt *gt) 1001 { 1002 bool result; 1003 1004 mutex_lock(>->reset.mutex); 1005 result = __intel_gt_unset_wedged(gt); 1006 mutex_unlock(>->reset.mutex); 1007 1008 return result; 1009 } 1010 1011 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask) 1012 { 1013 int err, i; 1014 1015 err = __intel_gt_reset(gt, ALL_ENGINES); 1016 for (i = 0; err && i < RESET_MAX_RETRIES; i++) { 1017 msleep(10 * (i + 1)); 1018 err = __intel_gt_reset(gt, ALL_ENGINES); 1019 } 1020 if (err) 1021 return err; 1022 1023 return gt_reset(gt, stalled_mask); 1024 } 1025 1026 static int resume(struct intel_gt *gt) 1027 { 1028 struct intel_engine_cs *engine; 1029 enum intel_engine_id id; 1030 int ret; 1031 1032 for_each_engine(engine, gt, id) { 1033 ret = intel_engine_resume(engine); 1034 if (ret) 1035 return ret; 1036 } 1037 1038 return 0; 1039 } 1040 1041 /** 1042 * intel_gt_reset - reset chip after a hang 1043 * @gt: #intel_gt to reset 1044 * @stalled_mask: mask of the stalled engines with the guilty requests 1045 * @reason: user error message for why we are resetting 1046 * 1047 * Reset the chip. Useful if a hang is detected. Marks the device as wedged 1048 * on failure. 1049 * 1050 * Procedure is fairly simple: 1051 * - reset the chip using the reset reg 1052 * - re-init context state 1053 * - re-init hardware status page 1054 * - re-init ring buffer 1055 * - re-init interrupt state 1056 * - re-init display 1057 */ 1058 void intel_gt_reset(struct intel_gt *gt, 1059 intel_engine_mask_t stalled_mask, 1060 const char *reason) 1061 { 1062 intel_engine_mask_t awake; 1063 int ret; 1064 1065 GT_TRACE(gt, "flags=%lx\n", gt->reset.flags); 1066 1067 might_sleep(); 1068 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, >->reset.flags)); 1069 1070 /* 1071 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence 1072 * critical section like gpu reset. 1073 */ 1074 gt_revoke(gt); 1075 1076 mutex_lock(>->reset.mutex); 1077 1078 /* Clear any previous failed attempts at recovery. Time to try again. */ 1079 if (!__intel_gt_unset_wedged(gt)) 1080 goto unlock; 1081 1082 if (reason) 1083 drm_notice(>->i915->drm, 1084 "Resetting chip for %s\n", reason); 1085 atomic_inc(>->i915->gpu_error.reset_count); 1086 1087 awake = reset_prepare(gt); 1088 1089 if (!intel_has_gpu_reset(gt)) { 1090 if (gt->i915->params.reset) 1091 drm_err(>->i915->drm, "GPU reset not supported\n"); 1092 else 1093 drm_dbg(>->i915->drm, "GPU reset disabled\n"); 1094 goto error; 1095 } 1096 1097 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1098 intel_runtime_pm_disable_interrupts(gt->i915); 1099 1100 if (do_reset(gt, stalled_mask)) { 1101 drm_err(>->i915->drm, "Failed to reset chip\n"); 1102 goto taint; 1103 } 1104 1105 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display) 1106 intel_runtime_pm_enable_interrupts(gt->i915); 1107 1108 intel_overlay_reset(gt->i915); 1109 1110 /* 1111 * Next we need to restore the context, but we don't use those 1112 * yet either... 1113 * 1114 * Ring buffer needs to be re-initialized in the KMS case, or if X 1115 * was running at the time of the reset (i.e. we weren't VT 1116 * switched away). 1117 */ 1118 ret = intel_gt_init_hw(gt); 1119 if (ret) { 1120 drm_err(>->i915->drm, 1121 "Failed to initialise HW following reset (%d)\n", 1122 ret); 1123 goto taint; 1124 } 1125 1126 ret = resume(gt); 1127 if (ret) 1128 goto taint; 1129 1130 finish: 1131 reset_finish(gt, awake); 1132 unlock: 1133 mutex_unlock(>->reset.mutex); 1134 return; 1135 1136 taint: 1137 /* 1138 * History tells us that if we cannot reset the GPU now, we 1139 * never will. This then impacts everything that is run 1140 * subsequently. On failing the reset, we mark the driver 1141 * as wedged, preventing further execution on the GPU. 1142 * We also want to go one step further and add a taint to the 1143 * kernel so that any subsequent faults can be traced back to 1144 * this failure. This is important for CI, where if the 1145 * GPU/driver fails we would like to reboot and restart testing 1146 * rather than continue on into oblivion. For everyone else, 1147 * the system should still plod along, but they have been warned! 1148 */ 1149 add_taint_for_CI(gt->i915, TAINT_WARN); 1150 error: 1151 __intel_gt_set_wedged(gt); 1152 goto finish; 1153 } 1154 1155 static int intel_gt_reset_engine(struct intel_engine_cs *engine) 1156 { 1157 return __intel_gt_reset(engine->gt, engine->mask); 1158 } 1159 1160 int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg) 1161 { 1162 struct intel_gt *gt = engine->gt; 1163 int ret; 1164 1165 ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags); 1166 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, >->reset.flags)); 1167 1168 if (intel_engine_uses_guc(engine)) 1169 return -ENODEV; 1170 1171 if (!intel_engine_pm_get_if_awake(engine)) 1172 return 0; 1173 1174 reset_prepare_engine(engine); 1175 1176 if (msg) 1177 drm_notice(&engine->i915->drm, 1178 "Resetting %s for %s\n", engine->name, msg); 1179 atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]); 1180 1181 ret = intel_gt_reset_engine(engine); 1182 if (ret) { 1183 /* If we fail here, we expect to fallback to a global reset */ 1184 ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret); 1185 goto out; 1186 } 1187 1188 /* 1189 * The request that caused the hang is stuck on elsp, we know the 1190 * active request and can drop it, adjust head to skip the offending 1191 * request to resume executing remaining requests in the queue. 1192 */ 1193 __intel_engine_reset(engine, true); 1194 1195 /* 1196 * The engine and its registers (and workarounds in case of render) 1197 * have been reset to their default values. Follow the init_ring 1198 * process to program RING_MODE, HWSP and re-enable submission. 1199 */ 1200 ret = intel_engine_resume(engine); 1201 1202 out: 1203 intel_engine_cancel_stop_cs(engine); 1204 reset_finish_engine(engine); 1205 intel_engine_pm_put_async(engine); 1206 return ret; 1207 } 1208 1209 /** 1210 * intel_engine_reset - reset GPU engine to recover from a hang 1211 * @engine: engine to reset 1212 * @msg: reason for GPU reset; or NULL for no drm_notice() 1213 * 1214 * Reset a specific GPU engine. Useful if a hang is detected. 1215 * Returns zero on successful reset or otherwise an error code. 1216 * 1217 * Procedure is: 1218 * - identifies the request that caused the hang and it is dropped 1219 * - reset engine (which will force the engine to idle) 1220 * - re-init/configure engine 1221 */ 1222 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg) 1223 { 1224 int err; 1225 1226 local_bh_disable(); 1227 err = __intel_engine_reset_bh(engine, msg); 1228 local_bh_enable(); 1229 1230 return err; 1231 } 1232 1233 static void intel_gt_reset_global(struct intel_gt *gt, 1234 u32 engine_mask, 1235 const char *reason) 1236 { 1237 struct kobject *kobj = >->i915->drm.primary->kdev->kobj; 1238 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; 1239 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; 1240 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; 1241 struct intel_wedge_me w; 1242 1243 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); 1244 1245 GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask); 1246 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); 1247 1248 /* Use a watchdog to ensure that our reset completes */ 1249 intel_wedge_on_timeout(&w, gt, 5 * HZ) { 1250 intel_display_prepare_reset(gt->i915); 1251 1252 /* Flush everyone using a resource about to be clobbered */ 1253 synchronize_srcu_expedited(>->reset.backoff_srcu); 1254 1255 intel_gt_reset(gt, engine_mask, reason); 1256 1257 intel_display_finish_reset(gt->i915); 1258 } 1259 1260 if (!test_bit(I915_WEDGED, >->reset.flags)) 1261 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); 1262 } 1263 1264 /** 1265 * intel_gt_handle_error - handle a gpu error 1266 * @gt: the intel_gt 1267 * @engine_mask: mask representing engines that are hung 1268 * @flags: control flags 1269 * @fmt: Error message format string 1270 * 1271 * Do some basic checking of register state at error time and 1272 * dump it to the syslog. Also call i915_capture_error_state() to make 1273 * sure we get a record and make it available in debugfs. Fire a uevent 1274 * so userspace knows something bad happened (should trigger collection 1275 * of a ring dump etc.). 1276 */ 1277 void intel_gt_handle_error(struct intel_gt *gt, 1278 intel_engine_mask_t engine_mask, 1279 unsigned long flags, 1280 const char *fmt, ...) 1281 { 1282 struct intel_engine_cs *engine; 1283 intel_wakeref_t wakeref; 1284 intel_engine_mask_t tmp; 1285 char error_msg[80]; 1286 char *msg = NULL; 1287 1288 if (fmt) { 1289 va_list args; 1290 1291 va_start(args, fmt); 1292 vscnprintf(error_msg, sizeof(error_msg), fmt, args); 1293 va_end(args); 1294 1295 msg = error_msg; 1296 } 1297 1298 /* 1299 * In most cases it's guaranteed that we get here with an RPM 1300 * reference held, for example because there is a pending GPU 1301 * request that won't finish until the reset is done. This 1302 * isn't the case at least when we get here by doing a 1303 * simulated reset via debugfs, so get an RPM reference. 1304 */ 1305 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 1306 1307 engine_mask &= gt->info.engine_mask; 1308 1309 if (flags & I915_ERROR_CAPTURE) { 1310 i915_capture_error_state(gt, engine_mask); 1311 intel_gt_clear_error_registers(gt, engine_mask); 1312 } 1313 1314 /* 1315 * Try engine reset when available. We fall back to full reset if 1316 * single reset fails. 1317 */ 1318 if (!intel_uc_uses_guc_submission(>->uc) && 1319 intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) { 1320 local_bh_disable(); 1321 for_each_engine_masked(engine, gt, engine_mask, tmp) { 1322 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); 1323 if (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1324 >->reset.flags)) 1325 continue; 1326 1327 if (__intel_engine_reset_bh(engine, msg) == 0) 1328 engine_mask &= ~engine->mask; 1329 1330 clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, 1331 >->reset.flags); 1332 } 1333 local_bh_enable(); 1334 } 1335 1336 if (!engine_mask) 1337 goto out; 1338 1339 /* Full reset needs the mutex, stop any other user trying to do so. */ 1340 if (test_and_set_bit(I915_RESET_BACKOFF, >->reset.flags)) { 1341 wait_event(gt->reset.queue, 1342 !test_bit(I915_RESET_BACKOFF, >->reset.flags)); 1343 goto out; /* piggy-back on the other reset */ 1344 } 1345 1346 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ 1347 synchronize_rcu_expedited(); 1348 1349 /* 1350 * Prevent any other reset-engine attempt. We don't do this for GuC 1351 * submission the GuC owns the per-engine reset, not the i915. 1352 */ 1353 if (!intel_uc_uses_guc_submission(>->uc)) { 1354 for_each_engine(engine, gt, tmp) { 1355 while (test_and_set_bit(I915_RESET_ENGINE + engine->id, 1356 >->reset.flags)) 1357 wait_on_bit(>->reset.flags, 1358 I915_RESET_ENGINE + engine->id, 1359 TASK_UNINTERRUPTIBLE); 1360 } 1361 } 1362 1363 intel_gt_reset_global(gt, engine_mask, msg); 1364 1365 if (!intel_uc_uses_guc_submission(>->uc)) { 1366 for_each_engine(engine, gt, tmp) 1367 clear_bit_unlock(I915_RESET_ENGINE + engine->id, 1368 >->reset.flags); 1369 } 1370 clear_bit_unlock(I915_RESET_BACKOFF, >->reset.flags); 1371 smp_mb__after_atomic(); 1372 wake_up_all(>->reset.queue); 1373 1374 out: 1375 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 1376 } 1377 1378 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu) 1379 { 1380 might_lock(>->reset.backoff_srcu); 1381 might_sleep(); 1382 1383 rcu_read_lock(); 1384 while (test_bit(I915_RESET_BACKOFF, >->reset.flags)) { 1385 rcu_read_unlock(); 1386 1387 if (wait_event_interruptible(gt->reset.queue, 1388 !test_bit(I915_RESET_BACKOFF, 1389 >->reset.flags))) 1390 return -EINTR; 1391 1392 rcu_read_lock(); 1393 } 1394 *srcu = srcu_read_lock(>->reset.backoff_srcu); 1395 rcu_read_unlock(); 1396 1397 return 0; 1398 } 1399 1400 void intel_gt_reset_unlock(struct intel_gt *gt, int tag) 1401 __releases(>->reset.backoff_srcu) 1402 { 1403 srcu_read_unlock(>->reset.backoff_srcu, tag); 1404 } 1405 1406 int intel_gt_terminally_wedged(struct intel_gt *gt) 1407 { 1408 might_sleep(); 1409 1410 if (!intel_gt_is_wedged(gt)) 1411 return 0; 1412 1413 if (intel_gt_has_unrecoverable_error(gt)) 1414 return -EIO; 1415 1416 /* Reset still in progress? Maybe we will recover? */ 1417 if (wait_event_interruptible(gt->reset.queue, 1418 !test_bit(I915_RESET_BACKOFF, 1419 >->reset.flags))) 1420 return -EINTR; 1421 1422 return intel_gt_is_wedged(gt) ? -EIO : 0; 1423 } 1424 1425 void intel_gt_set_wedged_on_init(struct intel_gt *gt) 1426 { 1427 BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES > 1428 I915_WEDGED_ON_INIT); 1429 intel_gt_set_wedged(gt); 1430 i915_disable_error_state(gt->i915, -ENODEV); 1431 set_bit(I915_WEDGED_ON_INIT, >->reset.flags); 1432 1433 /* Wedged on init is non-recoverable */ 1434 add_taint_for_CI(gt->i915, TAINT_WARN); 1435 } 1436 1437 void intel_gt_set_wedged_on_fini(struct intel_gt *gt) 1438 { 1439 intel_gt_set_wedged(gt); 1440 i915_disable_error_state(gt->i915, -ENODEV); 1441 set_bit(I915_WEDGED_ON_FINI, >->reset.flags); 1442 intel_gt_retire_requests(gt); /* cleanup any wedged requests */ 1443 } 1444 1445 void intel_gt_init_reset(struct intel_gt *gt) 1446 { 1447 init_waitqueue_head(>->reset.queue); 1448 mutex_init(>->reset.mutex); 1449 init_srcu_struct(>->reset.backoff_srcu); 1450 1451 /* 1452 * While undesirable to wait inside the shrinker, complain anyway. 1453 * 1454 * If we have to wait during shrinking, we guarantee forward progress 1455 * by forcing the reset. Therefore during the reset we must not 1456 * re-enter the shrinker. By declaring that we take the reset mutex 1457 * within the shrinker, we forbid ourselves from performing any 1458 * fs-reclaim or taking related locks during reset. 1459 */ 1460 i915_gem_shrinker_taints_mutex(gt->i915, >->reset.mutex); 1461 1462 /* no GPU until we are ready! */ 1463 __set_bit(I915_WEDGED, >->reset.flags); 1464 } 1465 1466 void intel_gt_fini_reset(struct intel_gt *gt) 1467 { 1468 cleanup_srcu_struct(>->reset.backoff_srcu); 1469 } 1470 1471 static void intel_wedge_me(struct work_struct *work) 1472 { 1473 struct intel_wedge_me *w = container_of(work, typeof(*w), work.work); 1474 1475 drm_err(&w->gt->i915->drm, 1476 "%s timed out, cancelling all in-flight rendering.\n", 1477 w->name); 1478 intel_gt_set_wedged(w->gt); 1479 } 1480 1481 void __intel_init_wedge(struct intel_wedge_me *w, 1482 struct intel_gt *gt, 1483 long timeout, 1484 const char *name) 1485 { 1486 w->gt = gt; 1487 w->name = name; 1488 1489 INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me); 1490 schedule_delayed_work(&w->work, timeout); 1491 } 1492 1493 void __intel_fini_wedge(struct intel_wedge_me *w) 1494 { 1495 cancel_delayed_work_sync(&w->work); 1496 destroy_delayed_work_on_stack(&w->work); 1497 w->gt = NULL; 1498 } 1499 1500 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1501 #include "selftest_reset.c" 1502 #include "selftest_hangcheck.c" 1503 #endif 1504