1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2016 Intel Corporation 4 */ 5 6 #include <linux/kthread.h> 7 8 #include "gem/i915_gem_context.h" 9 10 #include "intel_gt.h" 11 #include "intel_engine_heartbeat.h" 12 #include "intel_engine_pm.h" 13 #include "selftest_engine_heartbeat.h" 14 15 #include "i915_selftest.h" 16 #include "selftests/i915_random.h" 17 #include "selftests/igt_flush_test.h" 18 #include "selftests/igt_reset.h" 19 #include "selftests/igt_atomic.h" 20 #include "selftests/igt_spinner.h" 21 #include "selftests/intel_scheduler_helpers.h" 22 23 #include "selftests/mock_drm.h" 24 25 #include "gem/selftests/mock_context.h" 26 #include "gem/selftests/igt_gem_utils.h" 27 28 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ 29 30 struct hang { 31 struct intel_gt *gt; 32 struct drm_i915_gem_object *hws; 33 struct drm_i915_gem_object *obj; 34 struct i915_gem_context *ctx; 35 u32 *seqno; 36 u32 *batch; 37 }; 38 39 static int hang_init(struct hang *h, struct intel_gt *gt) 40 { 41 void *vaddr; 42 int err; 43 44 memset(h, 0, sizeof(*h)); 45 h->gt = gt; 46 47 h->ctx = kernel_context(gt->i915, NULL); 48 if (IS_ERR(h->ctx)) 49 return PTR_ERR(h->ctx); 50 51 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); 52 53 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 54 if (IS_ERR(h->hws)) { 55 err = PTR_ERR(h->hws); 56 goto err_ctx; 57 } 58 59 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 60 if (IS_ERR(h->obj)) { 61 err = PTR_ERR(h->obj); 62 goto err_hws; 63 } 64 65 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); 66 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB); 67 if (IS_ERR(vaddr)) { 68 err = PTR_ERR(vaddr); 69 goto err_obj; 70 } 71 h->seqno = memset(vaddr, 0xff, PAGE_SIZE); 72 73 vaddr = i915_gem_object_pin_map_unlocked(h->obj, 74 i915_coherent_map_type(gt->i915, h->obj, false)); 75 if (IS_ERR(vaddr)) { 76 err = PTR_ERR(vaddr); 77 goto err_unpin_hws; 78 } 79 h->batch = vaddr; 80 81 return 0; 82 83 err_unpin_hws: 84 i915_gem_object_unpin_map(h->hws); 85 err_obj: 86 i915_gem_object_put(h->obj); 87 err_hws: 88 i915_gem_object_put(h->hws); 89 err_ctx: 90 kernel_context_close(h->ctx); 91 return err; 92 } 93 94 static u64 hws_address(const struct i915_vma *hws, 95 const struct i915_request *rq) 96 { 97 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); 98 } 99 100 static int move_to_active(struct i915_vma *vma, 101 struct i915_request *rq, 102 unsigned int flags) 103 { 104 int err; 105 106 i915_vma_lock(vma); 107 err = i915_request_await_object(rq, vma->obj, 108 flags & EXEC_OBJECT_WRITE); 109 if (err == 0) 110 err = i915_vma_move_to_active(vma, rq, flags); 111 i915_vma_unlock(vma); 112 113 return err; 114 } 115 116 static struct i915_request * 117 hang_create_request(struct hang *h, struct intel_engine_cs *engine) 118 { 119 struct intel_gt *gt = h->gt; 120 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx); 121 struct drm_i915_gem_object *obj; 122 struct i915_request *rq = NULL; 123 struct i915_vma *hws, *vma; 124 unsigned int flags; 125 void *vaddr; 126 u32 *batch; 127 int err; 128 129 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 130 if (IS_ERR(obj)) { 131 i915_vm_put(vm); 132 return ERR_CAST(obj); 133 } 134 135 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false)); 136 if (IS_ERR(vaddr)) { 137 i915_gem_object_put(obj); 138 i915_vm_put(vm); 139 return ERR_CAST(vaddr); 140 } 141 142 i915_gem_object_unpin_map(h->obj); 143 i915_gem_object_put(h->obj); 144 145 h->obj = obj; 146 h->batch = vaddr; 147 148 vma = i915_vma_instance(h->obj, vm, NULL); 149 if (IS_ERR(vma)) { 150 i915_vm_put(vm); 151 return ERR_CAST(vma); 152 } 153 154 hws = i915_vma_instance(h->hws, vm, NULL); 155 if (IS_ERR(hws)) { 156 i915_vm_put(vm); 157 return ERR_CAST(hws); 158 } 159 160 err = i915_vma_pin(vma, 0, 0, PIN_USER); 161 if (err) { 162 i915_vm_put(vm); 163 return ERR_PTR(err); 164 } 165 166 err = i915_vma_pin(hws, 0, 0, PIN_USER); 167 if (err) 168 goto unpin_vma; 169 170 rq = igt_request_alloc(h->ctx, engine); 171 if (IS_ERR(rq)) { 172 err = PTR_ERR(rq); 173 goto unpin_hws; 174 } 175 176 err = move_to_active(vma, rq, 0); 177 if (err) 178 goto cancel_rq; 179 180 err = move_to_active(hws, rq, 0); 181 if (err) 182 goto cancel_rq; 183 184 batch = h->batch; 185 if (GRAPHICS_VER(gt->i915) >= 8) { 186 *batch++ = MI_STORE_DWORD_IMM_GEN4; 187 *batch++ = lower_32_bits(hws_address(hws, rq)); 188 *batch++ = upper_32_bits(hws_address(hws, rq)); 189 *batch++ = rq->fence.seqno; 190 *batch++ = MI_NOOP; 191 192 memset(batch, 0, 1024); 193 batch += 1024 / sizeof(*batch); 194 195 *batch++ = MI_NOOP; 196 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 197 *batch++ = lower_32_bits(vma->node.start); 198 *batch++ = upper_32_bits(vma->node.start); 199 } else if (GRAPHICS_VER(gt->i915) >= 6) { 200 *batch++ = MI_STORE_DWORD_IMM_GEN4; 201 *batch++ = 0; 202 *batch++ = lower_32_bits(hws_address(hws, rq)); 203 *batch++ = rq->fence.seqno; 204 *batch++ = MI_NOOP; 205 206 memset(batch, 0, 1024); 207 batch += 1024 / sizeof(*batch); 208 209 *batch++ = MI_NOOP; 210 *batch++ = MI_BATCH_BUFFER_START | 1 << 8; 211 *batch++ = lower_32_bits(vma->node.start); 212 } else if (GRAPHICS_VER(gt->i915) >= 4) { 213 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 214 *batch++ = 0; 215 *batch++ = lower_32_bits(hws_address(hws, rq)); 216 *batch++ = rq->fence.seqno; 217 *batch++ = MI_NOOP; 218 219 memset(batch, 0, 1024); 220 batch += 1024 / sizeof(*batch); 221 222 *batch++ = MI_NOOP; 223 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 224 *batch++ = lower_32_bits(vma->node.start); 225 } else { 226 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; 227 *batch++ = lower_32_bits(hws_address(hws, rq)); 228 *batch++ = rq->fence.seqno; 229 *batch++ = MI_NOOP; 230 231 memset(batch, 0, 1024); 232 batch += 1024 / sizeof(*batch); 233 234 *batch++ = MI_NOOP; 235 *batch++ = MI_BATCH_BUFFER_START | 2 << 6; 236 *batch++ = lower_32_bits(vma->node.start); 237 } 238 *batch++ = MI_BATCH_BUFFER_END; /* not reached */ 239 intel_gt_chipset_flush(engine->gt); 240 241 if (rq->engine->emit_init_breadcrumb) { 242 err = rq->engine->emit_init_breadcrumb(rq); 243 if (err) 244 goto cancel_rq; 245 } 246 247 flags = 0; 248 if (GRAPHICS_VER(gt->i915) <= 5) 249 flags |= I915_DISPATCH_SECURE; 250 251 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); 252 253 cancel_rq: 254 if (err) { 255 i915_request_set_error_once(rq, err); 256 i915_request_add(rq); 257 } 258 unpin_hws: 259 i915_vma_unpin(hws); 260 unpin_vma: 261 i915_vma_unpin(vma); 262 i915_vm_put(vm); 263 return err ? ERR_PTR(err) : rq; 264 } 265 266 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) 267 { 268 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); 269 } 270 271 static void hang_fini(struct hang *h) 272 { 273 *h->batch = MI_BATCH_BUFFER_END; 274 intel_gt_chipset_flush(h->gt); 275 276 i915_gem_object_unpin_map(h->obj); 277 i915_gem_object_put(h->obj); 278 279 i915_gem_object_unpin_map(h->hws); 280 i915_gem_object_put(h->hws); 281 282 kernel_context_close(h->ctx); 283 284 igt_flush_test(h->gt->i915); 285 } 286 287 static bool wait_until_running(struct hang *h, struct i915_request *rq) 288 { 289 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), 290 rq->fence.seqno), 291 10) && 292 wait_for(i915_seqno_passed(hws_seqno(h, rq), 293 rq->fence.seqno), 294 1000)); 295 } 296 297 static int igt_hang_sanitycheck(void *arg) 298 { 299 struct intel_gt *gt = arg; 300 struct i915_request *rq; 301 struct intel_engine_cs *engine; 302 enum intel_engine_id id; 303 struct hang h; 304 int err; 305 306 /* Basic check that we can execute our hanging batch */ 307 308 err = hang_init(&h, gt); 309 if (err) 310 return err; 311 312 for_each_engine(engine, gt, id) { 313 struct intel_wedge_me w; 314 long timeout; 315 316 if (!intel_engine_can_store_dword(engine)) 317 continue; 318 319 rq = hang_create_request(&h, engine); 320 if (IS_ERR(rq)) { 321 err = PTR_ERR(rq); 322 pr_err("Failed to create request for %s, err=%d\n", 323 engine->name, err); 324 goto fini; 325 } 326 327 i915_request_get(rq); 328 329 *h.batch = MI_BATCH_BUFFER_END; 330 intel_gt_chipset_flush(engine->gt); 331 332 i915_request_add(rq); 333 334 timeout = 0; 335 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 336 timeout = i915_request_wait(rq, 0, 337 MAX_SCHEDULE_TIMEOUT); 338 if (intel_gt_is_wedged(gt)) 339 timeout = -EIO; 340 341 i915_request_put(rq); 342 343 if (timeout < 0) { 344 err = timeout; 345 pr_err("Wait for request failed on %s, err=%d\n", 346 engine->name, err); 347 goto fini; 348 } 349 } 350 351 fini: 352 hang_fini(&h); 353 return err; 354 } 355 356 static bool wait_for_idle(struct intel_engine_cs *engine) 357 { 358 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; 359 } 360 361 static int igt_reset_nop(void *arg) 362 { 363 struct intel_gt *gt = arg; 364 struct i915_gpu_error *global = >->i915->gpu_error; 365 struct intel_engine_cs *engine; 366 unsigned int reset_count, count; 367 enum intel_engine_id id; 368 IGT_TIMEOUT(end_time); 369 int err = 0; 370 371 /* Check that we can reset during non-user portions of requests */ 372 373 reset_count = i915_reset_count(global); 374 count = 0; 375 do { 376 for_each_engine(engine, gt, id) { 377 struct intel_context *ce; 378 int i; 379 380 ce = intel_context_create(engine); 381 if (IS_ERR(ce)) { 382 err = PTR_ERR(ce); 383 pr_err("[%s] Create context failed: %d!\n", engine->name, err); 384 break; 385 } 386 387 for (i = 0; i < 16; i++) { 388 struct i915_request *rq; 389 390 rq = intel_context_create_request(ce); 391 if (IS_ERR(rq)) { 392 err = PTR_ERR(rq); 393 pr_err("[%s] Create request failed: %d!\n", 394 engine->name, err); 395 break; 396 } 397 398 i915_request_add(rq); 399 } 400 401 intel_context_put(ce); 402 } 403 404 igt_global_reset_lock(gt); 405 intel_gt_reset(gt, ALL_ENGINES, NULL); 406 igt_global_reset_unlock(gt); 407 408 if (intel_gt_is_wedged(gt)) { 409 pr_err("[%s] GT is wedged!\n", engine->name); 410 err = -EIO; 411 break; 412 } 413 414 if (i915_reset_count(global) != reset_count + ++count) { 415 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n", 416 engine->name, i915_reset_count(global), reset_count, count); 417 err = -EINVAL; 418 break; 419 } 420 421 err = igt_flush_test(gt->i915); 422 if (err) { 423 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 424 break; 425 } 426 } while (time_before(jiffies, end_time)); 427 pr_info("%s: %d resets\n", __func__, count); 428 429 if (igt_flush_test(gt->i915)) { 430 pr_err("Post flush failed: %d!\n", err); 431 err = -EIO; 432 } 433 434 return err; 435 } 436 437 static int igt_reset_nop_engine(void *arg) 438 { 439 struct intel_gt *gt = arg; 440 struct i915_gpu_error *global = >->i915->gpu_error; 441 struct intel_engine_cs *engine; 442 enum intel_engine_id id; 443 444 /* Check that we can engine-reset during non-user portions */ 445 446 if (!intel_has_reset_engine(gt)) 447 return 0; 448 449 for_each_engine(engine, gt, id) { 450 unsigned int reset_count, reset_engine_count, count; 451 struct intel_context *ce; 452 IGT_TIMEOUT(end_time); 453 int err; 454 455 if (intel_engine_uses_guc(engine)) { 456 /* Engine level resets are triggered by GuC when a hang 457 * is detected. They can't be triggered by the KMD any 458 * more. Thus a nop batch cannot be used as a reset test 459 */ 460 continue; 461 } 462 463 ce = intel_context_create(engine); 464 if (IS_ERR(ce)) { 465 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 466 return PTR_ERR(ce); 467 } 468 469 reset_count = i915_reset_count(global); 470 reset_engine_count = i915_reset_engine_count(global, engine); 471 count = 0; 472 473 st_engine_heartbeat_disable(engine); 474 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 475 >->reset.flags)); 476 do { 477 int i; 478 479 if (!wait_for_idle(engine)) { 480 pr_err("%s failed to idle before reset\n", 481 engine->name); 482 err = -EIO; 483 break; 484 } 485 486 for (i = 0; i < 16; i++) { 487 struct i915_request *rq; 488 489 rq = intel_context_create_request(ce); 490 if (IS_ERR(rq)) { 491 struct drm_printer p = 492 drm_info_printer(gt->i915->drm.dev); 493 intel_engine_dump(engine, &p, 494 "%s(%s): failed to submit request\n", 495 __func__, 496 engine->name); 497 498 GEM_TRACE("%s(%s): failed to submit request\n", 499 __func__, 500 engine->name); 501 GEM_TRACE_DUMP(); 502 503 intel_gt_set_wedged(gt); 504 505 err = PTR_ERR(rq); 506 break; 507 } 508 509 i915_request_add(rq); 510 } 511 err = intel_engine_reset(engine, NULL); 512 if (err) { 513 pr_err("intel_engine_reset(%s) failed, err:%d\n", 514 engine->name, err); 515 break; 516 } 517 518 if (i915_reset_count(global) != reset_count) { 519 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 520 err = -EINVAL; 521 break; 522 } 523 524 if (i915_reset_engine_count(global, engine) != 525 reset_engine_count + ++count) { 526 pr_err("%s engine reset not recorded!\n", 527 engine->name); 528 err = -EINVAL; 529 break; 530 } 531 } while (time_before(jiffies, end_time)); 532 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 533 st_engine_heartbeat_enable(engine); 534 535 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 536 537 intel_context_put(ce); 538 if (igt_flush_test(gt->i915)) 539 err = -EIO; 540 if (err) 541 return err; 542 } 543 544 return 0; 545 } 546 547 static void force_reset_timeout(struct intel_engine_cs *engine) 548 { 549 engine->reset_timeout.probability = 999; 550 atomic_set(&engine->reset_timeout.times, -1); 551 } 552 553 static void cancel_reset_timeout(struct intel_engine_cs *engine) 554 { 555 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout)); 556 } 557 558 static int igt_reset_fail_engine(void *arg) 559 { 560 struct intel_gt *gt = arg; 561 struct intel_engine_cs *engine; 562 enum intel_engine_id id; 563 564 /* Check that we can recover from engine-reset failues */ 565 566 if (!intel_has_reset_engine(gt)) 567 return 0; 568 569 for_each_engine(engine, gt, id) { 570 unsigned int count; 571 struct intel_context *ce; 572 IGT_TIMEOUT(end_time); 573 int err; 574 575 /* Can't manually break the reset if i915 doesn't perform it */ 576 if (intel_engine_uses_guc(engine)) 577 continue; 578 579 ce = intel_context_create(engine); 580 if (IS_ERR(ce)) { 581 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce); 582 return PTR_ERR(ce); 583 } 584 585 st_engine_heartbeat_disable(engine); 586 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 587 >->reset.flags)); 588 589 force_reset_timeout(engine); 590 err = intel_engine_reset(engine, NULL); 591 cancel_reset_timeout(engine); 592 if (err == 0) /* timeouts only generated on gen8+ */ 593 goto skip; 594 595 count = 0; 596 do { 597 struct i915_request *last = NULL; 598 int i; 599 600 if (!wait_for_idle(engine)) { 601 pr_err("%s failed to idle before reset\n", 602 engine->name); 603 err = -EIO; 604 break; 605 } 606 607 for (i = 0; i < count % 15; i++) { 608 struct i915_request *rq; 609 610 rq = intel_context_create_request(ce); 611 if (IS_ERR(rq)) { 612 struct drm_printer p = 613 drm_info_printer(gt->i915->drm.dev); 614 intel_engine_dump(engine, &p, 615 "%s(%s): failed to submit request\n", 616 __func__, 617 engine->name); 618 619 GEM_TRACE("%s(%s): failed to submit request\n", 620 __func__, 621 engine->name); 622 GEM_TRACE_DUMP(); 623 624 intel_gt_set_wedged(gt); 625 if (last) 626 i915_request_put(last); 627 628 err = PTR_ERR(rq); 629 goto out; 630 } 631 632 if (last) 633 i915_request_put(last); 634 last = i915_request_get(rq); 635 i915_request_add(rq); 636 } 637 638 if (count & 1) { 639 err = intel_engine_reset(engine, NULL); 640 if (err) { 641 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n", 642 engine->name, err); 643 GEM_TRACE_DUMP(); 644 i915_request_put(last); 645 break; 646 } 647 } else { 648 force_reset_timeout(engine); 649 err = intel_engine_reset(engine, NULL); 650 cancel_reset_timeout(engine); 651 if (err != -ETIMEDOUT) { 652 pr_err("intel_engine_reset(%s) did not fail, err:%d\n", 653 engine->name, err); 654 i915_request_put(last); 655 break; 656 } 657 } 658 659 err = 0; 660 if (last) { 661 if (i915_request_wait(last, 0, HZ / 2) < 0) { 662 struct drm_printer p = 663 drm_info_printer(gt->i915->drm.dev); 664 665 intel_engine_dump(engine, &p, 666 "%s(%s): failed to complete request\n", 667 __func__, 668 engine->name); 669 670 GEM_TRACE("%s(%s): failed to complete request\n", 671 __func__, 672 engine->name); 673 GEM_TRACE_DUMP(); 674 675 err = -EIO; 676 } 677 i915_request_put(last); 678 } 679 count++; 680 } while (err == 0 && time_before(jiffies, end_time)); 681 out: 682 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 683 skip: 684 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 685 st_engine_heartbeat_enable(engine); 686 intel_context_put(ce); 687 688 if (igt_flush_test(gt->i915)) 689 err = -EIO; 690 if (err) 691 return err; 692 } 693 694 return 0; 695 } 696 697 static int __igt_reset_engine(struct intel_gt *gt, bool active) 698 { 699 struct i915_gpu_error *global = >->i915->gpu_error; 700 struct intel_engine_cs *engine; 701 enum intel_engine_id id; 702 struct hang h; 703 int err = 0; 704 705 /* Check that we can issue an engine reset on an idle engine (no-op) */ 706 707 if (!intel_has_reset_engine(gt)) 708 return 0; 709 710 if (active) { 711 err = hang_init(&h, gt); 712 if (err) 713 return err; 714 } 715 716 for_each_engine(engine, gt, id) { 717 unsigned int reset_count, reset_engine_count; 718 unsigned long count; 719 bool using_guc = intel_engine_uses_guc(engine); 720 IGT_TIMEOUT(end_time); 721 722 if (using_guc && !active) 723 continue; 724 725 if (active && !intel_engine_can_store_dword(engine)) 726 continue; 727 728 if (!wait_for_idle(engine)) { 729 pr_err("%s failed to idle before reset\n", 730 engine->name); 731 err = -EIO; 732 break; 733 } 734 735 reset_count = i915_reset_count(global); 736 reset_engine_count = i915_reset_engine_count(global, engine); 737 738 st_engine_heartbeat_disable(engine); 739 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 740 >->reset.flags)); 741 count = 0; 742 do { 743 struct i915_request *rq = NULL; 744 struct intel_selftest_saved_policy saved; 745 int err2; 746 747 err = intel_selftest_modify_policy(engine, &saved, 748 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 749 if (err) { 750 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 751 break; 752 } 753 754 if (active) { 755 rq = hang_create_request(&h, engine); 756 if (IS_ERR(rq)) { 757 err = PTR_ERR(rq); 758 pr_err("[%s] Create hang request failed: %d!\n", 759 engine->name, err); 760 goto restore; 761 } 762 763 i915_request_get(rq); 764 i915_request_add(rq); 765 766 if (!wait_until_running(&h, rq)) { 767 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 768 769 pr_err("%s: Failed to start request %llx, at %x\n", 770 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 771 intel_engine_dump(engine, &p, 772 "%s\n", engine->name); 773 774 i915_request_put(rq); 775 err = -EIO; 776 goto restore; 777 } 778 } 779 780 if (!using_guc) { 781 err = intel_engine_reset(engine, NULL); 782 if (err) { 783 pr_err("intel_engine_reset(%s) failed, err:%d\n", 784 engine->name, err); 785 goto skip; 786 } 787 } 788 789 if (rq) { 790 /* Ensure the reset happens and kills the engine */ 791 err = intel_selftest_wait_for_rq(rq); 792 if (err) 793 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 794 engine->name, rq->fence.context, 795 rq->fence.seqno, rq->context->guc_id.id, err); 796 } 797 798 skip: 799 if (rq) 800 i915_request_put(rq); 801 802 if (i915_reset_count(global) != reset_count) { 803 pr_err("Full GPU reset recorded! (engine reset expected)\n"); 804 err = -EINVAL; 805 goto restore; 806 } 807 808 /* GuC based resets are not logged per engine */ 809 if (!using_guc) { 810 if (i915_reset_engine_count(global, engine) != 811 ++reset_engine_count) { 812 pr_err("%s engine reset not recorded!\n", 813 engine->name); 814 err = -EINVAL; 815 goto restore; 816 } 817 } 818 819 count++; 820 821 restore: 822 err2 = intel_selftest_restore_policy(engine, &saved); 823 if (err2) 824 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err); 825 if (err == 0) 826 err = err2; 827 if (err) 828 break; 829 } while (time_before(jiffies, end_time)); 830 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 831 st_engine_heartbeat_enable(engine); 832 pr_info("%s: Completed %lu %s resets\n", 833 engine->name, count, active ? "active" : "idle"); 834 835 if (err) 836 break; 837 838 err = igt_flush_test(gt->i915); 839 if (err) { 840 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 841 break; 842 } 843 } 844 845 if (intel_gt_is_wedged(gt)) { 846 pr_err("GT is wedged!\n"); 847 err = -EIO; 848 } 849 850 if (active) 851 hang_fini(&h); 852 853 return err; 854 } 855 856 static int igt_reset_idle_engine(void *arg) 857 { 858 return __igt_reset_engine(arg, false); 859 } 860 861 static int igt_reset_active_engine(void *arg) 862 { 863 return __igt_reset_engine(arg, true); 864 } 865 866 struct active_engine { 867 struct task_struct *task; 868 struct intel_engine_cs *engine; 869 unsigned long resets; 870 unsigned int flags; 871 }; 872 873 #define TEST_ACTIVE BIT(0) 874 #define TEST_OTHERS BIT(1) 875 #define TEST_SELF BIT(2) 876 #define TEST_PRIORITY BIT(3) 877 878 static int active_request_put(struct i915_request *rq) 879 { 880 int err = 0; 881 882 if (!rq) 883 return 0; 884 885 if (i915_request_wait(rq, 0, 10 * HZ) < 0) { 886 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", 887 rq->engine->name, 888 rq->fence.context, 889 rq->fence.seqno); 890 GEM_TRACE_DUMP(); 891 892 intel_gt_set_wedged(rq->engine->gt); 893 err = -EIO; 894 } 895 896 i915_request_put(rq); 897 898 return err; 899 } 900 901 static int active_engine(void *data) 902 { 903 I915_RND_STATE(prng); 904 struct active_engine *arg = data; 905 struct intel_engine_cs *engine = arg->engine; 906 struct i915_request *rq[8] = {}; 907 struct intel_context *ce[ARRAY_SIZE(rq)]; 908 unsigned long count; 909 int err = 0; 910 911 for (count = 0; count < ARRAY_SIZE(ce); count++) { 912 ce[count] = intel_context_create(engine); 913 if (IS_ERR(ce[count])) { 914 err = PTR_ERR(ce[count]); 915 pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err); 916 while (--count) 917 intel_context_put(ce[count]); 918 return err; 919 } 920 } 921 922 count = 0; 923 while (!kthread_should_stop()) { 924 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); 925 struct i915_request *old = rq[idx]; 926 struct i915_request *new; 927 928 new = intel_context_create_request(ce[idx]); 929 if (IS_ERR(new)) { 930 err = PTR_ERR(new); 931 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err); 932 break; 933 } 934 935 rq[idx] = i915_request_get(new); 936 i915_request_add(new); 937 938 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) { 939 struct i915_sched_attr attr = { 940 .priority = 941 i915_prandom_u32_max_state(512, &prng), 942 }; 943 engine->sched_engine->schedule(rq[idx], &attr); 944 } 945 946 err = active_request_put(old); 947 if (err) { 948 pr_err("[%s] Request put failed: %d!\n", engine->name, err); 949 break; 950 } 951 952 cond_resched(); 953 } 954 955 for (count = 0; count < ARRAY_SIZE(rq); count++) { 956 int err__ = active_request_put(rq[count]); 957 958 if (err) 959 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err); 960 961 /* Keep the first error */ 962 if (!err) 963 err = err__; 964 965 intel_context_put(ce[count]); 966 } 967 968 return err; 969 } 970 971 static int __igt_reset_engines(struct intel_gt *gt, 972 const char *test_name, 973 unsigned int flags) 974 { 975 struct i915_gpu_error *global = >->i915->gpu_error; 976 struct intel_engine_cs *engine, *other; 977 enum intel_engine_id id, tmp; 978 struct hang h; 979 int err = 0; 980 981 /* Check that issuing a reset on one engine does not interfere 982 * with any other engine. 983 */ 984 985 if (!intel_has_reset_engine(gt)) 986 return 0; 987 988 if (flags & TEST_ACTIVE) { 989 err = hang_init(&h, gt); 990 if (err) 991 return err; 992 993 if (flags & TEST_PRIORITY) 994 h.ctx->sched.priority = 1024; 995 } 996 997 for_each_engine(engine, gt, id) { 998 struct active_engine threads[I915_NUM_ENGINES] = {}; 999 unsigned long device = i915_reset_count(global); 1000 unsigned long count = 0, reported; 1001 bool using_guc = intel_engine_uses_guc(engine); 1002 IGT_TIMEOUT(end_time); 1003 1004 if (flags & TEST_ACTIVE) { 1005 if (!intel_engine_can_store_dword(engine)) 1006 continue; 1007 } else if (using_guc) 1008 continue; 1009 1010 if (!wait_for_idle(engine)) { 1011 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", 1012 engine->name, test_name); 1013 err = -EIO; 1014 break; 1015 } 1016 1017 memset(threads, 0, sizeof(threads)); 1018 for_each_engine(other, gt, tmp) { 1019 struct task_struct *tsk; 1020 1021 threads[tmp].resets = 1022 i915_reset_engine_count(global, other); 1023 1024 if (other == engine && !(flags & TEST_SELF)) 1025 continue; 1026 1027 if (other != engine && !(flags & TEST_OTHERS)) 1028 continue; 1029 1030 threads[tmp].engine = other; 1031 threads[tmp].flags = flags; 1032 1033 tsk = kthread_run(active_engine, &threads[tmp], 1034 "igt/%s", other->name); 1035 if (IS_ERR(tsk)) { 1036 err = PTR_ERR(tsk); 1037 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1038 goto unwind; 1039 } 1040 1041 threads[tmp].task = tsk; 1042 get_task_struct(tsk); 1043 } 1044 1045 yield(); /* start all threads before we begin */ 1046 1047 st_engine_heartbeat_disable_no_pm(engine); 1048 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 1049 >->reset.flags)); 1050 do { 1051 struct i915_request *rq = NULL; 1052 struct intel_selftest_saved_policy saved; 1053 int err2; 1054 1055 err = intel_selftest_modify_policy(engine, &saved, 1056 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 1057 if (err) { 1058 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1059 break; 1060 } 1061 1062 if (flags & TEST_ACTIVE) { 1063 rq = hang_create_request(&h, engine); 1064 if (IS_ERR(rq)) { 1065 err = PTR_ERR(rq); 1066 pr_err("[%s] Create hang request failed: %d!\n", 1067 engine->name, err); 1068 goto restore; 1069 } 1070 1071 i915_request_get(rq); 1072 i915_request_add(rq); 1073 1074 if (!wait_until_running(&h, rq)) { 1075 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1076 1077 pr_err("%s: Failed to start request %llx, at %x\n", 1078 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1079 intel_engine_dump(engine, &p, 1080 "%s\n", engine->name); 1081 1082 i915_request_put(rq); 1083 err = -EIO; 1084 goto restore; 1085 } 1086 } else { 1087 intel_engine_pm_get(engine); 1088 } 1089 1090 if (!using_guc) { 1091 err = intel_engine_reset(engine, NULL); 1092 if (err) { 1093 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", 1094 engine->name, test_name, err); 1095 goto restore; 1096 } 1097 } 1098 1099 if (rq) { 1100 /* Ensure the reset happens and kills the engine */ 1101 err = intel_selftest_wait_for_rq(rq); 1102 if (err) 1103 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 1104 engine->name, rq->fence.context, 1105 rq->fence.seqno, rq->context->guc_id.id, err); 1106 } 1107 1108 count++; 1109 1110 if (rq) { 1111 if (rq->fence.error != -EIO) { 1112 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n", 1113 engine->name, test_name, 1114 rq->fence.context, 1115 rq->fence.seqno, rq->context->guc_id.id); 1116 i915_request_put(rq); 1117 1118 GEM_TRACE_DUMP(); 1119 intel_gt_set_wedged(gt); 1120 err = -EIO; 1121 goto restore; 1122 } 1123 1124 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 1125 struct drm_printer p = 1126 drm_info_printer(gt->i915->drm.dev); 1127 1128 pr_err("i915_reset_engine(%s:%s):" 1129 " failed to complete request %llx:%lld after reset\n", 1130 engine->name, test_name, 1131 rq->fence.context, 1132 rq->fence.seqno); 1133 intel_engine_dump(engine, &p, 1134 "%s\n", engine->name); 1135 i915_request_put(rq); 1136 1137 GEM_TRACE_DUMP(); 1138 intel_gt_set_wedged(gt); 1139 err = -EIO; 1140 goto restore; 1141 } 1142 1143 i915_request_put(rq); 1144 } 1145 1146 if (!(flags & TEST_ACTIVE)) 1147 intel_engine_pm_put(engine); 1148 1149 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { 1150 struct drm_printer p = 1151 drm_info_printer(gt->i915->drm.dev); 1152 1153 pr_err("i915_reset_engine(%s:%s):" 1154 " failed to idle after reset\n", 1155 engine->name, test_name); 1156 intel_engine_dump(engine, &p, 1157 "%s\n", engine->name); 1158 1159 err = -EIO; 1160 goto restore; 1161 } 1162 1163 restore: 1164 err2 = intel_selftest_restore_policy(engine, &saved); 1165 if (err2) 1166 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2); 1167 if (err == 0) 1168 err = err2; 1169 if (err) 1170 break; 1171 } while (time_before(jiffies, end_time)); 1172 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 1173 st_engine_heartbeat_enable_no_pm(engine); 1174 1175 pr_info("i915_reset_engine(%s:%s): %lu resets\n", 1176 engine->name, test_name, count); 1177 1178 /* GuC based resets are not logged per engine */ 1179 if (!using_guc) { 1180 reported = i915_reset_engine_count(global, engine); 1181 reported -= threads[engine->id].resets; 1182 if (reported != count) { 1183 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", 1184 engine->name, test_name, count, reported); 1185 if (!err) 1186 err = -EINVAL; 1187 } 1188 } 1189 1190 unwind: 1191 for_each_engine(other, gt, tmp) { 1192 int ret; 1193 1194 if (!threads[tmp].task) 1195 continue; 1196 1197 ret = kthread_stop(threads[tmp].task); 1198 if (ret) { 1199 pr_err("kthread for other engine %s failed, err=%d\n", 1200 other->name, ret); 1201 if (!err) 1202 err = ret; 1203 } 1204 put_task_struct(threads[tmp].task); 1205 1206 /* GuC based resets are not logged per engine */ 1207 if (!using_guc) { 1208 if (other->uabi_class != engine->uabi_class && 1209 threads[tmp].resets != 1210 i915_reset_engine_count(global, other)) { 1211 pr_err("Innocent engine %s was reset (count=%ld)\n", 1212 other->name, 1213 i915_reset_engine_count(global, other) - 1214 threads[tmp].resets); 1215 if (!err) 1216 err = -EINVAL; 1217 } 1218 } 1219 } 1220 1221 if (device != i915_reset_count(global)) { 1222 pr_err("Global reset (count=%ld)!\n", 1223 i915_reset_count(global) - device); 1224 if (!err) 1225 err = -EINVAL; 1226 } 1227 1228 if (err) 1229 break; 1230 1231 err = igt_flush_test(gt->i915); 1232 if (err) { 1233 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1234 break; 1235 } 1236 } 1237 1238 if (intel_gt_is_wedged(gt)) 1239 err = -EIO; 1240 1241 if (flags & TEST_ACTIVE) 1242 hang_fini(&h); 1243 1244 return err; 1245 } 1246 1247 static int igt_reset_engines(void *arg) 1248 { 1249 static const struct { 1250 const char *name; 1251 unsigned int flags; 1252 } phases[] = { 1253 { "idle", 0 }, 1254 { "active", TEST_ACTIVE }, 1255 { "others-idle", TEST_OTHERS }, 1256 { "others-active", TEST_OTHERS | TEST_ACTIVE }, 1257 { 1258 "others-priority", 1259 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY 1260 }, 1261 { 1262 "self-priority", 1263 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, 1264 }, 1265 { } 1266 }; 1267 struct intel_gt *gt = arg; 1268 typeof(*phases) *p; 1269 int err; 1270 1271 for (p = phases; p->name; p++) { 1272 if (p->flags & TEST_PRIORITY) { 1273 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) 1274 continue; 1275 } 1276 1277 err = __igt_reset_engines(arg, p->name, p->flags); 1278 if (err) 1279 return err; 1280 } 1281 1282 return 0; 1283 } 1284 1285 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask) 1286 { 1287 u32 count = i915_reset_count(>->i915->gpu_error); 1288 1289 intel_gt_reset(gt, mask, NULL); 1290 1291 return count; 1292 } 1293 1294 static int igt_reset_wait(void *arg) 1295 { 1296 struct intel_gt *gt = arg; 1297 struct i915_gpu_error *global = >->i915->gpu_error; 1298 struct intel_engine_cs *engine = gt->engine[RCS0]; 1299 struct i915_request *rq; 1300 unsigned int reset_count; 1301 struct hang h; 1302 long timeout; 1303 int err; 1304 1305 if (!engine || !intel_engine_can_store_dword(engine)) 1306 return 0; 1307 1308 /* Check that we detect a stuck waiter and issue a reset */ 1309 1310 igt_global_reset_lock(gt); 1311 1312 err = hang_init(&h, gt); 1313 if (err) { 1314 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1315 goto unlock; 1316 } 1317 1318 rq = hang_create_request(&h, engine); 1319 if (IS_ERR(rq)) { 1320 err = PTR_ERR(rq); 1321 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1322 goto fini; 1323 } 1324 1325 i915_request_get(rq); 1326 i915_request_add(rq); 1327 1328 if (!wait_until_running(&h, rq)) { 1329 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1330 1331 pr_err("%s: Failed to start request %llx, at %x\n", 1332 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1333 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1334 1335 intel_gt_set_wedged(gt); 1336 1337 err = -EIO; 1338 goto out_rq; 1339 } 1340 1341 reset_count = fake_hangcheck(gt, ALL_ENGINES); 1342 1343 timeout = i915_request_wait(rq, 0, 10); 1344 if (timeout < 0) { 1345 pr_err("i915_request_wait failed on a stuck request: err=%ld\n", 1346 timeout); 1347 err = timeout; 1348 goto out_rq; 1349 } 1350 1351 if (i915_reset_count(global) == reset_count) { 1352 pr_err("No GPU reset recorded!\n"); 1353 err = -EINVAL; 1354 goto out_rq; 1355 } 1356 1357 out_rq: 1358 i915_request_put(rq); 1359 fini: 1360 hang_fini(&h); 1361 unlock: 1362 igt_global_reset_unlock(gt); 1363 1364 if (intel_gt_is_wedged(gt)) 1365 return -EIO; 1366 1367 return err; 1368 } 1369 1370 struct evict_vma { 1371 struct completion completion; 1372 struct i915_vma *vma; 1373 }; 1374 1375 static int evict_vma(void *data) 1376 { 1377 struct evict_vma *arg = data; 1378 struct i915_address_space *vm = arg->vma->vm; 1379 struct drm_mm_node evict = arg->vma->node; 1380 int err; 1381 1382 complete(&arg->completion); 1383 1384 mutex_lock(&vm->mutex); 1385 err = i915_gem_evict_for_node(vm, &evict, 0); 1386 mutex_unlock(&vm->mutex); 1387 1388 return err; 1389 } 1390 1391 static int evict_fence(void *data) 1392 { 1393 struct evict_vma *arg = data; 1394 int err; 1395 1396 complete(&arg->completion); 1397 1398 /* Mark the fence register as dirty to force the mmio update. */ 1399 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); 1400 if (err) { 1401 pr_err("Invalid Y-tiling settings; err:%d\n", err); 1402 return err; 1403 } 1404 1405 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE); 1406 if (err) { 1407 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err); 1408 return err; 1409 } 1410 1411 err = i915_vma_pin_fence(arg->vma); 1412 i915_vma_unpin(arg->vma); 1413 if (err) { 1414 pr_err("Unable to pin Y-tiled fence; err:%d\n", err); 1415 return err; 1416 } 1417 1418 i915_vma_unpin_fence(arg->vma); 1419 1420 return 0; 1421 } 1422 1423 static int __igt_reset_evict_vma(struct intel_gt *gt, 1424 struct i915_address_space *vm, 1425 int (*fn)(void *), 1426 unsigned int flags) 1427 { 1428 struct intel_engine_cs *engine = gt->engine[RCS0]; 1429 struct drm_i915_gem_object *obj; 1430 struct task_struct *tsk = NULL; 1431 struct i915_request *rq; 1432 struct evict_vma arg; 1433 struct hang h; 1434 unsigned int pin_flags; 1435 int err; 1436 1437 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE) 1438 return 0; 1439 1440 if (!engine || !intel_engine_can_store_dword(engine)) 1441 return 0; 1442 1443 /* Check that we can recover an unbind stuck on a hanging request */ 1444 1445 err = hang_init(&h, gt); 1446 if (err) { 1447 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1448 return err; 1449 } 1450 1451 obj = i915_gem_object_create_internal(gt->i915, SZ_1M); 1452 if (IS_ERR(obj)) { 1453 err = PTR_ERR(obj); 1454 pr_err("[%s] Create object failed: %d!\n", engine->name, err); 1455 goto fini; 1456 } 1457 1458 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1459 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); 1460 if (err) { 1461 pr_err("Invalid X-tiling settings; err:%d\n", err); 1462 goto out_obj; 1463 } 1464 } 1465 1466 arg.vma = i915_vma_instance(obj, vm, NULL); 1467 if (IS_ERR(arg.vma)) { 1468 err = PTR_ERR(arg.vma); 1469 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err); 1470 goto out_obj; 1471 } 1472 1473 rq = hang_create_request(&h, engine); 1474 if (IS_ERR(rq)) { 1475 err = PTR_ERR(rq); 1476 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1477 goto out_obj; 1478 } 1479 1480 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER; 1481 1482 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1483 pin_flags |= PIN_MAPPABLE; 1484 1485 err = i915_vma_pin(arg.vma, 0, 0, pin_flags); 1486 if (err) { 1487 i915_request_add(rq); 1488 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err); 1489 goto out_obj; 1490 } 1491 1492 if (flags & EXEC_OBJECT_NEEDS_FENCE) { 1493 err = i915_vma_pin_fence(arg.vma); 1494 if (err) { 1495 pr_err("Unable to pin X-tiled fence; err:%d\n", err); 1496 i915_vma_unpin(arg.vma); 1497 i915_request_add(rq); 1498 goto out_obj; 1499 } 1500 } 1501 1502 i915_vma_lock(arg.vma); 1503 err = i915_request_await_object(rq, arg.vma->obj, 1504 flags & EXEC_OBJECT_WRITE); 1505 if (err == 0) { 1506 err = i915_vma_move_to_active(arg.vma, rq, flags); 1507 if (err) 1508 pr_err("[%s] Move to active failed: %d!\n", engine->name, err); 1509 } else { 1510 pr_err("[%s] Request await failed: %d!\n", engine->name, err); 1511 } 1512 1513 i915_vma_unlock(arg.vma); 1514 1515 if (flags & EXEC_OBJECT_NEEDS_FENCE) 1516 i915_vma_unpin_fence(arg.vma); 1517 i915_vma_unpin(arg.vma); 1518 1519 i915_request_get(rq); 1520 i915_request_add(rq); 1521 if (err) 1522 goto out_rq; 1523 1524 if (!wait_until_running(&h, rq)) { 1525 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1526 1527 pr_err("%s: Failed to start request %llx, at %x\n", 1528 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1529 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1530 1531 intel_gt_set_wedged(gt); 1532 goto out_reset; 1533 } 1534 1535 init_completion(&arg.completion); 1536 1537 tsk = kthread_run(fn, &arg, "igt/evict_vma"); 1538 if (IS_ERR(tsk)) { 1539 err = PTR_ERR(tsk); 1540 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); 1541 tsk = NULL; 1542 goto out_reset; 1543 } 1544 get_task_struct(tsk); 1545 1546 wait_for_completion(&arg.completion); 1547 1548 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { 1549 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1550 1551 pr_err("igt/evict_vma kthread did not wait\n"); 1552 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1553 1554 intel_gt_set_wedged(gt); 1555 goto out_reset; 1556 } 1557 1558 out_reset: 1559 igt_global_reset_lock(gt); 1560 fake_hangcheck(gt, rq->engine->mask); 1561 igt_global_reset_unlock(gt); 1562 1563 if (tsk) { 1564 struct intel_wedge_me w; 1565 1566 /* The reset, even indirectly, should take less than 10ms. */ 1567 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */) 1568 err = kthread_stop(tsk); 1569 1570 put_task_struct(tsk); 1571 } 1572 1573 out_rq: 1574 i915_request_put(rq); 1575 out_obj: 1576 i915_gem_object_put(obj); 1577 fini: 1578 hang_fini(&h); 1579 if (intel_gt_is_wedged(gt)) 1580 return -EIO; 1581 1582 return err; 1583 } 1584 1585 static int igt_reset_evict_ggtt(void *arg) 1586 { 1587 struct intel_gt *gt = arg; 1588 1589 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1590 evict_vma, EXEC_OBJECT_WRITE); 1591 } 1592 1593 static int igt_reset_evict_ppgtt(void *arg) 1594 { 1595 struct intel_gt *gt = arg; 1596 struct i915_ppgtt *ppgtt; 1597 int err; 1598 1599 /* aliasing == global gtt locking, covered above */ 1600 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL) 1601 return 0; 1602 1603 ppgtt = i915_ppgtt_create(gt, 0); 1604 if (IS_ERR(ppgtt)) 1605 return PTR_ERR(ppgtt); 1606 1607 err = __igt_reset_evict_vma(gt, &ppgtt->vm, 1608 evict_vma, EXEC_OBJECT_WRITE); 1609 i915_vm_put(&ppgtt->vm); 1610 1611 return err; 1612 } 1613 1614 static int igt_reset_evict_fence(void *arg) 1615 { 1616 struct intel_gt *gt = arg; 1617 1618 return __igt_reset_evict_vma(gt, >->ggtt->vm, 1619 evict_fence, EXEC_OBJECT_NEEDS_FENCE); 1620 } 1621 1622 static int wait_for_others(struct intel_gt *gt, 1623 struct intel_engine_cs *exclude) 1624 { 1625 struct intel_engine_cs *engine; 1626 enum intel_engine_id id; 1627 1628 for_each_engine(engine, gt, id) { 1629 if (engine == exclude) 1630 continue; 1631 1632 if (!wait_for_idle(engine)) 1633 return -EIO; 1634 } 1635 1636 return 0; 1637 } 1638 1639 static int igt_reset_queue(void *arg) 1640 { 1641 struct intel_gt *gt = arg; 1642 struct i915_gpu_error *global = >->i915->gpu_error; 1643 struct intel_engine_cs *engine; 1644 enum intel_engine_id id; 1645 struct hang h; 1646 int err; 1647 1648 /* Check that we replay pending requests following a hang */ 1649 1650 igt_global_reset_lock(gt); 1651 1652 err = hang_init(&h, gt); 1653 if (err) 1654 goto unlock; 1655 1656 for_each_engine(engine, gt, id) { 1657 struct intel_selftest_saved_policy saved; 1658 struct i915_request *prev; 1659 IGT_TIMEOUT(end_time); 1660 unsigned int count; 1661 bool using_guc = intel_engine_uses_guc(engine); 1662 1663 if (!intel_engine_can_store_dword(engine)) 1664 continue; 1665 1666 if (using_guc) { 1667 err = intel_selftest_modify_policy(engine, &saved, 1668 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK); 1669 if (err) { 1670 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 1671 goto fini; 1672 } 1673 } 1674 1675 prev = hang_create_request(&h, engine); 1676 if (IS_ERR(prev)) { 1677 err = PTR_ERR(prev); 1678 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err); 1679 goto restore; 1680 } 1681 1682 i915_request_get(prev); 1683 i915_request_add(prev); 1684 1685 count = 0; 1686 do { 1687 struct i915_request *rq; 1688 unsigned int reset_count; 1689 1690 rq = hang_create_request(&h, engine); 1691 if (IS_ERR(rq)) { 1692 err = PTR_ERR(rq); 1693 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1694 goto restore; 1695 } 1696 1697 i915_request_get(rq); 1698 i915_request_add(rq); 1699 1700 /* 1701 * XXX We don't handle resetting the kernel context 1702 * very well. If we trigger a device reset twice in 1703 * quick succession while the kernel context is 1704 * executing, we may end up skipping the breadcrumb. 1705 * This is really only a problem for the selftest as 1706 * normally there is a large interlude between resets 1707 * (hangcheck), or we focus on resetting just one 1708 * engine and so avoid repeatedly resetting innocents. 1709 */ 1710 err = wait_for_others(gt, engine); 1711 if (err) { 1712 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 1713 __func__, engine->name); 1714 i915_request_put(rq); 1715 i915_request_put(prev); 1716 1717 GEM_TRACE_DUMP(); 1718 intel_gt_set_wedged(gt); 1719 goto restore; 1720 } 1721 1722 if (!wait_until_running(&h, prev)) { 1723 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1724 1725 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1726 __func__, engine->name, 1727 prev->fence.seqno, hws_seqno(&h, prev)); 1728 intel_engine_dump(engine, &p, 1729 "%s\n", engine->name); 1730 1731 i915_request_put(rq); 1732 i915_request_put(prev); 1733 1734 intel_gt_set_wedged(gt); 1735 1736 err = -EIO; 1737 goto restore; 1738 } 1739 1740 reset_count = fake_hangcheck(gt, BIT(id)); 1741 1742 if (prev->fence.error != -EIO) { 1743 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1744 prev->fence.error); 1745 i915_request_put(rq); 1746 i915_request_put(prev); 1747 err = -EINVAL; 1748 goto restore; 1749 } 1750 1751 if (rq->fence.error) { 1752 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1753 rq->fence.error); 1754 i915_request_put(rq); 1755 i915_request_put(prev); 1756 err = -EINVAL; 1757 goto restore; 1758 } 1759 1760 if (i915_reset_count(global) == reset_count) { 1761 pr_err("No GPU reset recorded!\n"); 1762 i915_request_put(rq); 1763 i915_request_put(prev); 1764 err = -EINVAL; 1765 goto restore; 1766 } 1767 1768 i915_request_put(prev); 1769 prev = rq; 1770 count++; 1771 } while (time_before(jiffies, end_time)); 1772 pr_info("%s: Completed %d queued resets\n", 1773 engine->name, count); 1774 1775 *h.batch = MI_BATCH_BUFFER_END; 1776 intel_gt_chipset_flush(engine->gt); 1777 1778 i915_request_put(prev); 1779 1780 restore: 1781 if (using_guc) { 1782 int err2 = intel_selftest_restore_policy(engine, &saved); 1783 1784 if (err2) 1785 pr_err("%s:%d> [%s] Restore policy failed: %d!\n", 1786 __func__, __LINE__, engine->name, err2); 1787 if (err == 0) 1788 err = err2; 1789 } 1790 if (err) 1791 goto fini; 1792 1793 err = igt_flush_test(gt->i915); 1794 if (err) { 1795 pr_err("[%s] Flush failed: %d!\n", engine->name, err); 1796 break; 1797 } 1798 } 1799 1800 fini: 1801 hang_fini(&h); 1802 unlock: 1803 igt_global_reset_unlock(gt); 1804 1805 if (intel_gt_is_wedged(gt)) 1806 return -EIO; 1807 1808 return err; 1809 } 1810 1811 static int igt_handle_error(void *arg) 1812 { 1813 struct intel_gt *gt = arg; 1814 struct i915_gpu_error *global = >->i915->gpu_error; 1815 struct intel_engine_cs *engine = gt->engine[RCS0]; 1816 struct hang h; 1817 struct i915_request *rq; 1818 struct i915_gpu_coredump *error; 1819 int err; 1820 1821 /* Check that we can issue a global GPU and engine reset */ 1822 1823 if (!intel_has_reset_engine(gt)) 1824 return 0; 1825 1826 if (!engine || !intel_engine_can_store_dword(engine)) 1827 return 0; 1828 1829 err = hang_init(&h, gt); 1830 if (err) { 1831 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1832 return err; 1833 } 1834 1835 rq = hang_create_request(&h, engine); 1836 if (IS_ERR(rq)) { 1837 err = PTR_ERR(rq); 1838 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1839 goto err_fini; 1840 } 1841 1842 i915_request_get(rq); 1843 i915_request_add(rq); 1844 1845 if (!wait_until_running(&h, rq)) { 1846 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 1847 1848 pr_err("%s: Failed to start request %llx, at %x\n", 1849 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 1850 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); 1851 1852 intel_gt_set_wedged(gt); 1853 1854 err = -EIO; 1855 goto err_request; 1856 } 1857 1858 /* Temporarily disable error capture */ 1859 error = xchg(&global->first_error, (void *)-1); 1860 1861 intel_gt_handle_error(gt, engine->mask, 0, NULL); 1862 1863 xchg(&global->first_error, error); 1864 1865 if (rq->fence.error != -EIO) { 1866 pr_err("Guilty request not identified!\n"); 1867 err = -EINVAL; 1868 goto err_request; 1869 } 1870 1871 err_request: 1872 i915_request_put(rq); 1873 err_fini: 1874 hang_fini(&h); 1875 return err; 1876 } 1877 1878 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, 1879 const struct igt_atomic_section *p, 1880 const char *mode) 1881 { 1882 struct tasklet_struct * const t = &engine->sched_engine->tasklet; 1883 int err; 1884 1885 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", 1886 engine->name, mode, p->name); 1887 1888 if (t->func) 1889 tasklet_disable(t); 1890 if (strcmp(p->name, "softirq")) 1891 local_bh_disable(); 1892 p->critical_section_begin(); 1893 1894 err = __intel_engine_reset_bh(engine, NULL); 1895 1896 p->critical_section_end(); 1897 if (strcmp(p->name, "softirq")) 1898 local_bh_enable(); 1899 if (t->func) { 1900 tasklet_enable(t); 1901 tasklet_hi_schedule(t); 1902 } 1903 1904 if (err) 1905 pr_err("i915_reset_engine(%s:%s) failed under %s\n", 1906 engine->name, mode, p->name); 1907 1908 return err; 1909 } 1910 1911 static int igt_atomic_reset_engine(struct intel_engine_cs *engine, 1912 const struct igt_atomic_section *p) 1913 { 1914 struct i915_request *rq; 1915 struct hang h; 1916 int err; 1917 1918 err = __igt_atomic_reset_engine(engine, p, "idle"); 1919 if (err) 1920 return err; 1921 1922 err = hang_init(&h, engine->gt); 1923 if (err) { 1924 pr_err("[%s] Hang init failed: %d!\n", engine->name, err); 1925 return err; 1926 } 1927 1928 rq = hang_create_request(&h, engine); 1929 if (IS_ERR(rq)) { 1930 err = PTR_ERR(rq); 1931 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err); 1932 goto out; 1933 } 1934 1935 i915_request_get(rq); 1936 i915_request_add(rq); 1937 1938 if (wait_until_running(&h, rq)) { 1939 err = __igt_atomic_reset_engine(engine, p, "active"); 1940 } else { 1941 pr_err("%s(%s): Failed to start request %llx, at %x\n", 1942 __func__, engine->name, 1943 rq->fence.seqno, hws_seqno(&h, rq)); 1944 intel_gt_set_wedged(engine->gt); 1945 err = -EIO; 1946 } 1947 1948 if (err == 0) { 1949 struct intel_wedge_me w; 1950 1951 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */) 1952 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); 1953 if (intel_gt_is_wedged(engine->gt)) 1954 err = -EIO; 1955 } 1956 1957 i915_request_put(rq); 1958 out: 1959 hang_fini(&h); 1960 return err; 1961 } 1962 1963 static int igt_reset_engines_atomic(void *arg) 1964 { 1965 struct intel_gt *gt = arg; 1966 const typeof(*igt_atomic_phases) *p; 1967 int err = 0; 1968 1969 /* Check that the engines resets are usable from atomic context */ 1970 1971 if (!intel_has_reset_engine(gt)) 1972 return 0; 1973 1974 if (intel_uc_uses_guc_submission(>->uc)) 1975 return 0; 1976 1977 igt_global_reset_lock(gt); 1978 1979 /* Flush any requests before we get started and check basics */ 1980 if (!igt_force_reset(gt)) 1981 goto unlock; 1982 1983 for (p = igt_atomic_phases; p->name; p++) { 1984 struct intel_engine_cs *engine; 1985 enum intel_engine_id id; 1986 1987 for_each_engine(engine, gt, id) { 1988 err = igt_atomic_reset_engine(engine, p); 1989 if (err) 1990 goto out; 1991 } 1992 } 1993 1994 out: 1995 /* As we poke around the guts, do a full reset before continuing. */ 1996 igt_force_reset(gt); 1997 unlock: 1998 igt_global_reset_unlock(gt); 1999 2000 return err; 2001 } 2002 2003 int intel_hangcheck_live_selftests(struct drm_i915_private *i915) 2004 { 2005 static const struct i915_subtest tests[] = { 2006 SUBTEST(igt_hang_sanitycheck), 2007 SUBTEST(igt_reset_nop), 2008 SUBTEST(igt_reset_nop_engine), 2009 SUBTEST(igt_reset_idle_engine), 2010 SUBTEST(igt_reset_active_engine), 2011 SUBTEST(igt_reset_fail_engine), 2012 SUBTEST(igt_reset_engines), 2013 SUBTEST(igt_reset_engines_atomic), 2014 SUBTEST(igt_reset_queue), 2015 SUBTEST(igt_reset_wait), 2016 SUBTEST(igt_reset_evict_ggtt), 2017 SUBTEST(igt_reset_evict_ppgtt), 2018 SUBTEST(igt_reset_evict_fence), 2019 SUBTEST(igt_handle_error), 2020 }; 2021 struct intel_gt *gt = to_gt(i915); 2022 intel_wakeref_t wakeref; 2023 int err; 2024 2025 if (!intel_has_gpu_reset(gt)) 2026 return 0; 2027 2028 if (intel_gt_is_wedged(gt)) 2029 return -EIO; /* we're long past hope of a successful reset */ 2030 2031 wakeref = intel_runtime_pm_get(gt->uncore->rpm); 2032 2033 err = intel_gt_live_subtests(tests, gt); 2034 2035 intel_runtime_pm_put(gt->uncore->rpm, wakeref); 2036 2037 return err; 2038 } 2039