// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 intel_gt_coherent_map_type(gt, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return i915_vma_offset(hws) +
	       offset_in_page(sizeof(u32) * rq->fence.context);
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, intel_gt_coherent_map_type(gt, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = igt_vma_move_to_active_unlocked(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = igt_vma_move_to_active_unlocked(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
		*batch++ = upper_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(i915_vma_offset(vma));
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, i915_vma_offset(vma), PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 5 /* 200ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
653 "%s(%s): failed to complete request\n", 654 __func__, 655 engine->name); 656 657 GEM_TRACE("%s(%s): failed to complete request\n", 658 __func__, 659 engine->name); 660 GEM_TRACE_DUMP(); 661 662 err = -EIO; 663 } 664 i915_request_put(last); 665 } 666 count++; 667 } while (err == 0 && time_before(jiffies, end_time)); 668 out: 669 pr_info("%s(%s): %d resets\n", __func__, engine->name, count); 670 skip: 671 clear_and_wake_up_bit(I915_RESET_ENGINE + id, >->reset.flags); 672 st_engine_heartbeat_enable(engine); 673 intel_context_put(ce); 674 675 if (igt_flush_test(gt->i915)) 676 err = -EIO; 677 if (err) 678 return err; 679 } 680 681 return 0; 682 } 683 684 static int __igt_reset_engine(struct intel_gt *gt, bool active) 685 { 686 struct i915_gpu_error *global = >->i915->gpu_error; 687 struct intel_engine_cs *engine; 688 enum intel_engine_id id; 689 struct hang h; 690 int err = 0; 691 692 /* Check that we can issue an engine reset on an idle engine (no-op) */ 693 694 if (!intel_has_reset_engine(gt)) 695 return 0; 696 697 if (active) { 698 err = hang_init(&h, gt); 699 if (err) 700 return err; 701 } 702 703 for_each_engine(engine, gt, id) { 704 unsigned int reset_count, reset_engine_count; 705 unsigned long count; 706 bool using_guc = intel_engine_uses_guc(engine); 707 IGT_TIMEOUT(end_time); 708 709 if (using_guc && !active) 710 continue; 711 712 if (active && !intel_engine_can_store_dword(engine)) 713 continue; 714 715 if (!wait_for_idle(engine)) { 716 pr_err("%s failed to idle before reset\n", 717 engine->name); 718 err = -EIO; 719 break; 720 } 721 722 reset_count = i915_reset_count(global); 723 reset_engine_count = i915_reset_engine_count(global, engine); 724 725 st_engine_heartbeat_disable(engine); 726 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, 727 >->reset.flags)); 728 count = 0; 729 do { 730 struct i915_request *rq = NULL; 731 struct intel_selftest_saved_policy saved; 732 int err2; 733 734 err = intel_selftest_modify_policy(engine, &saved, 735 SELFTEST_SCHEDULER_MODIFY_FAST_RESET); 736 if (err) { 737 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err); 738 break; 739 } 740 741 if (active) { 742 rq = hang_create_request(&h, engine); 743 if (IS_ERR(rq)) { 744 err = PTR_ERR(rq); 745 pr_err("[%s] Create hang request failed: %d!\n", 746 engine->name, err); 747 goto restore; 748 } 749 750 i915_request_get(rq); 751 i915_request_add(rq); 752 753 if (!wait_until_running(&h, rq)) { 754 struct drm_printer p = drm_info_printer(gt->i915->drm.dev); 755 756 pr_err("%s: Failed to start request %llx, at %x\n", 757 __func__, rq->fence.seqno, hws_seqno(&h, rq)); 758 intel_engine_dump(engine, &p, 759 "%s\n", engine->name); 760 761 i915_request_put(rq); 762 err = -EIO; 763 goto restore; 764 } 765 } 766 767 if (!using_guc) { 768 err = intel_engine_reset(engine, NULL); 769 if (err) { 770 pr_err("intel_engine_reset(%s) failed, err:%d\n", 771 engine->name, err); 772 goto skip; 773 } 774 } 775 776 if (rq) { 777 /* Ensure the reset happens and kills the engine */ 778 err = intel_selftest_wait_for_rq(rq); 779 if (err) 780 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n", 781 engine->name, rq->fence.context, 782 rq->fence.seqno, rq->context->guc_id.id, err); 783 } 784 785 skip: 786 if (rq) 787 i915_request_put(rq); 788 789 if (i915_reset_count(global) != reset_count) { 790 pr_err("Full GPU reset recorded! 
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
	bool stop;
	int result;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static void active_engine(struct kthread_work *work)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = container_of(work, typeof(*arg), work);
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			arg->result = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n",
			       engine->name, count, arg->result);
			if (!count)
				return;
			while (--count)
				intel_context_put(ce[count]);
			return;
		}
	}

	count = 0;
	while (!READ_ONCE(arg->stop)) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	arg->result = err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	struct active_engine *threads;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	for_each_engine(engine, gt, id) {
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
		for_each_engine(other, gt, tmp) {
			struct kthread_worker *worker;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			worker = kthread_run_worker(0, "igt/%s",
						    other->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				pr_err("[%s] Worker create failed: %d!\n",
				       engine->name, err);
				goto unwind;
			}

			threads[tmp].worker = worker;

			kthread_init_work(&threads[tmp].work, active_engine);
			kthread_queue_work(threads[tmp].worker,
					   &threads[tmp].work);
		}

		st_engine_heartbeat_disable_no_pm(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].worker)
				continue;

			WRITE_ONCE(threads[tmp].stop, true);
			kthread_flush_work(&threads[tmp].work);
			ret = READ_ONCE(threads[tmp].result);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}

			kthread_destroy_worker(threads[tmp].worker);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}
	kfree(threads);

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine;
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	engine = intel_selftest_find_any_engine(gt);

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags);
	if (err)
		pr_err("[%s] Move to active failed: %d!\n", engine->name, err);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	engine = intel_selftest_find_any_engine(gt);

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = to_gt(i915);
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}