/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/pm_qos.h>
#include <linux/prime_numbers.h>
#include <linux/sort.h>

#include "gem/i915_gem_internal.h"
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_clock_utils.h"
#include "gt/intel_gt_requests.h"
#include "gt/selftest_engine_heartbeat.h"

#include "i915_random.h"
#include "i915_selftest.h"
#include "i915_wait_util.h"
#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
#include "mock_drm.h"
#include "mock_gem_device.h"

static unsigned int num_uabi_engines(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	unsigned int count;

	count = 0;
	for_each_uabi_engine(engine, i915)
		count++;

	return count;
}

static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
{
	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
}

static int igt_add_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;

	/* Basic preliminary test to create a request and let it loose!
*/ 73 74 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 75 if (IS_ERR(request)) 76 return PTR_ERR(request); 77 78 i915_request_add(request); 79 80 return 0; 81 } 82 83 static int igt_wait_request(void *arg) 84 { 85 const long T = HZ / 4; 86 struct drm_i915_private *i915 = arg; 87 struct i915_request *request; 88 int err = -EINVAL; 89 90 /* Submit a request, then wait upon it */ 91 92 request = mock_request(rcs0(i915)->kernel_context, T); 93 if (IS_ERR(request)) 94 return PTR_ERR(request); 95 96 i915_request_get(request); 97 98 if (i915_request_wait(request, 0, 0) != -ETIME) { 99 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 100 goto out_request; 101 } 102 103 if (i915_request_wait(request, 0, T) != -ETIME) { 104 pr_err("request wait succeeded (expected timeout before submit!)\n"); 105 goto out_request; 106 } 107 108 if (i915_request_completed(request)) { 109 pr_err("request completed before submit!!\n"); 110 goto out_request; 111 } 112 113 i915_request_add(request); 114 115 if (i915_request_wait(request, 0, 0) != -ETIME) { 116 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 117 goto out_request; 118 } 119 120 if (i915_request_completed(request)) { 121 pr_err("request completed immediately!\n"); 122 goto out_request; 123 } 124 125 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 126 pr_err("request wait succeeded (expected timeout!)\n"); 127 goto out_request; 128 } 129 130 if (i915_request_wait(request, 0, T) == -ETIME) { 131 pr_err("request wait timed out!\n"); 132 goto out_request; 133 } 134 135 if (!i915_request_completed(request)) { 136 pr_err("request not complete after waiting!\n"); 137 goto out_request; 138 } 139 140 if (i915_request_wait(request, 0, T) == -ETIME) { 141 pr_err("request wait timed out when already complete!\n"); 142 goto out_request; 143 } 144 145 err = 0; 146 out_request: 147 i915_request_put(request); 148 mock_device_flush(i915); 149 return err; 150 } 151 152 static int igt_fence_wait(void *arg) 153 { 154 const long T = HZ / 4; 155 struct drm_i915_private *i915 = arg; 156 struct i915_request *request; 157 int err = -EINVAL; 158 159 /* Submit a request, treat it as a fence and wait upon it */ 160 161 request = mock_request(rcs0(i915)->kernel_context, T); 162 if (IS_ERR(request)) 163 return PTR_ERR(request); 164 165 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 166 pr_err("fence wait success before submit (expected timeout)!\n"); 167 goto out; 168 } 169 170 i915_request_add(request); 171 172 if (dma_fence_is_signaled(&request->fence)) { 173 pr_err("fence signaled immediately!\n"); 174 goto out; 175 } 176 177 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 178 pr_err("fence wait success after submit (expected timeout)!\n"); 179 goto out; 180 } 181 182 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 183 pr_err("fence wait timed out (expected success)!\n"); 184 goto out; 185 } 186 187 if (!dma_fence_is_signaled(&request->fence)) { 188 pr_err("fence unsignaled after waiting!\n"); 189 goto out; 190 } 191 192 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 193 pr_err("fence wait timed out when complete (expected success)!\n"); 194 goto out; 195 } 196 197 err = 0; 198 out: 199 mock_device_flush(i915); 200 return err; 201 } 202 203 static int igt_request_rewind(void *arg) 204 { 205 struct drm_i915_private *i915 = arg; 206 struct i915_request *request, *vip; 207 struct i915_gem_context *ctx[2]; 208 struct 
intel_context *ce; 209 int err = -EINVAL; 210 211 ctx[0] = mock_context(i915, "A"); 212 if (!ctx[0]) { 213 err = -ENOMEM; 214 goto err_ctx_0; 215 } 216 217 ce = i915_gem_context_get_engine(ctx[0], RCS0); 218 GEM_BUG_ON(IS_ERR(ce)); 219 request = mock_request(ce, 2 * HZ); 220 intel_context_put(ce); 221 if (IS_ERR(request)) { 222 err = PTR_ERR(request); 223 goto err_context_0; 224 } 225 226 i915_request_get(request); 227 i915_request_add(request); 228 229 ctx[1] = mock_context(i915, "B"); 230 if (!ctx[1]) { 231 err = -ENOMEM; 232 goto err_ctx_1; 233 } 234 235 ce = i915_gem_context_get_engine(ctx[1], RCS0); 236 GEM_BUG_ON(IS_ERR(ce)); 237 vip = mock_request(ce, 0); 238 intel_context_put(ce); 239 if (IS_ERR(vip)) { 240 err = PTR_ERR(vip); 241 goto err_context_1; 242 } 243 244 /* Simulate preemption by manual reordering */ 245 if (!mock_cancel_request(request)) { 246 pr_err("failed to cancel request (already executed)!\n"); 247 i915_request_add(vip); 248 goto err_context_1; 249 } 250 i915_request_get(vip); 251 i915_request_add(vip); 252 rcu_read_lock(); 253 request->engine->submit_request(request); 254 rcu_read_unlock(); 255 256 257 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 258 pr_err("timed out waiting for high priority request\n"); 259 goto err; 260 } 261 262 if (i915_request_completed(request)) { 263 pr_err("low priority request already completed\n"); 264 goto err; 265 } 266 267 err = 0; 268 err: 269 i915_request_put(vip); 270 err_context_1: 271 mock_context_close(ctx[1]); 272 err_ctx_1: 273 i915_request_put(request); 274 err_context_0: 275 mock_context_close(ctx[0]); 276 err_ctx_0: 277 mock_device_flush(i915); 278 return err; 279 } 280 281 struct smoketest { 282 struct intel_engine_cs *engine; 283 struct i915_gem_context **contexts; 284 atomic_long_t num_waits, num_fences; 285 int ncontexts, max_batch; 286 struct i915_request *(*request_alloc)(struct intel_context *ce); 287 }; 288 289 static struct i915_request * 290 __mock_request_alloc(struct intel_context *ce) 291 { 292 return mock_request(ce, 0); 293 } 294 295 static struct i915_request * 296 __live_request_alloc(struct intel_context *ce) 297 { 298 return intel_context_create_request(ce); 299 } 300 301 struct smoke_thread { 302 struct kthread_worker *worker; 303 struct kthread_work work; 304 struct smoketest *t; 305 bool stop; 306 int result; 307 }; 308 309 static void __igt_breadcrumbs_smoketest(struct kthread_work *work) 310 { 311 struct smoke_thread *thread = container_of(work, typeof(*thread), work); 312 struct smoketest *t = thread->t; 313 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 314 const unsigned int total = 4 * t->ncontexts + 1; 315 unsigned int num_waits = 0, num_fences = 0; 316 struct i915_request **requests; 317 I915_RND_STATE(prng); 318 unsigned int *order; 319 int err = 0; 320 321 /* 322 * A very simple test to catch the most egregious of list handling bugs. 323 * 324 * At its heart, we simply create oodles of requests running across 325 * multiple kthreads and enable signaling on them, for the sole purpose 326 * of stressing our breadcrumb handling. The only inspection we do is 327 * that the fences were marked as signaled. 
328 */ 329 330 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 331 if (!requests) { 332 thread->result = -ENOMEM; 333 return; 334 } 335 336 order = i915_random_order(total, &prng); 337 if (!order) { 338 err = -ENOMEM; 339 goto out_requests; 340 } 341 342 while (!READ_ONCE(thread->stop)) { 343 struct i915_sw_fence *submit, *wait; 344 unsigned int n, count; 345 346 submit = heap_fence_create(GFP_KERNEL); 347 if (!submit) { 348 err = -ENOMEM; 349 break; 350 } 351 352 wait = heap_fence_create(GFP_KERNEL); 353 if (!wait) { 354 i915_sw_fence_commit(submit); 355 heap_fence_put(submit); 356 err = -ENOMEM; 357 break; 358 } 359 360 i915_random_reorder(order, total, &prng); 361 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 362 363 for (n = 0; n < count; n++) { 364 struct i915_gem_context *ctx = 365 t->contexts[order[n] % t->ncontexts]; 366 struct i915_request *rq; 367 struct intel_context *ce; 368 369 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 370 GEM_BUG_ON(IS_ERR(ce)); 371 rq = t->request_alloc(ce); 372 intel_context_put(ce); 373 if (IS_ERR(rq)) { 374 err = PTR_ERR(rq); 375 count = n; 376 break; 377 } 378 379 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 380 submit, 381 GFP_KERNEL); 382 383 requests[n] = i915_request_get(rq); 384 i915_request_add(rq); 385 386 if (err >= 0) 387 err = i915_sw_fence_await_dma_fence(wait, 388 &rq->fence, 389 0, 390 GFP_KERNEL); 391 392 if (err < 0) { 393 i915_request_put(rq); 394 count = n; 395 break; 396 } 397 } 398 399 i915_sw_fence_commit(submit); 400 i915_sw_fence_commit(wait); 401 402 if (!wait_event_timeout(wait->wait, 403 i915_sw_fence_done(wait), 404 5 * HZ)) { 405 struct i915_request *rq = requests[count - 1]; 406 407 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 408 atomic_read(&wait->pending), count, 409 rq->fence.context, rq->fence.seqno, 410 t->engine->name); 411 GEM_TRACE_DUMP(); 412 413 intel_gt_set_wedged(t->engine->gt); 414 GEM_BUG_ON(!i915_request_completed(rq)); 415 i915_sw_fence_wait(wait); 416 err = -EIO; 417 } 418 419 for (n = 0; n < count; n++) { 420 struct i915_request *rq = requests[n]; 421 422 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 423 &rq->fence.flags)) { 424 pr_err("%llu:%llu was not signaled!\n", 425 rq->fence.context, rq->fence.seqno); 426 err = -EINVAL; 427 } 428 429 i915_request_put(rq); 430 } 431 432 heap_fence_put(wait); 433 heap_fence_put(submit); 434 435 if (err < 0) 436 break; 437 438 num_fences += count; 439 num_waits++; 440 441 cond_resched(); 442 } 443 444 atomic_long_add(num_fences, &t->num_fences); 445 atomic_long_add(num_waits, &t->num_waits); 446 447 kfree(order); 448 out_requests: 449 kfree(requests); 450 thread->result = err; 451 } 452 453 static int mock_breadcrumbs_smoketest(void *arg) 454 { 455 struct drm_i915_private *i915 = arg; 456 struct smoketest t = { 457 .engine = rcs0(i915), 458 .ncontexts = 1024, 459 .max_batch = 1024, 460 .request_alloc = __mock_request_alloc 461 }; 462 unsigned int ncpus = num_online_cpus(); 463 struct smoke_thread *threads; 464 unsigned int n; 465 int ret = 0; 466 467 /* 468 * Smoketest our breadcrumb/signal handling for requests across multiple 469 * threads. A very simple test to only catch the most egregious of bugs. 
470 * See __igt_breadcrumbs_smoketest(); 471 */ 472 473 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 474 if (!threads) 475 return -ENOMEM; 476 477 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 478 if (!t.contexts) { 479 ret = -ENOMEM; 480 goto out_threads; 481 } 482 483 for (n = 0; n < t.ncontexts; n++) { 484 t.contexts[n] = mock_context(t.engine->i915, "mock"); 485 if (!t.contexts[n]) { 486 ret = -ENOMEM; 487 goto out_contexts; 488 } 489 } 490 491 for (n = 0; n < ncpus; n++) { 492 struct kthread_worker *worker; 493 494 worker = kthread_run_worker(0, "igt/%d", n); 495 if (IS_ERR(worker)) { 496 ret = PTR_ERR(worker); 497 ncpus = n; 498 break; 499 } 500 501 threads[n].worker = worker; 502 threads[n].t = &t; 503 threads[n].stop = false; 504 threads[n].result = 0; 505 506 kthread_init_work(&threads[n].work, 507 __igt_breadcrumbs_smoketest); 508 kthread_queue_work(worker, &threads[n].work); 509 } 510 511 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 512 513 for (n = 0; n < ncpus; n++) { 514 int err; 515 516 WRITE_ONCE(threads[n].stop, true); 517 kthread_flush_work(&threads[n].work); 518 err = READ_ONCE(threads[n].result); 519 if (err < 0 && !ret) 520 ret = err; 521 522 kthread_destroy_worker(threads[n].worker); 523 } 524 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 525 atomic_long_read(&t.num_waits), 526 atomic_long_read(&t.num_fences), 527 ncpus); 528 529 out_contexts: 530 for (n = 0; n < t.ncontexts; n++) { 531 if (!t.contexts[n]) 532 break; 533 mock_context_close(t.contexts[n]); 534 } 535 kfree(t.contexts); 536 out_threads: 537 kfree(threads); 538 return ret; 539 } 540 541 int i915_request_mock_selftests(void) 542 { 543 static const struct i915_subtest tests[] = { 544 SUBTEST(igt_add_request), 545 SUBTEST(igt_wait_request), 546 SUBTEST(igt_fence_wait), 547 SUBTEST(igt_request_rewind), 548 SUBTEST(mock_breadcrumbs_smoketest), 549 }; 550 struct drm_i915_private *i915; 551 intel_wakeref_t wakeref; 552 int err = 0; 553 554 i915 = mock_gem_device(); 555 if (!i915) 556 return -ENOMEM; 557 558 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 559 err = i915_subtests(tests, i915); 560 561 mock_destroy_device(i915); 562 563 return err; 564 } 565 566 static int live_nop_request(void *arg) 567 { 568 struct drm_i915_private *i915 = arg; 569 struct intel_engine_cs *engine; 570 struct igt_live_test t; 571 int err = -ENODEV; 572 573 /* 574 * Submit various sized batches of empty requests, to each engine 575 * (individually), and wait for the batch to complete. We can check 576 * the overhead of submitting requests to the hardware. 577 */ 578 579 for_each_uabi_engine(engine, i915) { 580 unsigned long n, prime; 581 IGT_TIMEOUT(end_time); 582 ktime_t times[2] = {}; 583 584 err = igt_live_test_begin(&t, i915, __func__, engine->name); 585 if (err) 586 return err; 587 588 intel_engine_pm_get(engine); 589 for_each_prime_number_from(prime, 1, 8192) { 590 struct i915_request *request = NULL; 591 592 times[1] = ktime_get_raw(); 593 594 for (n = 0; n < prime; n++) { 595 i915_request_put(request); 596 request = i915_request_create(engine->kernel_context); 597 if (IS_ERR(request)) 598 return PTR_ERR(request); 599 600 /* 601 * This space is left intentionally blank. 602 * 603 * We do not actually want to perform any 604 * action with this request, we just want 605 * to measure the latency in allocation 606 * and submission of our breadcrumbs - 607 * ensuring that the bare request is sufficient 608 * for the system to work (i.e. 
proper HEAD 609 * tracking of the rings, interrupt handling, 610 * etc). It also gives us the lowest bounds 611 * for latency. 612 */ 613 614 i915_request_get(request); 615 i915_request_add(request); 616 } 617 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 618 i915_request_put(request); 619 620 times[1] = ktime_sub(ktime_get_raw(), times[1]); 621 if (prime == 1) 622 times[0] = times[1]; 623 624 if (__igt_timeout(end_time, NULL)) 625 break; 626 } 627 intel_engine_pm_put(engine); 628 629 err = igt_live_test_end(&t); 630 if (err) 631 return err; 632 633 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 634 engine->name, 635 ktime_to_ns(times[0]), 636 prime, div64_u64(ktime_to_ns(times[1]), prime)); 637 } 638 639 return err; 640 } 641 642 static int __cancel_inactive(struct intel_engine_cs *engine) 643 { 644 struct intel_context *ce; 645 struct igt_spinner spin; 646 struct i915_request *rq; 647 int err = 0; 648 649 if (igt_spinner_init(&spin, engine->gt)) 650 return -ENOMEM; 651 652 ce = intel_context_create(engine); 653 if (IS_ERR(ce)) { 654 err = PTR_ERR(ce); 655 goto out_spin; 656 } 657 658 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 659 if (IS_ERR(rq)) { 660 err = PTR_ERR(rq); 661 goto out_ce; 662 } 663 664 pr_debug("%s: Cancelling inactive request\n", engine->name); 665 i915_request_cancel(rq, -EINTR); 666 i915_request_get(rq); 667 i915_request_add(rq); 668 669 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 670 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 671 672 pr_err("%s: Failed to cancel inactive request\n", engine->name); 673 intel_engine_dump(engine, &p, "%s\n", engine->name); 674 err = -ETIME; 675 goto out_rq; 676 } 677 678 if (rq->fence.error != -EINTR) { 679 pr_err("%s: fence not cancelled (%u)\n", 680 engine->name, rq->fence.error); 681 err = -EINVAL; 682 } 683 684 out_rq: 685 i915_request_put(rq); 686 out_ce: 687 intel_context_put(ce); 688 out_spin: 689 igt_spinner_fini(&spin); 690 if (err) 691 pr_err("%s: %s error %d\n", __func__, engine->name, err); 692 return err; 693 } 694 695 static int __cancel_active(struct intel_engine_cs *engine) 696 { 697 struct intel_context *ce; 698 struct igt_spinner spin; 699 struct i915_request *rq; 700 int err = 0; 701 702 if (igt_spinner_init(&spin, engine->gt)) 703 return -ENOMEM; 704 705 ce = intel_context_create(engine); 706 if (IS_ERR(ce)) { 707 err = PTR_ERR(ce); 708 goto out_spin; 709 } 710 711 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 712 if (IS_ERR(rq)) { 713 err = PTR_ERR(rq); 714 goto out_ce; 715 } 716 717 pr_debug("%s: Cancelling active request\n", engine->name); 718 i915_request_get(rq); 719 i915_request_add(rq); 720 if (!igt_wait_for_spinner(&spin, rq)) { 721 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 722 723 pr_err("Failed to start spinner on %s\n", engine->name); 724 intel_engine_dump(engine, &p, "%s\n", engine->name); 725 err = -ETIME; 726 goto out_rq; 727 } 728 i915_request_cancel(rq, -EINTR); 729 730 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 731 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 732 733 pr_err("%s: Failed to cancel active request\n", engine->name); 734 intel_engine_dump(engine, &p, "%s\n", engine->name); 735 err = -ETIME; 736 goto out_rq; 737 } 738 739 if (rq->fence.error != -EINTR) { 740 pr_err("%s: fence not cancelled (%u)\n", 741 engine->name, rq->fence.error); 742 err = -EINVAL; 743 } 744 745 out_rq: 746 i915_request_put(rq); 747 out_ce: 748 intel_context_put(ce); 749 out_spin: 750 igt_spinner_fini(&spin); 
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int __cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	igt_spinner_end(&spin);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	if (rq->fence.error) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for the spinner to start, create a NOP request and submit it, cancel
 * the spinner, wait for the spinner to complete and verify it failed with an
 * error, and finally wait for the NOP request to complete and verify it
 * succeeded without an error. The preemption timeout is also reduced and then
 * restored so the test runs in a timely manner.
 */
static int __cancel_reset(struct drm_i915_private *i915,
			  struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq, *nop;
	unsigned long preempt_timeout_ms;
	int err = 0;

	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
	    !intel_has_reset_engine(engine->gt))
		return 0;

	preempt_timeout_ms = engine->props.preempt_timeout_ms;
	engine->props.preempt_timeout_ms = 100;

	if (igt_spinner_init(&spin, engine->gt)) {
		err = -ENOMEM;
		goto out_restore;
	}

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	pr_debug("%s: Cancelling active non-preemptable request\n",
		 engine->name);
	i915_request_get(rq);
	i915_request_add(rq);
	if (!igt_wait_for_spinner(&spin, rq)) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("Failed to start spinner on %s\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	nop = intel_context_create_request(ce);
	if (IS_ERR(nop)) {
		err = PTR_ERR(nop);
		goto out_rq;
	}
	i915_request_get(nop);
	i915_request_add(nop);

	i915_request_cancel(rq, -EINTR);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel hung request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
		goto out_nop;
	}

	if (i915_request_wait(nop, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to complete nop request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (nop->fence.error != 0) {
		pr_err("%s: Nop request errored (%u)\n",
		       engine->name, nop->fence.error);
		err = -EINVAL;
	}

out_nop:
	i915_request_put(nop);
out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
out_restore:
	engine->props.preempt_timeout_ms = preempt_timeout_ms;
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int live_cancel_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;

	/*
	 * Check cancellation of requests. We expect to be able to immediately
	 * cancel active requests, even if they are currently on the GPU.
922 */ 923 924 for_each_uabi_engine(engine, i915) { 925 struct igt_live_test t; 926 int err, err2; 927 928 if (!intel_engine_has_preemption(engine)) 929 continue; 930 931 err = igt_live_test_begin(&t, i915, __func__, engine->name); 932 if (err) 933 return err; 934 935 err = __cancel_inactive(engine); 936 if (err == 0) 937 err = __cancel_active(engine); 938 if (err == 0) 939 err = __cancel_completed(engine); 940 941 err2 = igt_live_test_end(&t); 942 if (err) 943 return err; 944 if (err2) 945 return err2; 946 947 /* Expects reset so call outside of igt_live_test_* */ 948 err = __cancel_reset(i915, engine); 949 if (err) 950 return err; 951 952 if (igt_flush_test(i915)) 953 return -EIO; 954 } 955 956 return 0; 957 } 958 959 static struct i915_vma *empty_batch(struct intel_gt *gt) 960 { 961 struct drm_i915_gem_object *obj; 962 struct i915_vma *vma; 963 u32 *cmd; 964 int err; 965 966 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 967 if (IS_ERR(obj)) 968 return ERR_CAST(obj); 969 970 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 971 if (IS_ERR(cmd)) { 972 err = PTR_ERR(cmd); 973 goto err; 974 } 975 976 *cmd = MI_BATCH_BUFFER_END; 977 978 __i915_gem_object_flush_map(obj, 0, 64); 979 i915_gem_object_unpin_map(obj); 980 981 intel_gt_chipset_flush(gt); 982 983 vma = i915_vma_instance(obj, gt->vm, NULL); 984 if (IS_ERR(vma)) { 985 err = PTR_ERR(vma); 986 goto err; 987 } 988 989 err = i915_vma_pin(vma, 0, 0, PIN_USER); 990 if (err) 991 goto err; 992 993 /* Force the wait now to avoid including it in the benchmark */ 994 err = i915_vma_sync(vma); 995 if (err) 996 goto err_pin; 997 998 return vma; 999 1000 err_pin: 1001 i915_vma_unpin(vma); 1002 err: 1003 i915_gem_object_put(obj); 1004 return ERR_PTR(err); 1005 } 1006 1007 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch) 1008 { 1009 return rq->engine->emit_bb_start(rq, 1010 i915_vma_offset(batch), 1011 i915_vma_size(batch), 1012 0); 1013 } 1014 1015 static struct i915_request * 1016 empty_request(struct intel_engine_cs *engine, 1017 struct i915_vma *batch) 1018 { 1019 struct i915_request *request; 1020 int err; 1021 1022 request = i915_request_create(engine->kernel_context); 1023 if (IS_ERR(request)) 1024 return request; 1025 1026 err = emit_bb_start(request, batch); 1027 if (err) 1028 goto out_request; 1029 1030 i915_request_get(request); 1031 out_request: 1032 i915_request_add(request); 1033 return err ? ERR_PTR(err) : request; 1034 } 1035 1036 static int live_empty_request(void *arg) 1037 { 1038 struct drm_i915_private *i915 = arg; 1039 struct intel_engine_cs *engine; 1040 struct igt_live_test t; 1041 int err; 1042 1043 /* 1044 * Submit various sized batches of empty requests, to each engine 1045 * (individually), and wait for the batch to complete. We can check 1046 * the overhead of submitting requests to the hardware. 
1047 */ 1048 1049 for_each_uabi_engine(engine, i915) { 1050 IGT_TIMEOUT(end_time); 1051 struct i915_request *request; 1052 struct i915_vma *batch; 1053 unsigned long n, prime; 1054 ktime_t times[2] = {}; 1055 1056 batch = empty_batch(engine->gt); 1057 if (IS_ERR(batch)) 1058 return PTR_ERR(batch); 1059 1060 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1061 if (err) 1062 goto out_batch; 1063 1064 intel_engine_pm_get(engine); 1065 1066 /* Warmup / preload */ 1067 request = empty_request(engine, batch); 1068 if (IS_ERR(request)) { 1069 err = PTR_ERR(request); 1070 intel_engine_pm_put(engine); 1071 goto out_batch; 1072 } 1073 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1074 1075 for_each_prime_number_from(prime, 1, 8192) { 1076 times[1] = ktime_get_raw(); 1077 1078 for (n = 0; n < prime; n++) { 1079 i915_request_put(request); 1080 request = empty_request(engine, batch); 1081 if (IS_ERR(request)) { 1082 err = PTR_ERR(request); 1083 intel_engine_pm_put(engine); 1084 goto out_batch; 1085 } 1086 } 1087 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1088 1089 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1090 if (prime == 1) 1091 times[0] = times[1]; 1092 1093 if (__igt_timeout(end_time, NULL)) 1094 break; 1095 } 1096 i915_request_put(request); 1097 intel_engine_pm_put(engine); 1098 1099 err = igt_live_test_end(&t); 1100 if (err) 1101 goto out_batch; 1102 1103 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1104 engine->name, 1105 ktime_to_ns(times[0]), 1106 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1107 out_batch: 1108 i915_vma_unpin(batch); 1109 i915_vma_put(batch); 1110 if (err) 1111 break; 1112 } 1113 1114 return err; 1115 } 1116 1117 static struct i915_vma *recursive_batch(struct intel_gt *gt) 1118 { 1119 struct drm_i915_gem_object *obj; 1120 const int ver = GRAPHICS_VER(gt->i915); 1121 struct i915_vma *vma; 1122 u32 *cmd; 1123 int err; 1124 1125 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 1126 if (IS_ERR(obj)) 1127 return ERR_CAST(obj); 1128 1129 vma = i915_vma_instance(obj, gt->vm, NULL); 1130 if (IS_ERR(vma)) { 1131 err = PTR_ERR(vma); 1132 goto err; 1133 } 1134 1135 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1136 if (err) 1137 goto err; 1138 1139 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1140 if (IS_ERR(cmd)) { 1141 err = PTR_ERR(cmd); 1142 goto err; 1143 } 1144 1145 if (ver >= 8) { 1146 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1147 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1148 *cmd++ = upper_32_bits(i915_vma_offset(vma)); 1149 } else if (ver >= 6) { 1150 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1151 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1152 } else { 1153 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1154 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1155 } 1156 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1157 1158 __i915_gem_object_flush_map(obj, 0, 64); 1159 i915_gem_object_unpin_map(obj); 1160 1161 intel_gt_chipset_flush(gt); 1162 1163 return vma; 1164 1165 err: 1166 i915_gem_object_put(obj); 1167 return ERR_PTR(err); 1168 } 1169 1170 static int recursive_batch_resolve(struct i915_vma *batch) 1171 { 1172 u32 *cmd; 1173 1174 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1175 if (IS_ERR(cmd)) 1176 return PTR_ERR(cmd); 1177 1178 *cmd = MI_BATCH_BUFFER_END; 1179 1180 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1181 i915_gem_object_unpin_map(batch->obj); 1182 1183 intel_gt_chipset_flush(batch->vm->gt); 1184 1185 return 0; 1186 } 1187 
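/*
 * Note on the helpers above: recursive_batch() emits a batch whose payload is
 * an MI_BATCH_BUFFER_START pointing back at the batch itself, so a request
 * executing it keeps spinning on the GPU. recursive_batch_resolve() later
 * overwrites that first dword with MI_BATCH_BUFFER_END, allowing the request
 * to complete. The tests below use this to hold requests in flight without
 * busy-waiting on the CPU.
 */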
1188 static int live_all_engines(void *arg) 1189 { 1190 struct drm_i915_private *i915 = arg; 1191 const unsigned int nengines = num_uabi_engines(i915); 1192 struct intel_engine_cs *engine; 1193 struct i915_request **request; 1194 struct igt_live_test t; 1195 unsigned int idx; 1196 int err; 1197 1198 /* 1199 * Check we can submit requests to all engines simultaneously. We 1200 * send a recursive batch to each engine - checking that we don't 1201 * block doing so, and that they don't complete too soon. 1202 */ 1203 1204 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1205 if (!request) 1206 return -ENOMEM; 1207 1208 err = igt_live_test_begin(&t, i915, __func__, ""); 1209 if (err) 1210 goto out_free; 1211 1212 idx = 0; 1213 for_each_uabi_engine(engine, i915) { 1214 struct i915_vma *batch; 1215 1216 batch = recursive_batch(engine->gt); 1217 if (IS_ERR(batch)) { 1218 err = PTR_ERR(batch); 1219 pr_err("%s: Unable to create batch, err=%d\n", 1220 __func__, err); 1221 goto out_free; 1222 } 1223 1224 i915_vma_lock(batch); 1225 request[idx] = intel_engine_create_kernel_request(engine); 1226 if (IS_ERR(request[idx])) { 1227 err = PTR_ERR(request[idx]); 1228 pr_err("%s: Request allocation failed with err=%d\n", 1229 __func__, err); 1230 goto out_unlock; 1231 } 1232 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1233 1234 err = i915_vma_move_to_active(batch, request[idx], 0); 1235 GEM_BUG_ON(err); 1236 1237 err = emit_bb_start(request[idx], batch); 1238 GEM_BUG_ON(err); 1239 request[idx]->batch = batch; 1240 1241 i915_request_get(request[idx]); 1242 i915_request_add(request[idx]); 1243 idx++; 1244 out_unlock: 1245 i915_vma_unlock(batch); 1246 if (err) 1247 goto out_request; 1248 } 1249 1250 idx = 0; 1251 for_each_uabi_engine(engine, i915) { 1252 if (i915_request_completed(request[idx])) { 1253 pr_err("%s(%s): request completed too early!\n", 1254 __func__, engine->name); 1255 err = -EINVAL; 1256 goto out_request; 1257 } 1258 idx++; 1259 } 1260 1261 idx = 0; 1262 for_each_uabi_engine(engine, i915) { 1263 err = recursive_batch_resolve(request[idx]->batch); 1264 if (err) { 1265 pr_err("%s: failed to resolve batch, err=%d\n", 1266 __func__, err); 1267 goto out_request; 1268 } 1269 idx++; 1270 } 1271 1272 idx = 0; 1273 for_each_uabi_engine(engine, i915) { 1274 struct i915_request *rq = request[idx]; 1275 long timeout; 1276 1277 timeout = i915_request_wait(rq, 0, 1278 MAX_SCHEDULE_TIMEOUT); 1279 if (timeout < 0) { 1280 err = timeout; 1281 pr_err("%s: error waiting for request on %s, err=%d\n", 1282 __func__, engine->name, err); 1283 goto out_request; 1284 } 1285 1286 GEM_BUG_ON(!i915_request_completed(rq)); 1287 i915_vma_unpin(rq->batch); 1288 i915_vma_put(rq->batch); 1289 i915_request_put(rq); 1290 request[idx] = NULL; 1291 idx++; 1292 } 1293 1294 err = igt_live_test_end(&t); 1295 1296 out_request: 1297 idx = 0; 1298 for_each_uabi_engine(engine, i915) { 1299 struct i915_request *rq = request[idx]; 1300 1301 if (!rq) 1302 continue; 1303 1304 if (rq->batch) { 1305 i915_vma_unpin(rq->batch); 1306 i915_vma_put(rq->batch); 1307 } 1308 i915_request_put(rq); 1309 idx++; 1310 } 1311 out_free: 1312 kfree(request); 1313 return err; 1314 } 1315 1316 static int live_sequential_engines(void *arg) 1317 { 1318 struct drm_i915_private *i915 = arg; 1319 const unsigned int nengines = num_uabi_engines(i915); 1320 struct i915_request **request; 1321 struct i915_request *prev = NULL; 1322 struct intel_engine_cs *engine; 1323 struct igt_live_test t; 1324 unsigned int idx; 1325 int err; 1326 1327 /* 1328 * Check 
we can submit requests to all engines sequentially, such 1329 * that each successive request waits for the earlier ones. This 1330 * tests that we don't execute requests out of order, even though 1331 * they are running on independent engines. 1332 */ 1333 1334 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1335 if (!request) 1336 return -ENOMEM; 1337 1338 err = igt_live_test_begin(&t, i915, __func__, ""); 1339 if (err) 1340 goto out_free; 1341 1342 idx = 0; 1343 for_each_uabi_engine(engine, i915) { 1344 struct i915_vma *batch; 1345 1346 batch = recursive_batch(engine->gt); 1347 if (IS_ERR(batch)) { 1348 err = PTR_ERR(batch); 1349 pr_err("%s: Unable to create batch for %s, err=%d\n", 1350 __func__, engine->name, err); 1351 goto out_free; 1352 } 1353 1354 i915_vma_lock(batch); 1355 request[idx] = intel_engine_create_kernel_request(engine); 1356 if (IS_ERR(request[idx])) { 1357 err = PTR_ERR(request[idx]); 1358 pr_err("%s: Request allocation failed for %s with err=%d\n", 1359 __func__, engine->name, err); 1360 goto out_unlock; 1361 } 1362 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1363 1364 if (prev) { 1365 err = i915_request_await_dma_fence(request[idx], 1366 &prev->fence); 1367 if (err) { 1368 i915_request_add(request[idx]); 1369 pr_err("%s: Request await failed for %s with err=%d\n", 1370 __func__, engine->name, err); 1371 goto out_unlock; 1372 } 1373 } 1374 1375 err = i915_vma_move_to_active(batch, request[idx], 0); 1376 GEM_BUG_ON(err); 1377 1378 err = emit_bb_start(request[idx], batch); 1379 GEM_BUG_ON(err); 1380 request[idx]->batch = batch; 1381 1382 i915_request_get(request[idx]); 1383 i915_request_add(request[idx]); 1384 1385 prev = request[idx]; 1386 idx++; 1387 1388 out_unlock: 1389 i915_vma_unlock(batch); 1390 if (err) 1391 goto out_request; 1392 } 1393 1394 idx = 0; 1395 for_each_uabi_engine(engine, i915) { 1396 long timeout; 1397 1398 if (i915_request_completed(request[idx])) { 1399 pr_err("%s(%s): request completed too early!\n", 1400 __func__, engine->name); 1401 err = -EINVAL; 1402 goto out_request; 1403 } 1404 1405 err = recursive_batch_resolve(request[idx]->batch); 1406 if (err) { 1407 pr_err("%s: failed to resolve batch, err=%d\n", 1408 __func__, err); 1409 goto out_request; 1410 } 1411 1412 timeout = i915_request_wait(request[idx], 0, 1413 MAX_SCHEDULE_TIMEOUT); 1414 if (timeout < 0) { 1415 err = timeout; 1416 pr_err("%s: error waiting for request on %s, err=%d\n", 1417 __func__, engine->name, err); 1418 goto out_request; 1419 } 1420 1421 GEM_BUG_ON(!i915_request_completed(request[idx])); 1422 idx++; 1423 } 1424 1425 err = igt_live_test_end(&t); 1426 1427 out_request: 1428 idx = 0; 1429 for_each_uabi_engine(engine, i915) { 1430 u32 *cmd; 1431 1432 if (!request[idx]) 1433 break; 1434 1435 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1436 I915_MAP_WC); 1437 if (!IS_ERR(cmd)) { 1438 *cmd = MI_BATCH_BUFFER_END; 1439 1440 __i915_gem_object_flush_map(request[idx]->batch->obj, 1441 0, sizeof(*cmd)); 1442 i915_gem_object_unpin_map(request[idx]->batch->obj); 1443 1444 intel_gt_chipset_flush(engine->gt); 1445 } 1446 1447 i915_vma_put(request[idx]->batch); 1448 i915_request_put(request[idx]); 1449 idx++; 1450 } 1451 out_free: 1452 kfree(request); 1453 return err; 1454 } 1455 1456 struct parallel_thread { 1457 struct kthread_worker *worker; 1458 struct kthread_work work; 1459 struct intel_engine_cs *engine; 1460 int result; 1461 }; 1462 1463 static void __live_parallel_engine1(struct kthread_work *work) 1464 { 1465 struct parallel_thread 
*thread = 1466 container_of(work, typeof(*thread), work); 1467 struct intel_engine_cs *engine = thread->engine; 1468 IGT_TIMEOUT(end_time); 1469 unsigned long count; 1470 int err = 0; 1471 1472 count = 0; 1473 intel_engine_pm_get(engine); 1474 do { 1475 struct i915_request *rq; 1476 1477 rq = i915_request_create(engine->kernel_context); 1478 if (IS_ERR(rq)) { 1479 err = PTR_ERR(rq); 1480 break; 1481 } 1482 1483 i915_request_get(rq); 1484 i915_request_add(rq); 1485 1486 err = 0; 1487 if (i915_request_wait(rq, 0, HZ) < 0) 1488 err = -ETIME; 1489 i915_request_put(rq); 1490 if (err) 1491 break; 1492 1493 count++; 1494 } while (!__igt_timeout(end_time, NULL)); 1495 intel_engine_pm_put(engine); 1496 1497 pr_info("%s: %lu request + sync\n", engine->name, count); 1498 thread->result = err; 1499 } 1500 1501 static void __live_parallel_engineN(struct kthread_work *work) 1502 { 1503 struct parallel_thread *thread = 1504 container_of(work, typeof(*thread), work); 1505 struct intel_engine_cs *engine = thread->engine; 1506 IGT_TIMEOUT(end_time); 1507 unsigned long count; 1508 int err = 0; 1509 1510 count = 0; 1511 intel_engine_pm_get(engine); 1512 do { 1513 struct i915_request *rq; 1514 1515 rq = i915_request_create(engine->kernel_context); 1516 if (IS_ERR(rq)) { 1517 err = PTR_ERR(rq); 1518 break; 1519 } 1520 1521 i915_request_add(rq); 1522 count++; 1523 } while (!__igt_timeout(end_time, NULL)); 1524 intel_engine_pm_put(engine); 1525 1526 pr_info("%s: %lu requests\n", engine->name, count); 1527 thread->result = err; 1528 } 1529 1530 static bool wake_all(struct drm_i915_private *i915) 1531 { 1532 if (atomic_dec_and_test(&i915->selftest.counter)) { 1533 wake_up_var(&i915->selftest.counter); 1534 return true; 1535 } 1536 1537 return false; 1538 } 1539 1540 static int wait_for_all(struct drm_i915_private *i915) 1541 { 1542 if (wake_all(i915)) 1543 return 0; 1544 1545 if (wait_var_event_timeout(&i915->selftest.counter, 1546 !atomic_read(&i915->selftest.counter), 1547 i915_selftest.timeout_jiffies)) 1548 return 0; 1549 1550 return -ETIME; 1551 } 1552 1553 static void __live_parallel_spin(struct kthread_work *work) 1554 { 1555 struct parallel_thread *thread = 1556 container_of(work, typeof(*thread), work); 1557 struct intel_engine_cs *engine = thread->engine; 1558 struct igt_spinner spin; 1559 struct i915_request *rq; 1560 int err = 0; 1561 1562 /* 1563 * Create a spinner running for eternity on each engine. If a second 1564 * spinner is incorrectly placed on the same engine, it will not be 1565 * able to start in time. 
1566 */ 1567 1568 if (igt_spinner_init(&spin, engine->gt)) { 1569 wake_all(engine->i915); 1570 thread->result = -ENOMEM; 1571 return; 1572 } 1573 1574 intel_engine_pm_get(engine); 1575 rq = igt_spinner_create_request(&spin, 1576 engine->kernel_context, 1577 MI_NOOP); /* no preemption */ 1578 intel_engine_pm_put(engine); 1579 if (IS_ERR(rq)) { 1580 err = PTR_ERR(rq); 1581 if (err == -ENODEV) 1582 err = 0; 1583 wake_all(engine->i915); 1584 goto out_spin; 1585 } 1586 1587 i915_request_get(rq); 1588 i915_request_add(rq); 1589 if (igt_wait_for_spinner(&spin, rq)) { 1590 /* Occupy this engine for the whole test */ 1591 err = wait_for_all(engine->i915); 1592 } else { 1593 pr_err("Failed to start spinner on %s\n", engine->name); 1594 err = -EINVAL; 1595 } 1596 igt_spinner_end(&spin); 1597 1598 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1599 err = -EIO; 1600 i915_request_put(rq); 1601 1602 out_spin: 1603 igt_spinner_fini(&spin); 1604 thread->result = err; 1605 } 1606 1607 static int live_parallel_engines(void *arg) 1608 { 1609 struct drm_i915_private *i915 = arg; 1610 static void (* const func[])(struct kthread_work *) = { 1611 __live_parallel_engine1, 1612 __live_parallel_engineN, 1613 __live_parallel_spin, 1614 NULL, 1615 }; 1616 const unsigned int nengines = num_uabi_engines(i915); 1617 struct parallel_thread *threads; 1618 struct intel_engine_cs *engine; 1619 void (* const *fn)(struct kthread_work *); 1620 int err = 0; 1621 1622 /* 1623 * Check we can submit requests to all engines concurrently. This 1624 * tests that we load up the system maximally. 1625 */ 1626 1627 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL); 1628 if (!threads) 1629 return -ENOMEM; 1630 1631 for (fn = func; !err && *fn; fn++) { 1632 char name[KSYM_NAME_LEN]; 1633 struct igt_live_test t; 1634 unsigned int idx; 1635 1636 snprintf(name, sizeof(name), "%ps", *fn); 1637 err = igt_live_test_begin(&t, i915, __func__, name); 1638 if (err) 1639 break; 1640 1641 atomic_set(&i915->selftest.counter, nengines); 1642 1643 idx = 0; 1644 for_each_uabi_engine(engine, i915) { 1645 struct kthread_worker *worker; 1646 1647 worker = kthread_run_worker(0, "igt/parallel:%s", 1648 engine->name); 1649 if (IS_ERR(worker)) { 1650 err = PTR_ERR(worker); 1651 break; 1652 } 1653 1654 threads[idx].worker = worker; 1655 threads[idx].result = 0; 1656 threads[idx].engine = engine; 1657 1658 kthread_init_work(&threads[idx].work, *fn); 1659 kthread_queue_work(worker, &threads[idx].work); 1660 idx++; 1661 } 1662 1663 idx = 0; 1664 for_each_uabi_engine(engine, i915) { 1665 int status; 1666 1667 if (!threads[idx].worker) 1668 break; 1669 1670 kthread_flush_work(&threads[idx].work); 1671 status = READ_ONCE(threads[idx].result); 1672 if (status && !err) 1673 err = status; 1674 1675 kthread_destroy_worker(threads[idx++].worker); 1676 } 1677 1678 if (igt_live_test_end(&t)) 1679 err = -EIO; 1680 } 1681 1682 kfree(threads); 1683 return err; 1684 } 1685 1686 static int 1687 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1688 { 1689 struct i915_request *rq; 1690 int ret; 1691 1692 /* 1693 * Before execlists, all contexts share the same ringbuffer. With 1694 * execlists, each context/engine has a separate ringbuffer and 1695 * for the purposes of this test, inexhaustible. 1696 * 1697 * For the global ringbuffer though, we have to be very careful 1698 * that we do not wrap while preventing the execution of requests 1699 * with a unsignaled fence. 
1700 */ 1701 if (HAS_EXECLISTS(ctx->i915)) 1702 return INT_MAX; 1703 1704 rq = igt_request_alloc(ctx, engine); 1705 if (IS_ERR(rq)) { 1706 ret = PTR_ERR(rq); 1707 } else { 1708 int sz; 1709 1710 ret = rq->ring->size - rq->reserved_space; 1711 i915_request_add(rq); 1712 1713 sz = rq->ring->emit - rq->head; 1714 if (sz < 0) 1715 sz += rq->ring->size; 1716 ret /= sz; 1717 ret /= 2; /* leave half spare, in case of emergency! */ 1718 } 1719 1720 return ret; 1721 } 1722 1723 static int live_breadcrumbs_smoketest(void *arg) 1724 { 1725 struct drm_i915_private *i915 = arg; 1726 const unsigned int nengines = num_uabi_engines(i915); 1727 const unsigned int ncpus = /* saturate with nengines * ncpus */ 1728 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines)); 1729 unsigned long num_waits, num_fences; 1730 struct intel_engine_cs *engine; 1731 struct smoke_thread *threads; 1732 struct igt_live_test live; 1733 intel_wakeref_t wakeref; 1734 struct smoketest *smoke; 1735 unsigned int n, idx; 1736 struct file *file; 1737 int ret = 0; 1738 1739 /* 1740 * Smoketest our breadcrumb/signal handling for requests across multiple 1741 * threads. A very simple test to only catch the most egregious of bugs. 1742 * See __igt_breadcrumbs_smoketest(); 1743 * 1744 * On real hardware this time. 1745 */ 1746 1747 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1748 1749 file = mock_file(i915); 1750 if (IS_ERR(file)) { 1751 ret = PTR_ERR(file); 1752 goto out_rpm; 1753 } 1754 1755 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1756 if (!smoke) { 1757 ret = -ENOMEM; 1758 goto out_file; 1759 } 1760 1761 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1762 if (!threads) { 1763 ret = -ENOMEM; 1764 goto out_smoke; 1765 } 1766 1767 smoke[0].request_alloc = __live_request_alloc; 1768 smoke[0].ncontexts = 64; 1769 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1770 sizeof(*smoke[0].contexts), 1771 GFP_KERNEL); 1772 if (!smoke[0].contexts) { 1773 ret = -ENOMEM; 1774 goto out_threads; 1775 } 1776 1777 for (n = 0; n < smoke[0].ncontexts; n++) { 1778 smoke[0].contexts[n] = live_context(i915, file); 1779 if (IS_ERR(smoke[0].contexts[n])) { 1780 ret = PTR_ERR(smoke[0].contexts[n]); 1781 goto out_contexts; 1782 } 1783 } 1784 1785 ret = igt_live_test_begin(&live, i915, __func__, ""); 1786 if (ret) 1787 goto out_contexts; 1788 1789 idx = 0; 1790 for_each_uabi_engine(engine, i915) { 1791 smoke[idx] = smoke[0]; 1792 smoke[idx].engine = engine; 1793 smoke[idx].max_batch = 1794 max_batches(smoke[0].contexts[0], engine); 1795 if (smoke[idx].max_batch < 0) { 1796 ret = smoke[idx].max_batch; 1797 goto out_flush; 1798 } 1799 /* One ring interleaved between requests from all cpus */ 1800 smoke[idx].max_batch /= ncpus + 1; 1801 pr_debug("Limiting batches to %d requests on %s\n", 1802 smoke[idx].max_batch, engine->name); 1803 1804 for (n = 0; n < ncpus; n++) { 1805 unsigned int i = idx * ncpus + n; 1806 struct kthread_worker *worker; 1807 1808 worker = kthread_run_worker(0, "igt/%d.%d", idx, n); 1809 if (IS_ERR(worker)) { 1810 ret = PTR_ERR(worker); 1811 goto out_flush; 1812 } 1813 1814 threads[i].worker = worker; 1815 threads[i].t = &smoke[idx]; 1816 1817 kthread_init_work(&threads[i].work, 1818 __igt_breadcrumbs_smoketest); 1819 kthread_queue_work(worker, &threads[i].work); 1820 } 1821 1822 idx++; 1823 } 1824 1825 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1826 1827 out_flush: 1828 idx = 0; 1829 num_waits = 0; 1830 num_fences = 0; 1831 for_each_uabi_engine(engine, i915) { 1832 for (n = 0; n < ncpus; 
n++) { 1833 unsigned int i = idx * ncpus + n; 1834 int err; 1835 1836 if (!threads[i].worker) 1837 continue; 1838 1839 WRITE_ONCE(threads[i].stop, true); 1840 kthread_flush_work(&threads[i].work); 1841 err = READ_ONCE(threads[i].result); 1842 if (err < 0 && !ret) 1843 ret = err; 1844 1845 kthread_destroy_worker(threads[i].worker); 1846 } 1847 1848 num_waits += atomic_long_read(&smoke[idx].num_waits); 1849 num_fences += atomic_long_read(&smoke[idx].num_fences); 1850 idx++; 1851 } 1852 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1853 num_waits, num_fences, idx, ncpus); 1854 1855 ret = igt_live_test_end(&live) ?: ret; 1856 out_contexts: 1857 kfree(smoke[0].contexts); 1858 out_threads: 1859 kfree(threads); 1860 out_smoke: 1861 kfree(smoke); 1862 out_file: 1863 fput(file); 1864 out_rpm: 1865 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1866 1867 return ret; 1868 } 1869 1870 int i915_request_live_selftests(struct drm_i915_private *i915) 1871 { 1872 static const struct i915_subtest tests[] = { 1873 SUBTEST(live_nop_request), 1874 SUBTEST(live_all_engines), 1875 SUBTEST(live_sequential_engines), 1876 SUBTEST(live_parallel_engines), 1877 SUBTEST(live_empty_request), 1878 SUBTEST(live_cancel_request), 1879 SUBTEST(live_breadcrumbs_smoketest), 1880 }; 1881 1882 if (intel_gt_is_wedged(to_gt(i915))) 1883 return 0; 1884 1885 return i915_live_subtests(tests, i915); 1886 } 1887 1888 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1889 { 1890 struct i915_request *rq; 1891 struct dma_fence *fence; 1892 1893 rq = intel_engine_create_kernel_request(ce->engine); 1894 if (IS_ERR(rq)) 1895 return PTR_ERR(rq); 1896 1897 fence = i915_active_fence_get(&ce->timeline->last_request); 1898 if (fence) { 1899 i915_request_await_dma_fence(rq, fence); 1900 dma_fence_put(fence); 1901 } 1902 1903 rq = i915_request_get(rq); 1904 i915_request_add(rq); 1905 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1906 err = -ETIME; 1907 i915_request_put(rq); 1908 1909 while (!err && !intel_engine_is_idle(ce->engine)) 1910 intel_engine_flush_submission(ce->engine); 1911 1912 return err; 1913 } 1914 1915 struct perf_stats { 1916 struct intel_engine_cs *engine; 1917 unsigned long count; 1918 ktime_t time; 1919 ktime_t busy; 1920 u64 runtime; 1921 }; 1922 1923 struct perf_series { 1924 struct drm_i915_private *i915; 1925 unsigned int nengines; 1926 struct intel_context *ce[] __counted_by(nengines); 1927 }; 1928 1929 static int cmp_u32(const void *A, const void *B) 1930 { 1931 const u32 *a = A, *b = B; 1932 1933 return *a - *b; 1934 } 1935 1936 static u32 trifilter(u32 *a) 1937 { 1938 u64 sum; 1939 1940 #define TF_COUNT 5 1941 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1942 1943 sum = mul_u32_u32(a[2], 2); 1944 sum += a[1]; 1945 sum += a[3]; 1946 1947 GEM_BUG_ON(sum > U32_MAX); 1948 return sum; 1949 #define TF_BIAS 2 1950 } 1951 1952 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1953 { 1954 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1955 1956 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1957 } 1958 1959 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1960 { 1961 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1962 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1963 *cs++ = offset; 1964 *cs++ = 0; 1965 1966 return cs; 1967 } 1968 1969 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1970 { 1971 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1972 *cs++ = offset; 1973 *cs++ = 0; 
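	/* low/high dwords of the GGTT address above; the immediate to store follows */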
1974 *cs++ = value; 1975 1976 return cs; 1977 } 1978 1979 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1980 { 1981 *cs++ = MI_SEMAPHORE_WAIT | 1982 MI_SEMAPHORE_GLOBAL_GTT | 1983 MI_SEMAPHORE_POLL | 1984 mode; 1985 *cs++ = value; 1986 *cs++ = offset; 1987 *cs++ = 0; 1988 1989 return cs; 1990 } 1991 1992 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1993 { 1994 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1995 } 1996 1997 static void semaphore_set(u32 *sema, u32 value) 1998 { 1999 WRITE_ONCE(*sema, value); 2000 wmb(); /* flush the update to the cache, and beyond */ 2001 } 2002 2003 static u32 *hwsp_scratch(const struct intel_context *ce) 2004 { 2005 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 2006 } 2007 2008 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 2009 { 2010 return (i915_ggtt_offset(ce->engine->status_page.vma) + 2011 offset_in_page(dw)); 2012 } 2013 2014 static int measure_semaphore_response(struct intel_context *ce) 2015 { 2016 u32 *sema = hwsp_scratch(ce); 2017 const u32 offset = hwsp_offset(ce, sema); 2018 u32 elapsed[TF_COUNT], cycles; 2019 struct i915_request *rq; 2020 u32 *cs; 2021 int err; 2022 int i; 2023 2024 /* 2025 * Measure how many cycles it takes for the HW to detect the change 2026 * in a semaphore value. 2027 * 2028 * A: read CS_TIMESTAMP from CPU 2029 * poke semaphore 2030 * B: read CS_TIMESTAMP on GPU 2031 * 2032 * Semaphore latency: B - A 2033 */ 2034 2035 semaphore_set(sema, -1); 2036 2037 rq = i915_request_create(ce); 2038 if (IS_ERR(rq)) 2039 return PTR_ERR(rq); 2040 2041 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 2042 if (IS_ERR(cs)) { 2043 i915_request_add(rq); 2044 err = PTR_ERR(cs); 2045 goto err; 2046 } 2047 2048 cs = emit_store_dw(cs, offset, 0); 2049 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2050 cs = emit_semaphore_poll_until(cs, offset, i); 2051 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2052 cs = emit_store_dw(cs, offset, 0); 2053 } 2054 2055 intel_ring_advance(rq, cs); 2056 i915_request_add(rq); 2057 2058 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2059 err = -EIO; 2060 goto err; 2061 } 2062 2063 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2064 preempt_disable(); 2065 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2066 semaphore_set(sema, i); 2067 preempt_enable(); 2068 2069 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2070 err = -EIO; 2071 goto err; 2072 } 2073 2074 elapsed[i - 1] = sema[i] - cycles; 2075 } 2076 2077 cycles = trifilter(elapsed); 2078 pr_info("%s: semaphore response %d cycles, %lluns\n", 2079 ce->engine->name, cycles >> TF_BIAS, 2080 cycles_to_ns(ce->engine, cycles)); 2081 2082 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2083 2084 err: 2085 intel_gt_set_wedged(ce->engine->gt); 2086 return err; 2087 } 2088 2089 static int measure_idle_dispatch(struct intel_context *ce) 2090 { 2091 u32 *sema = hwsp_scratch(ce); 2092 const u32 offset = hwsp_offset(ce, sema); 2093 u32 elapsed[TF_COUNT], cycles; 2094 u32 *cs; 2095 int err; 2096 int i; 2097 2098 /* 2099 * Measure how long it takes for us to submit a request while the 2100 * engine is idle, but is resting in our context. 
2101 * 2102 * A: read CS_TIMESTAMP from CPU 2103 * submit request 2104 * B: read CS_TIMESTAMP on GPU 2105 * 2106 * Submission latency: B - A 2107 */ 2108 2109 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2110 struct i915_request *rq; 2111 2112 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2113 if (err) 2114 return err; 2115 2116 rq = i915_request_create(ce); 2117 if (IS_ERR(rq)) { 2118 err = PTR_ERR(rq); 2119 goto err; 2120 } 2121 2122 cs = intel_ring_begin(rq, 4); 2123 if (IS_ERR(cs)) { 2124 i915_request_add(rq); 2125 err = PTR_ERR(cs); 2126 goto err; 2127 } 2128 2129 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2130 2131 intel_ring_advance(rq, cs); 2132 2133 preempt_disable(); 2134 local_bh_disable(); 2135 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2136 i915_request_add(rq); 2137 local_bh_enable(); 2138 preempt_enable(); 2139 } 2140 2141 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2142 if (err) 2143 goto err; 2144 2145 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2146 elapsed[i] = sema[i] - elapsed[i]; 2147 2148 cycles = trifilter(elapsed); 2149 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2150 ce->engine->name, cycles >> TF_BIAS, 2151 cycles_to_ns(ce->engine, cycles)); 2152 2153 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2154 2155 err: 2156 intel_gt_set_wedged(ce->engine->gt); 2157 return err; 2158 } 2159 2160 static int measure_busy_dispatch(struct intel_context *ce) 2161 { 2162 u32 *sema = hwsp_scratch(ce); 2163 const u32 offset = hwsp_offset(ce, sema); 2164 u32 elapsed[TF_COUNT + 1], cycles; 2165 u32 *cs; 2166 int err; 2167 int i; 2168 2169 /* 2170 * Measure how long it takes for us to submit a request while the 2171 * engine is busy, polling on a semaphore in our context. With 2172 * direct submission, this will include the cost of a lite restore. 
2173 * 2174 * A: read CS_TIMESTAMP from CPU 2175 * submit request 2176 * B: read CS_TIMESTAMP on GPU 2177 * 2178 * Submission latency: B - A 2179 */ 2180 2181 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2182 struct i915_request *rq; 2183 2184 rq = i915_request_create(ce); 2185 if (IS_ERR(rq)) { 2186 err = PTR_ERR(rq); 2187 goto err; 2188 } 2189 2190 cs = intel_ring_begin(rq, 12); 2191 if (IS_ERR(cs)) { 2192 i915_request_add(rq); 2193 err = PTR_ERR(cs); 2194 goto err; 2195 } 2196 2197 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2198 cs = emit_semaphore_poll_until(cs, offset, i); 2199 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2200 2201 intel_ring_advance(rq, cs); 2202 2203 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2204 err = -EIO; 2205 goto err; 2206 } 2207 2208 preempt_disable(); 2209 local_bh_disable(); 2210 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2211 i915_request_add(rq); 2212 local_bh_enable(); 2213 semaphore_set(sema, i - 1); 2214 preempt_enable(); 2215 } 2216 2217 wait_for(READ_ONCE(sema[i - 1]), 500); 2218 semaphore_set(sema, i - 1); 2219 2220 for (i = 1; i <= TF_COUNT; i++) { 2221 GEM_BUG_ON(sema[i] == -1); 2222 elapsed[i - 1] = sema[i] - elapsed[i]; 2223 } 2224 2225 cycles = trifilter(elapsed); 2226 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2227 ce->engine->name, cycles >> TF_BIAS, 2228 cycles_to_ns(ce->engine, cycles)); 2229 2230 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2231 2232 err: 2233 intel_gt_set_wedged(ce->engine->gt); 2234 return err; 2235 } 2236 2237 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2238 { 2239 const u32 offset = 2240 i915_ggtt_offset(engine->status_page.vma) + 2241 offset_in_page(sema); 2242 struct i915_request *rq; 2243 u32 *cs; 2244 2245 rq = i915_request_create(engine->kernel_context); 2246 if (IS_ERR(rq)) 2247 return PTR_ERR(rq); 2248 2249 cs = intel_ring_begin(rq, 4); 2250 if (IS_ERR(cs)) { 2251 i915_request_add(rq); 2252 return PTR_ERR(cs); 2253 } 2254 2255 cs = emit_semaphore_poll(cs, mode, value, offset); 2256 2257 intel_ring_advance(rq, cs); 2258 i915_request_add(rq); 2259 2260 return 0; 2261 } 2262 2263 static int measure_inter_request(struct intel_context *ce) 2264 { 2265 u32 *sema = hwsp_scratch(ce); 2266 const u32 offset = hwsp_offset(ce, sema); 2267 u32 elapsed[TF_COUNT + 1], cycles; 2268 struct i915_sw_fence *submit; 2269 int i, err; 2270 2271 /* 2272 * Measure how long it takes to advance from one request into the 2273 * next. Between each request we flush the GPU caches to memory, 2274 * update the breadcrumbs, and then invalidate those caches. 2275 * We queue up all the requests to be submitted in one batch so 2276 * it should be one set of contiguous measurements. 

static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
{
	const u32 offset =
		i915_ggtt_offset(engine->status_page.vma) +
		offset_in_page(sema);
	struct i915_request *rq;
	u32 *cs;

	rq = i915_request_create(engine->kernel_context);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		return PTR_ERR(cs);
	}

	cs = emit_semaphore_poll(cs, mode, value, offset);

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	return 0;
}

static int measure_inter_request(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	struct i915_sw_fence *submit;
	int i, err;

	/*
	 * Measure how long it takes to advance from one request into the
	 * next. Between each request we flush the GPU caches to memory,
	 * update the breadcrumbs, and then invalidate those caches.
	 * We queue up all the requests to be submitted in one batch so
	 * it should be one set of contiguous measurements.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    advance request
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Request latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	submit = heap_fence_create(GFP_KERNEL);
	if (!submit) {
		semaphore_set(sema, 1);
		return -ENOMEM;
	}

	intel_engine_flush_submission(ce->engine);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;
		u32 *cs;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_submit;
		}

		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
						       submit,
						       GFP_KERNEL);
		if (err < 0) {
			i915_request_add(rq);
			goto err_submit;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err_submit;
		}

		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);
	}
	i915_sw_fence_commit(submit);
	intel_engine_flush_submission(ce->engine);
	heap_fence_put(submit);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[i + 1] - sema[i];

	cycles = trifilter(elapsed);
	pr_info("%s: inter-request latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_submit:
	i915_sw_fence_commit(submit);
	heap_fence_put(submit);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
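
/*
 * For the inter-request measurement above: plug() keeps the engine busy on
 * a kernel-context semaphore poll so that every timestamp-only request can
 * be queued behind the shared submit fence before any of them run. Once
 * the plug is released by semaphore_set(sema, 1), request i writes the
 * GPU's CS_TIMESTAMP into sema[i], so consecutive samples differ by exactly
 * one request-to-request transition: elapsed[i - 1] = sema[i + 1] - sema[i].
 */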

static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    switch context
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
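
/*
 * In the context-switch measurement above, iteration i emits a chained pair
 * of requests, one on ce and one on the kernel context, writing their GPU
 * timestamps into sema[2i] and sema[2i + 1] respectively. Taking
 * sema[2i + 2] - sema[2i + 1] therefore spans the hop from the kernel
 * context request of pair i into the ce request of pair i + 1, i.e. one
 * full context switch.
 */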

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    submit preemption
	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    context switch
	 *    C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
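
/*
 * Pairing for the preemption figures above: elapsed[i - 1] starts as the
 * CPU's RING_TIMESTAMP sampled just before the I915_PRIORITY_BARRIER
 * request is submitted (A); that request stores its GPU timestamp into
 * sema[2i] (B) and then releases the spinner by writing i to the semaphore,
 * letting the original ce request stamp sema[2i + 1] (C). Dispatch latency
 * is B - A and switch latency is C - B, matching the two pr_info() lines.
 */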

struct signal_cb {
	struct dma_fence_cb base;
	bool seen;
};

static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct signal_cb *s = container_of(cb, typeof(*s), base);

	smp_store_mb(s->seen, true); /* be safe, be strong */
}

static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) to be
	 * sent from the GPU and processed by the CPU.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    signal
	 *    B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}
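
/*
 * perf_request_latency() below drives all of the above measurements once
 * per uabi engine. To keep the CS timestamp deltas comparable it disables
 * the engine heartbeat (so no background kernel-context pulses are
 * injected), pins the GPU frequency to maximum via rps_pin(), and holds a
 * cpu_latency_qos request so the CPU stays out of deep C-states while the
 * CPU-side timestamps are sampled.
 */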

static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}
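
/*
 * The three series payloads above differ only in how much they serialise:
 * s_sync0 waits for each request to complete before moving to the next
 * engine, s_sync1 overlaps by one (it waits on the previous request while
 * the new one is in flight), and s_many never waits, simply submitting to
 * each engine in turn until the timeout expires.
 */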

static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}
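
/*
 * Busyness is reported above with two decimal places without using
 * floating point: busy is pre-scaled to 100 * busy_ns, the integer
 * percentage is busy / dt, and the remainder is rescaled by 100 for the
 * fractional digits. For example (hypothetical numbers), busy_ns = 123456
 * and dt = 1000000 gives integer = 12 and decimal = 34, printed as
 * "busy:12.34%". The same arithmetic is reused by perf_parallel_engines().
 */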

struct p_thread {
	struct perf_stats p;
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	int result;
};

static void p_sync0(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

static void p_sync1(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

static void p_many(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}
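
/*
 * perf_parallel_engines() below runs the same three payloads, but with one
 * kthread worker per uabi engine so that every engine is driven
 * concurrently. Each worker records its request count, busy time and
 * context runtime in struct p_thread, and the parent summarises them per
 * engine once the workers have been flushed.
 */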

static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static void (* const func[])(struct kthread_work *) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	void (* const *fn)(struct kthread_work *);
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	struct p_thread *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct kthread_worker *worker;

			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));

			worker = kthread_run_worker(0, "igt:%s",
						    engine->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				intel_engine_pm_put(engine);
				break;
			}
			engines[idx].worker = worker;
			engines[idx].result = 0;
			engines[idx].p.engine = engine;
			engines[idx].engine = engine;

			kthread_init_work(&engines[idx].work, *fn);
			kthread_queue_work(worker, &engines[idx].work);
			idx++;
		}

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (!engines[idx].worker)
				break;

			kthread_flush_work(&engines[idx].work);
			status = READ_ONCE(engines[idx].result);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);

			kthread_destroy_worker(engines[idx].worker);
			idx++;
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_subtests(tests, i915);
}