1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/pm_qos.h> 26 #include <linux/prime_numbers.h> 27 #include <linux/sort.h> 28 29 #include <drm/drm_print.h> 30 31 #include "gem/i915_gem_internal.h" 32 #include "gem/i915_gem_pm.h" 33 #include "gem/selftests/mock_context.h" 34 #include "gt/intel_engine_heartbeat.h" 35 #include "gt/intel_engine_pm.h" 36 #include "gt/intel_engine_user.h" 37 #include "gt/intel_gt.h" 38 #include "gt/intel_gt_clock_utils.h" 39 #include "gt/intel_gt_requests.h" 40 #include "gt/selftest_engine_heartbeat.h" 41 42 #include "i915_random.h" 43 #include "i915_selftest.h" 44 #include "i915_wait_util.h" 45 #include "igt_flush_test.h" 46 #include "igt_live_test.h" 47 #include "igt_spinner.h" 48 #include "lib_sw_fence.h" 49 #include "mock_drm.h" 50 #include "mock_gem_device.h" 51 52 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 53 { 54 struct intel_engine_cs *engine; 55 unsigned int count; 56 57 count = 0; 58 for_each_uabi_engine(engine, i915) 59 count++; 60 61 return count; 62 } 63 64 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 65 { 66 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 67 } 68 69 static int igt_add_request(void *arg) 70 { 71 struct drm_i915_private *i915 = arg; 72 struct i915_request *request; 73 74 /* Basic preliminary test to create a request and let it loose! 
*/ 75 76 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 77 if (IS_ERR(request)) 78 return PTR_ERR(request); 79 80 i915_request_add(request); 81 82 return 0; 83 } 84 85 static int igt_wait_request(void *arg) 86 { 87 const long T = HZ / 4; 88 struct drm_i915_private *i915 = arg; 89 struct i915_request *request; 90 int err = -EINVAL; 91 92 /* Submit a request, then wait upon it */ 93 94 request = mock_request(rcs0(i915)->kernel_context, T); 95 if (IS_ERR(request)) 96 return PTR_ERR(request); 97 98 i915_request_get(request); 99 100 if (i915_request_wait(request, 0, 0) != -ETIME) { 101 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 102 goto out_request; 103 } 104 105 if (i915_request_wait(request, 0, T) != -ETIME) { 106 pr_err("request wait succeeded (expected timeout before submit!)\n"); 107 goto out_request; 108 } 109 110 if (i915_request_completed(request)) { 111 pr_err("request completed before submit!!\n"); 112 goto out_request; 113 } 114 115 i915_request_add(request); 116 117 if (i915_request_wait(request, 0, 0) != -ETIME) { 118 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 119 goto out_request; 120 } 121 122 if (i915_request_completed(request)) { 123 pr_err("request completed immediately!\n"); 124 goto out_request; 125 } 126 127 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 128 pr_err("request wait succeeded (expected timeout!)\n"); 129 goto out_request; 130 } 131 132 if (i915_request_wait(request, 0, T) == -ETIME) { 133 pr_err("request wait timed out!\n"); 134 goto out_request; 135 } 136 137 if (!i915_request_completed(request)) { 138 pr_err("request not complete after waiting!\n"); 139 goto out_request; 140 } 141 142 if (i915_request_wait(request, 0, T) == -ETIME) { 143 pr_err("request wait timed out when already complete!\n"); 144 goto out_request; 145 } 146 147 err = 0; 148 out_request: 149 i915_request_put(request); 150 mock_device_flush(i915); 151 return err; 152 } 153 154 static int igt_fence_wait(void *arg) 155 { 156 const long T = HZ / 4; 157 struct drm_i915_private *i915 = arg; 158 struct i915_request *request; 159 int err = -EINVAL; 160 161 /* Submit a request, treat it as a fence and wait upon it */ 162 163 request = mock_request(rcs0(i915)->kernel_context, T); 164 if (IS_ERR(request)) 165 return PTR_ERR(request); 166 167 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 168 pr_err("fence wait success before submit (expected timeout)!\n"); 169 goto out; 170 } 171 172 i915_request_add(request); 173 174 if (dma_fence_is_signaled(&request->fence)) { 175 pr_err("fence signaled immediately!\n"); 176 goto out; 177 } 178 179 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 180 pr_err("fence wait success after submit (expected timeout)!\n"); 181 goto out; 182 } 183 184 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 185 pr_err("fence wait timed out (expected success)!\n"); 186 goto out; 187 } 188 189 if (!dma_fence_is_signaled(&request->fence)) { 190 pr_err("fence unsignaled after waiting!\n"); 191 goto out; 192 } 193 194 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 195 pr_err("fence wait timed out when complete (expected success)!\n"); 196 goto out; 197 } 198 199 err = 0; 200 out: 201 mock_device_flush(i915); 202 return err; 203 } 204 205 static int igt_request_rewind(void *arg) 206 { 207 struct drm_i915_private *i915 = arg; 208 struct i915_request *request, *vip; 209 struct i915_gem_context *ctx[2]; 210 struct 
intel_context *ce; 211 int err = -EINVAL; 212 213 ctx[0] = mock_context(i915, "A"); 214 if (!ctx[0]) { 215 err = -ENOMEM; 216 goto err_ctx_0; 217 } 218 219 ce = i915_gem_context_get_engine(ctx[0], RCS0); 220 GEM_BUG_ON(IS_ERR(ce)); 221 request = mock_request(ce, 2 * HZ); 222 intel_context_put(ce); 223 if (IS_ERR(request)) { 224 err = PTR_ERR(request); 225 goto err_context_0; 226 } 227 228 i915_request_get(request); 229 i915_request_add(request); 230 231 ctx[1] = mock_context(i915, "B"); 232 if (!ctx[1]) { 233 err = -ENOMEM; 234 goto err_ctx_1; 235 } 236 237 ce = i915_gem_context_get_engine(ctx[1], RCS0); 238 GEM_BUG_ON(IS_ERR(ce)); 239 vip = mock_request(ce, 0); 240 intel_context_put(ce); 241 if (IS_ERR(vip)) { 242 err = PTR_ERR(vip); 243 goto err_context_1; 244 } 245 246 /* Simulate preemption by manual reordering */ 247 if (!mock_cancel_request(request)) { 248 pr_err("failed to cancel request (already executed)!\n"); 249 i915_request_add(vip); 250 goto err_context_1; 251 } 252 i915_request_get(vip); 253 i915_request_add(vip); 254 rcu_read_lock(); 255 request->engine->submit_request(request); 256 rcu_read_unlock(); 257 258 259 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 260 pr_err("timed out waiting for high priority request\n"); 261 goto err; 262 } 263 264 if (i915_request_completed(request)) { 265 pr_err("low priority request already completed\n"); 266 goto err; 267 } 268 269 err = 0; 270 err: 271 i915_request_put(vip); 272 err_context_1: 273 mock_context_close(ctx[1]); 274 err_ctx_1: 275 i915_request_put(request); 276 err_context_0: 277 mock_context_close(ctx[0]); 278 err_ctx_0: 279 mock_device_flush(i915); 280 return err; 281 } 282 283 struct smoketest { 284 struct intel_engine_cs *engine; 285 struct i915_gem_context **contexts; 286 atomic_long_t num_waits, num_fences; 287 int ncontexts, max_batch; 288 struct i915_request *(*request_alloc)(struct intel_context *ce); 289 }; 290 291 static struct i915_request * 292 __mock_request_alloc(struct intel_context *ce) 293 { 294 return mock_request(ce, 0); 295 } 296 297 static struct i915_request * 298 __live_request_alloc(struct intel_context *ce) 299 { 300 return intel_context_create_request(ce); 301 } 302 303 struct smoke_thread { 304 struct kthread_worker *worker; 305 struct kthread_work work; 306 struct smoketest *t; 307 bool stop; 308 int result; 309 }; 310 311 static void __igt_breadcrumbs_smoketest(struct kthread_work *work) 312 { 313 struct smoke_thread *thread = container_of(work, typeof(*thread), work); 314 struct smoketest *t = thread->t; 315 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 316 const unsigned int total = 4 * t->ncontexts + 1; 317 unsigned int num_waits = 0, num_fences = 0; 318 struct i915_request **requests; 319 I915_RND_STATE(prng); 320 unsigned int *order; 321 int err = 0; 322 323 /* 324 * A very simple test to catch the most egregious of list handling bugs. 325 * 326 * At its heart, we simply create oodles of requests running across 327 * multiple kthreads and enable signaling on them, for the sole purpose 328 * of stressing our breadcrumb handling. The only inspection we do is 329 * that the fences were marked as signaled. 
330 */ 331 332 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 333 if (!requests) { 334 thread->result = -ENOMEM; 335 return; 336 } 337 338 order = i915_random_order(total, &prng); 339 if (!order) { 340 err = -ENOMEM; 341 goto out_requests; 342 } 343 344 while (!READ_ONCE(thread->stop)) { 345 struct i915_sw_fence *submit, *wait; 346 unsigned int n, count; 347 348 submit = heap_fence_create(GFP_KERNEL); 349 if (!submit) { 350 err = -ENOMEM; 351 break; 352 } 353 354 wait = heap_fence_create(GFP_KERNEL); 355 if (!wait) { 356 i915_sw_fence_commit(submit); 357 heap_fence_put(submit); 358 err = -ENOMEM; 359 break; 360 } 361 362 i915_random_reorder(order, total, &prng); 363 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 364 365 for (n = 0; n < count; n++) { 366 struct i915_gem_context *ctx = 367 t->contexts[order[n] % t->ncontexts]; 368 struct i915_request *rq; 369 struct intel_context *ce; 370 371 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 372 GEM_BUG_ON(IS_ERR(ce)); 373 rq = t->request_alloc(ce); 374 intel_context_put(ce); 375 if (IS_ERR(rq)) { 376 err = PTR_ERR(rq); 377 count = n; 378 break; 379 } 380 381 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 382 submit, 383 GFP_KERNEL); 384 385 requests[n] = i915_request_get(rq); 386 i915_request_add(rq); 387 388 if (err >= 0) 389 err = i915_sw_fence_await_dma_fence(wait, 390 &rq->fence, 391 0, 392 GFP_KERNEL); 393 394 if (err < 0) { 395 i915_request_put(rq); 396 count = n; 397 break; 398 } 399 } 400 401 i915_sw_fence_commit(submit); 402 i915_sw_fence_commit(wait); 403 404 if (!wait_event_timeout(wait->wait, 405 i915_sw_fence_done(wait), 406 5 * HZ)) { 407 struct i915_request *rq = requests[count - 1]; 408 409 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 410 atomic_read(&wait->pending), count, 411 rq->fence.context, rq->fence.seqno, 412 t->engine->name); 413 GEM_TRACE_DUMP(); 414 415 intel_gt_set_wedged(t->engine->gt); 416 GEM_BUG_ON(!i915_request_completed(rq)); 417 i915_sw_fence_wait(wait); 418 err = -EIO; 419 } 420 421 for (n = 0; n < count; n++) { 422 struct i915_request *rq = requests[n]; 423 424 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 425 &rq->fence.flags)) { 426 pr_err("%llu:%llu was not signaled!\n", 427 rq->fence.context, rq->fence.seqno); 428 err = -EINVAL; 429 } 430 431 i915_request_put(rq); 432 } 433 434 heap_fence_put(wait); 435 heap_fence_put(submit); 436 437 if (err < 0) 438 break; 439 440 num_fences += count; 441 num_waits++; 442 443 cond_resched(); 444 } 445 446 atomic_long_add(num_fences, &t->num_fences); 447 atomic_long_add(num_waits, &t->num_waits); 448 449 kfree(order); 450 out_requests: 451 kfree(requests); 452 thread->result = err; 453 } 454 455 static int mock_breadcrumbs_smoketest(void *arg) 456 { 457 struct drm_i915_private *i915 = arg; 458 struct smoketest t = { 459 .engine = rcs0(i915), 460 .ncontexts = 1024, 461 .max_batch = 1024, 462 .request_alloc = __mock_request_alloc 463 }; 464 unsigned int ncpus = num_online_cpus(); 465 struct smoke_thread *threads; 466 unsigned int n; 467 int ret = 0; 468 469 /* 470 * Smoketest our breadcrumb/signal handling for requests across multiple 471 * threads. A very simple test to only catch the most egregious of bugs. 
472 * See __igt_breadcrumbs_smoketest(); 473 */ 474 475 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 476 if (!threads) 477 return -ENOMEM; 478 479 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 480 if (!t.contexts) { 481 ret = -ENOMEM; 482 goto out_threads; 483 } 484 485 for (n = 0; n < t.ncontexts; n++) { 486 t.contexts[n] = mock_context(t.engine->i915, "mock"); 487 if (!t.contexts[n]) { 488 ret = -ENOMEM; 489 goto out_contexts; 490 } 491 } 492 493 for (n = 0; n < ncpus; n++) { 494 struct kthread_worker *worker; 495 496 worker = kthread_run_worker(0, "igt/%d", n); 497 if (IS_ERR(worker)) { 498 ret = PTR_ERR(worker); 499 ncpus = n; 500 break; 501 } 502 503 threads[n].worker = worker; 504 threads[n].t = &t; 505 threads[n].stop = false; 506 threads[n].result = 0; 507 508 kthread_init_work(&threads[n].work, 509 __igt_breadcrumbs_smoketest); 510 kthread_queue_work(worker, &threads[n].work); 511 } 512 513 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 514 515 for (n = 0; n < ncpus; n++) { 516 int err; 517 518 WRITE_ONCE(threads[n].stop, true); 519 kthread_flush_work(&threads[n].work); 520 err = READ_ONCE(threads[n].result); 521 if (err < 0 && !ret) 522 ret = err; 523 524 kthread_destroy_worker(threads[n].worker); 525 } 526 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 527 atomic_long_read(&t.num_waits), 528 atomic_long_read(&t.num_fences), 529 ncpus); 530 531 out_contexts: 532 for (n = 0; n < t.ncontexts; n++) { 533 if (!t.contexts[n]) 534 break; 535 mock_context_close(t.contexts[n]); 536 } 537 kfree(t.contexts); 538 out_threads: 539 kfree(threads); 540 return ret; 541 } 542 543 int i915_request_mock_selftests(void) 544 { 545 static const struct i915_subtest tests[] = { 546 SUBTEST(igt_add_request), 547 SUBTEST(igt_wait_request), 548 SUBTEST(igt_fence_wait), 549 SUBTEST(igt_request_rewind), 550 SUBTEST(mock_breadcrumbs_smoketest), 551 }; 552 struct drm_i915_private *i915; 553 intel_wakeref_t wakeref; 554 int err = 0; 555 556 i915 = mock_gem_device(); 557 if (!i915) 558 return -ENOMEM; 559 560 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 561 err = i915_subtests(tests, i915); 562 563 mock_destroy_device(i915); 564 565 return err; 566 } 567 568 static int live_nop_request(void *arg) 569 { 570 struct drm_i915_private *i915 = arg; 571 struct intel_engine_cs *engine; 572 struct igt_live_test t; 573 int err = -ENODEV; 574 575 /* 576 * Submit various sized batches of empty requests, to each engine 577 * (individually), and wait for the batch to complete. We can check 578 * the overhead of submitting requests to the hardware. 579 */ 580 581 for_each_uabi_engine(engine, i915) { 582 unsigned long n, prime; 583 IGT_TIMEOUT(end_time); 584 ktime_t times[2] = {}; 585 586 err = igt_live_test_begin(&t, i915, __func__, engine->name); 587 if (err) 588 return err; 589 590 intel_engine_pm_get(engine); 591 for_each_prime_number_from(prime, 1, 8192) { 592 struct i915_request *request = NULL; 593 594 times[1] = ktime_get_raw(); 595 596 for (n = 0; n < prime; n++) { 597 i915_request_put(request); 598 request = i915_request_create(engine->kernel_context); 599 if (IS_ERR(request)) 600 return PTR_ERR(request); 601 602 /* 603 * This space is left intentionally blank. 604 * 605 * We do not actually want to perform any 606 * action with this request, we just want 607 * to measure the latency in allocation 608 * and submission of our breadcrumbs - 609 * ensuring that the bare request is sufficient 610 * for the system to work (i.e. 
proper HEAD 611 * tracking of the rings, interrupt handling, 612 * etc). It also gives us the lowest bounds 613 * for latency. 614 */ 615 616 i915_request_get(request); 617 i915_request_add(request); 618 } 619 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 620 i915_request_put(request); 621 622 times[1] = ktime_sub(ktime_get_raw(), times[1]); 623 if (prime == 1) 624 times[0] = times[1]; 625 626 if (__igt_timeout(end_time, NULL)) 627 break; 628 } 629 intel_engine_pm_put(engine); 630 631 err = igt_live_test_end(&t); 632 if (err) 633 return err; 634 635 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 636 engine->name, 637 ktime_to_ns(times[0]), 638 prime, div64_u64(ktime_to_ns(times[1]), prime)); 639 } 640 641 return err; 642 } 643 644 static int __cancel_inactive(struct intel_engine_cs *engine) 645 { 646 struct intel_context *ce; 647 struct igt_spinner spin; 648 struct i915_request *rq; 649 int err = 0; 650 651 if (igt_spinner_init(&spin, engine->gt)) 652 return -ENOMEM; 653 654 ce = intel_context_create(engine); 655 if (IS_ERR(ce)) { 656 err = PTR_ERR(ce); 657 goto out_spin; 658 } 659 660 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 661 if (IS_ERR(rq)) { 662 err = PTR_ERR(rq); 663 goto out_ce; 664 } 665 666 pr_debug("%s: Cancelling inactive request\n", engine->name); 667 i915_request_cancel(rq, -EINTR); 668 i915_request_get(rq); 669 i915_request_add(rq); 670 671 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 672 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 673 674 pr_err("%s: Failed to cancel inactive request\n", engine->name); 675 intel_engine_dump(engine, &p, "%s\n", engine->name); 676 err = -ETIME; 677 goto out_rq; 678 } 679 680 if (rq->fence.error != -EINTR) { 681 pr_err("%s: fence not cancelled (%u)\n", 682 engine->name, rq->fence.error); 683 err = -EINVAL; 684 } 685 686 out_rq: 687 i915_request_put(rq); 688 out_ce: 689 intel_context_put(ce); 690 out_spin: 691 igt_spinner_fini(&spin); 692 if (err) 693 pr_err("%s: %s error %d\n", __func__, engine->name, err); 694 return err; 695 } 696 697 static int __cancel_active(struct intel_engine_cs *engine) 698 { 699 struct intel_context *ce; 700 struct igt_spinner spin; 701 struct i915_request *rq; 702 int err = 0; 703 704 if (igt_spinner_init(&spin, engine->gt)) 705 return -ENOMEM; 706 707 ce = intel_context_create(engine); 708 if (IS_ERR(ce)) { 709 err = PTR_ERR(ce); 710 goto out_spin; 711 } 712 713 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 714 if (IS_ERR(rq)) { 715 err = PTR_ERR(rq); 716 goto out_ce; 717 } 718 719 pr_debug("%s: Cancelling active request\n", engine->name); 720 i915_request_get(rq); 721 i915_request_add(rq); 722 if (!igt_wait_for_spinner(&spin, rq)) { 723 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 724 725 pr_err("Failed to start spinner on %s\n", engine->name); 726 intel_engine_dump(engine, &p, "%s\n", engine->name); 727 err = -ETIME; 728 goto out_rq; 729 } 730 i915_request_cancel(rq, -EINTR); 731 732 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 733 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 734 735 pr_err("%s: Failed to cancel active request\n", engine->name); 736 intel_engine_dump(engine, &p, "%s\n", engine->name); 737 err = -ETIME; 738 goto out_rq; 739 } 740 741 if (rq->fence.error != -EINTR) { 742 pr_err("%s: fence not cancelled (%u)\n", 743 engine->name, rq->fence.error); 744 err = -EINVAL; 745 } 746 747 out_rq: 748 i915_request_put(rq); 749 out_ce: 750 intel_context_put(ce); 751 out_spin: 752 igt_spinner_fini(&spin); 
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int __cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	igt_spinner_end(&spin);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	if (rq->fence.error) {
		pr_err("%s: completed request unexpectedly errored (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for the spinner to start, create a NOP request and submit it, cancel
 * the spinner, wait for the spinner to complete and verify it failed with an
 * error, and finally wait for the NOP request to complete and verify it
 * succeeded without an error. The preemption timeout is also reduced and then
 * restored so that the test runs in a timely manner.
 */
static int __cancel_reset(struct drm_i915_private *i915,
			  struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq, *nop;
	unsigned long preempt_timeout_ms;
	int err = 0;

	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
	    !intel_has_reset_engine(engine->gt))
		return 0;

	preempt_timeout_ms = engine->props.preempt_timeout_ms;
	engine->props.preempt_timeout_ms = 100;

	err = igt_spinner_init(&spin, engine->gt);
	if (err)
		goto out_restore;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	pr_debug("%s: Cancelling active non-preemptable request\n",
		 engine->name);
	i915_request_get(rq);
	i915_request_add(rq);
	if (!igt_wait_for_spinner(&spin, rq)) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("Failed to start spinner on %s\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	nop = intel_context_create_request(ce);
	if (IS_ERR(nop)) {
		err = PTR_ERR(nop);
		goto out_rq;
	}
	i915_request_get(nop);
	i915_request_add(nop);

	i915_request_cancel(rq, -EINTR);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel hung request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
		goto out_nop;
	}

	if (i915_request_wait(nop, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to complete nop request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (nop->fence.error != 0) {
		pr_err("%s: Nop request errored (%u)\n",
		       engine->name, nop->fence.error);
		err = -EINVAL;
	}

out_nop:
	i915_request_put(nop);
out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
out_restore:
	engine->props.preempt_timeout_ms = preempt_timeout_ms;
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int live_cancel_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;

	/*
	 * Check cancellation of requests. We expect to be able to immediately
	 * cancel active requests, even if they are currently on the GPU.
924 */ 925 926 for_each_uabi_engine(engine, i915) { 927 struct igt_live_test t; 928 int err, err2; 929 930 if (!intel_engine_has_preemption(engine)) 931 continue; 932 933 err = igt_live_test_begin(&t, i915, __func__, engine->name); 934 if (err) 935 return err; 936 937 err = __cancel_inactive(engine); 938 if (err == 0) 939 err = __cancel_active(engine); 940 if (err == 0) 941 err = __cancel_completed(engine); 942 943 err2 = igt_live_test_end(&t); 944 if (err) 945 return err; 946 if (err2) 947 return err2; 948 949 /* Expects reset so call outside of igt_live_test_* */ 950 err = __cancel_reset(i915, engine); 951 if (err) 952 return err; 953 954 if (igt_flush_test(i915)) 955 return -EIO; 956 } 957 958 return 0; 959 } 960 961 static struct i915_vma *empty_batch(struct intel_gt *gt) 962 { 963 struct drm_i915_gem_object *obj; 964 struct i915_vma *vma; 965 u32 *cmd; 966 int err; 967 968 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 969 if (IS_ERR(obj)) 970 return ERR_CAST(obj); 971 972 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 973 if (IS_ERR(cmd)) { 974 err = PTR_ERR(cmd); 975 goto err; 976 } 977 978 *cmd = MI_BATCH_BUFFER_END; 979 980 __i915_gem_object_flush_map(obj, 0, 64); 981 i915_gem_object_unpin_map(obj); 982 983 intel_gt_chipset_flush(gt); 984 985 vma = i915_vma_instance(obj, gt->vm, NULL); 986 if (IS_ERR(vma)) { 987 err = PTR_ERR(vma); 988 goto err; 989 } 990 991 err = i915_vma_pin(vma, 0, 0, PIN_USER); 992 if (err) 993 goto err; 994 995 /* Force the wait now to avoid including it in the benchmark */ 996 err = i915_vma_sync(vma); 997 if (err) 998 goto err_pin; 999 1000 return vma; 1001 1002 err_pin: 1003 i915_vma_unpin(vma); 1004 err: 1005 i915_gem_object_put(obj); 1006 return ERR_PTR(err); 1007 } 1008 1009 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch) 1010 { 1011 return rq->engine->emit_bb_start(rq, 1012 i915_vma_offset(batch), 1013 i915_vma_size(batch), 1014 0); 1015 } 1016 1017 static struct i915_request * 1018 empty_request(struct intel_engine_cs *engine, 1019 struct i915_vma *batch) 1020 { 1021 struct i915_request *request; 1022 int err; 1023 1024 request = i915_request_create(engine->kernel_context); 1025 if (IS_ERR(request)) 1026 return request; 1027 1028 err = emit_bb_start(request, batch); 1029 if (err) 1030 goto out_request; 1031 1032 i915_request_get(request); 1033 out_request: 1034 i915_request_add(request); 1035 return err ? ERR_PTR(err) : request; 1036 } 1037 1038 static int live_empty_request(void *arg) 1039 { 1040 struct drm_i915_private *i915 = arg; 1041 struct intel_engine_cs *engine; 1042 struct igt_live_test t; 1043 int err; 1044 1045 /* 1046 * Submit various sized batches of empty requests, to each engine 1047 * (individually), and wait for the batch to complete. We can check 1048 * the overhead of submitting requests to the hardware. 
1049 */ 1050 1051 for_each_uabi_engine(engine, i915) { 1052 IGT_TIMEOUT(end_time); 1053 struct i915_request *request; 1054 struct i915_vma *batch; 1055 unsigned long n, prime; 1056 ktime_t times[2] = {}; 1057 1058 batch = empty_batch(engine->gt); 1059 if (IS_ERR(batch)) 1060 return PTR_ERR(batch); 1061 1062 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1063 if (err) 1064 goto out_batch; 1065 1066 intel_engine_pm_get(engine); 1067 1068 /* Warmup / preload */ 1069 request = empty_request(engine, batch); 1070 if (IS_ERR(request)) { 1071 err = PTR_ERR(request); 1072 intel_engine_pm_put(engine); 1073 goto out_batch; 1074 } 1075 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1076 1077 for_each_prime_number_from(prime, 1, 8192) { 1078 times[1] = ktime_get_raw(); 1079 1080 for (n = 0; n < prime; n++) { 1081 i915_request_put(request); 1082 request = empty_request(engine, batch); 1083 if (IS_ERR(request)) { 1084 err = PTR_ERR(request); 1085 intel_engine_pm_put(engine); 1086 goto out_batch; 1087 } 1088 } 1089 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1090 1091 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1092 if (prime == 1) 1093 times[0] = times[1]; 1094 1095 if (__igt_timeout(end_time, NULL)) 1096 break; 1097 } 1098 i915_request_put(request); 1099 intel_engine_pm_put(engine); 1100 1101 err = igt_live_test_end(&t); 1102 if (err) 1103 goto out_batch; 1104 1105 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1106 engine->name, 1107 ktime_to_ns(times[0]), 1108 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1109 out_batch: 1110 i915_vma_unpin(batch); 1111 i915_vma_put(batch); 1112 if (err) 1113 break; 1114 } 1115 1116 return err; 1117 } 1118 1119 static struct i915_vma *recursive_batch(struct intel_gt *gt) 1120 { 1121 struct drm_i915_gem_object *obj; 1122 const int ver = GRAPHICS_VER(gt->i915); 1123 struct i915_vma *vma; 1124 u32 *cmd; 1125 int err; 1126 1127 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 1128 if (IS_ERR(obj)) 1129 return ERR_CAST(obj); 1130 1131 vma = i915_vma_instance(obj, gt->vm, NULL); 1132 if (IS_ERR(vma)) { 1133 err = PTR_ERR(vma); 1134 goto err; 1135 } 1136 1137 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1138 if (err) 1139 goto err; 1140 1141 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1142 if (IS_ERR(cmd)) { 1143 err = PTR_ERR(cmd); 1144 goto err; 1145 } 1146 1147 if (ver >= 8) { 1148 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1149 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1150 *cmd++ = upper_32_bits(i915_vma_offset(vma)); 1151 } else if (ver >= 6) { 1152 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1153 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1154 } else { 1155 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1156 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1157 } 1158 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1159 1160 __i915_gem_object_flush_map(obj, 0, 64); 1161 i915_gem_object_unpin_map(obj); 1162 1163 intel_gt_chipset_flush(gt); 1164 1165 return vma; 1166 1167 err: 1168 i915_gem_object_put(obj); 1169 return ERR_PTR(err); 1170 } 1171 1172 static int recursive_batch_resolve(struct i915_vma *batch) 1173 { 1174 u32 *cmd; 1175 1176 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1177 if (IS_ERR(cmd)) 1178 return PTR_ERR(cmd); 1179 1180 *cmd = MI_BATCH_BUFFER_END; 1181 1182 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1183 i915_gem_object_unpin_map(batch->obj); 1184 1185 intel_gt_chipset_flush(batch->vm->gt); 1186 1187 return 0; 1188 } 1189 
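/*
 * Note on the helpers above: recursive_batch() emits a batch whose first
 * instruction is an MI_BATCH_BUFFER_START pointing back at itself, so a
 * request using it keeps its engine busy indefinitely once submitted.
 * recursive_batch_resolve() releases it by overwriting that first dword
 * with MI_BATCH_BUFFER_END.
 */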
1190 static int live_all_engines(void *arg) 1191 { 1192 struct drm_i915_private *i915 = arg; 1193 const unsigned int nengines = num_uabi_engines(i915); 1194 struct intel_engine_cs *engine; 1195 struct i915_request **request; 1196 struct igt_live_test t; 1197 unsigned int idx; 1198 int err; 1199 1200 /* 1201 * Check we can submit requests to all engines simultaneously. We 1202 * send a recursive batch to each engine - checking that we don't 1203 * block doing so, and that they don't complete too soon. 1204 */ 1205 1206 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1207 if (!request) 1208 return -ENOMEM; 1209 1210 err = igt_live_test_begin(&t, i915, __func__, ""); 1211 if (err) 1212 goto out_free; 1213 1214 idx = 0; 1215 for_each_uabi_engine(engine, i915) { 1216 struct i915_vma *batch; 1217 1218 batch = recursive_batch(engine->gt); 1219 if (IS_ERR(batch)) { 1220 err = PTR_ERR(batch); 1221 pr_err("%s: Unable to create batch, err=%d\n", 1222 __func__, err); 1223 goto out_free; 1224 } 1225 1226 i915_vma_lock(batch); 1227 request[idx] = intel_engine_create_kernel_request(engine); 1228 if (IS_ERR(request[idx])) { 1229 err = PTR_ERR(request[idx]); 1230 pr_err("%s: Request allocation failed with err=%d\n", 1231 __func__, err); 1232 goto out_unlock; 1233 } 1234 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1235 1236 err = i915_vma_move_to_active(batch, request[idx], 0); 1237 GEM_BUG_ON(err); 1238 1239 err = emit_bb_start(request[idx], batch); 1240 GEM_BUG_ON(err); 1241 request[idx]->batch = batch; 1242 1243 i915_request_get(request[idx]); 1244 i915_request_add(request[idx]); 1245 idx++; 1246 out_unlock: 1247 i915_vma_unlock(batch); 1248 if (err) 1249 goto out_request; 1250 } 1251 1252 idx = 0; 1253 for_each_uabi_engine(engine, i915) { 1254 if (i915_request_completed(request[idx])) { 1255 pr_err("%s(%s): request completed too early!\n", 1256 __func__, engine->name); 1257 err = -EINVAL; 1258 goto out_request; 1259 } 1260 idx++; 1261 } 1262 1263 idx = 0; 1264 for_each_uabi_engine(engine, i915) { 1265 err = recursive_batch_resolve(request[idx]->batch); 1266 if (err) { 1267 pr_err("%s: failed to resolve batch, err=%d\n", 1268 __func__, err); 1269 goto out_request; 1270 } 1271 idx++; 1272 } 1273 1274 idx = 0; 1275 for_each_uabi_engine(engine, i915) { 1276 struct i915_request *rq = request[idx]; 1277 long timeout; 1278 1279 timeout = i915_request_wait(rq, 0, 1280 MAX_SCHEDULE_TIMEOUT); 1281 if (timeout < 0) { 1282 err = timeout; 1283 pr_err("%s: error waiting for request on %s, err=%d\n", 1284 __func__, engine->name, err); 1285 goto out_request; 1286 } 1287 1288 GEM_BUG_ON(!i915_request_completed(rq)); 1289 i915_vma_unpin(rq->batch); 1290 i915_vma_put(rq->batch); 1291 i915_request_put(rq); 1292 request[idx] = NULL; 1293 idx++; 1294 } 1295 1296 err = igt_live_test_end(&t); 1297 1298 out_request: 1299 idx = 0; 1300 for_each_uabi_engine(engine, i915) { 1301 struct i915_request *rq = request[idx]; 1302 1303 if (!rq) 1304 continue; 1305 1306 if (rq->batch) { 1307 i915_vma_unpin(rq->batch); 1308 i915_vma_put(rq->batch); 1309 } 1310 i915_request_put(rq); 1311 idx++; 1312 } 1313 out_free: 1314 kfree(request); 1315 return err; 1316 } 1317 1318 static int live_sequential_engines(void *arg) 1319 { 1320 struct drm_i915_private *i915 = arg; 1321 const unsigned int nengines = num_uabi_engines(i915); 1322 struct i915_request **request; 1323 struct i915_request *prev = NULL; 1324 struct intel_engine_cs *engine; 1325 struct igt_live_test t; 1326 unsigned int idx; 1327 int err; 1328 1329 /* 1330 * Check 
we can submit requests to all engines sequentially, such 1331 * that each successive request waits for the earlier ones. This 1332 * tests that we don't execute requests out of order, even though 1333 * they are running on independent engines. 1334 */ 1335 1336 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1337 if (!request) 1338 return -ENOMEM; 1339 1340 err = igt_live_test_begin(&t, i915, __func__, ""); 1341 if (err) 1342 goto out_free; 1343 1344 idx = 0; 1345 for_each_uabi_engine(engine, i915) { 1346 struct i915_vma *batch; 1347 1348 batch = recursive_batch(engine->gt); 1349 if (IS_ERR(batch)) { 1350 err = PTR_ERR(batch); 1351 pr_err("%s: Unable to create batch for %s, err=%d\n", 1352 __func__, engine->name, err); 1353 goto out_free; 1354 } 1355 1356 i915_vma_lock(batch); 1357 request[idx] = intel_engine_create_kernel_request(engine); 1358 if (IS_ERR(request[idx])) { 1359 err = PTR_ERR(request[idx]); 1360 pr_err("%s: Request allocation failed for %s with err=%d\n", 1361 __func__, engine->name, err); 1362 goto out_unlock; 1363 } 1364 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1365 1366 if (prev) { 1367 err = i915_request_await_dma_fence(request[idx], 1368 &prev->fence); 1369 if (err) { 1370 i915_request_add(request[idx]); 1371 pr_err("%s: Request await failed for %s with err=%d\n", 1372 __func__, engine->name, err); 1373 goto out_unlock; 1374 } 1375 } 1376 1377 err = i915_vma_move_to_active(batch, request[idx], 0); 1378 GEM_BUG_ON(err); 1379 1380 err = emit_bb_start(request[idx], batch); 1381 GEM_BUG_ON(err); 1382 request[idx]->batch = batch; 1383 1384 i915_request_get(request[idx]); 1385 i915_request_add(request[idx]); 1386 1387 prev = request[idx]; 1388 idx++; 1389 1390 out_unlock: 1391 i915_vma_unlock(batch); 1392 if (err) 1393 goto out_request; 1394 } 1395 1396 idx = 0; 1397 for_each_uabi_engine(engine, i915) { 1398 long timeout; 1399 1400 if (i915_request_completed(request[idx])) { 1401 pr_err("%s(%s): request completed too early!\n", 1402 __func__, engine->name); 1403 err = -EINVAL; 1404 goto out_request; 1405 } 1406 1407 err = recursive_batch_resolve(request[idx]->batch); 1408 if (err) { 1409 pr_err("%s: failed to resolve batch, err=%d\n", 1410 __func__, err); 1411 goto out_request; 1412 } 1413 1414 timeout = i915_request_wait(request[idx], 0, 1415 MAX_SCHEDULE_TIMEOUT); 1416 if (timeout < 0) { 1417 err = timeout; 1418 pr_err("%s: error waiting for request on %s, err=%d\n", 1419 __func__, engine->name, err); 1420 goto out_request; 1421 } 1422 1423 GEM_BUG_ON(!i915_request_completed(request[idx])); 1424 idx++; 1425 } 1426 1427 err = igt_live_test_end(&t); 1428 1429 out_request: 1430 idx = 0; 1431 for_each_uabi_engine(engine, i915) { 1432 u32 *cmd; 1433 1434 if (!request[idx]) 1435 break; 1436 1437 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1438 I915_MAP_WC); 1439 if (!IS_ERR(cmd)) { 1440 *cmd = MI_BATCH_BUFFER_END; 1441 1442 __i915_gem_object_flush_map(request[idx]->batch->obj, 1443 0, sizeof(*cmd)); 1444 i915_gem_object_unpin_map(request[idx]->batch->obj); 1445 1446 intel_gt_chipset_flush(engine->gt); 1447 } 1448 1449 i915_vma_put(request[idx]->batch); 1450 i915_request_put(request[idx]); 1451 idx++; 1452 } 1453 out_free: 1454 kfree(request); 1455 return err; 1456 } 1457 1458 struct parallel_thread { 1459 struct kthread_worker *worker; 1460 struct kthread_work work; 1461 struct intel_engine_cs *engine; 1462 int result; 1463 }; 1464 1465 static void __live_parallel_engine1(struct kthread_work *work) 1466 { 1467 struct parallel_thread 
*thread = 1468 container_of(work, typeof(*thread), work); 1469 struct intel_engine_cs *engine = thread->engine; 1470 IGT_TIMEOUT(end_time); 1471 unsigned long count; 1472 int err = 0; 1473 1474 count = 0; 1475 intel_engine_pm_get(engine); 1476 do { 1477 struct i915_request *rq; 1478 1479 rq = i915_request_create(engine->kernel_context); 1480 if (IS_ERR(rq)) { 1481 err = PTR_ERR(rq); 1482 break; 1483 } 1484 1485 i915_request_get(rq); 1486 i915_request_add(rq); 1487 1488 err = 0; 1489 if (i915_request_wait(rq, 0, HZ) < 0) 1490 err = -ETIME; 1491 i915_request_put(rq); 1492 if (err) 1493 break; 1494 1495 count++; 1496 } while (!__igt_timeout(end_time, NULL)); 1497 intel_engine_pm_put(engine); 1498 1499 pr_info("%s: %lu request + sync\n", engine->name, count); 1500 thread->result = err; 1501 } 1502 1503 static void __live_parallel_engineN(struct kthread_work *work) 1504 { 1505 struct parallel_thread *thread = 1506 container_of(work, typeof(*thread), work); 1507 struct intel_engine_cs *engine = thread->engine; 1508 IGT_TIMEOUT(end_time); 1509 unsigned long count; 1510 int err = 0; 1511 1512 count = 0; 1513 intel_engine_pm_get(engine); 1514 do { 1515 struct i915_request *rq; 1516 1517 rq = i915_request_create(engine->kernel_context); 1518 if (IS_ERR(rq)) { 1519 err = PTR_ERR(rq); 1520 break; 1521 } 1522 1523 i915_request_add(rq); 1524 count++; 1525 } while (!__igt_timeout(end_time, NULL)); 1526 intel_engine_pm_put(engine); 1527 1528 pr_info("%s: %lu requests\n", engine->name, count); 1529 thread->result = err; 1530 } 1531 1532 static bool wake_all(struct drm_i915_private *i915) 1533 { 1534 if (atomic_dec_and_test(&i915->selftest.counter)) { 1535 wake_up_var(&i915->selftest.counter); 1536 return true; 1537 } 1538 1539 return false; 1540 } 1541 1542 static int wait_for_all(struct drm_i915_private *i915) 1543 { 1544 if (wake_all(i915)) 1545 return 0; 1546 1547 if (wait_var_event_timeout(&i915->selftest.counter, 1548 !atomic_read(&i915->selftest.counter), 1549 i915_selftest.timeout_jiffies)) 1550 return 0; 1551 1552 return -ETIME; 1553 } 1554 1555 static void __live_parallel_spin(struct kthread_work *work) 1556 { 1557 struct parallel_thread *thread = 1558 container_of(work, typeof(*thread), work); 1559 struct intel_engine_cs *engine = thread->engine; 1560 struct igt_spinner spin; 1561 struct i915_request *rq; 1562 int err = 0; 1563 1564 /* 1565 * Create a spinner running for eternity on each engine. If a second 1566 * spinner is incorrectly placed on the same engine, it will not be 1567 * able to start in time. 
1568 */ 1569 1570 if (igt_spinner_init(&spin, engine->gt)) { 1571 wake_all(engine->i915); 1572 thread->result = -ENOMEM; 1573 return; 1574 } 1575 1576 intel_engine_pm_get(engine); 1577 rq = igt_spinner_create_request(&spin, 1578 engine->kernel_context, 1579 MI_NOOP); /* no preemption */ 1580 intel_engine_pm_put(engine); 1581 if (IS_ERR(rq)) { 1582 err = PTR_ERR(rq); 1583 if (err == -ENODEV) 1584 err = 0; 1585 wake_all(engine->i915); 1586 goto out_spin; 1587 } 1588 1589 i915_request_get(rq); 1590 i915_request_add(rq); 1591 if (igt_wait_for_spinner(&spin, rq)) { 1592 /* Occupy this engine for the whole test */ 1593 err = wait_for_all(engine->i915); 1594 } else { 1595 pr_err("Failed to start spinner on %s\n", engine->name); 1596 err = -EINVAL; 1597 } 1598 igt_spinner_end(&spin); 1599 1600 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1601 err = -EIO; 1602 i915_request_put(rq); 1603 1604 out_spin: 1605 igt_spinner_fini(&spin); 1606 thread->result = err; 1607 } 1608 1609 static int live_parallel_engines(void *arg) 1610 { 1611 struct drm_i915_private *i915 = arg; 1612 static void (* const func[])(struct kthread_work *) = { 1613 __live_parallel_engine1, 1614 __live_parallel_engineN, 1615 __live_parallel_spin, 1616 NULL, 1617 }; 1618 const unsigned int nengines = num_uabi_engines(i915); 1619 struct parallel_thread *threads; 1620 struct intel_engine_cs *engine; 1621 void (* const *fn)(struct kthread_work *); 1622 int err = 0; 1623 1624 /* 1625 * Check we can submit requests to all engines concurrently. This 1626 * tests that we load up the system maximally. 1627 */ 1628 1629 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL); 1630 if (!threads) 1631 return -ENOMEM; 1632 1633 for (fn = func; !err && *fn; fn++) { 1634 char name[KSYM_NAME_LEN]; 1635 struct igt_live_test t; 1636 unsigned int idx; 1637 1638 snprintf(name, sizeof(name), "%ps", *fn); 1639 err = igt_live_test_begin(&t, i915, __func__, name); 1640 if (err) 1641 break; 1642 1643 atomic_set(&i915->selftest.counter, nengines); 1644 1645 idx = 0; 1646 for_each_uabi_engine(engine, i915) { 1647 struct kthread_worker *worker; 1648 1649 worker = kthread_run_worker(0, "igt/parallel:%s", 1650 engine->name); 1651 if (IS_ERR(worker)) { 1652 err = PTR_ERR(worker); 1653 break; 1654 } 1655 1656 threads[idx].worker = worker; 1657 threads[idx].result = 0; 1658 threads[idx].engine = engine; 1659 1660 kthread_init_work(&threads[idx].work, *fn); 1661 kthread_queue_work(worker, &threads[idx].work); 1662 idx++; 1663 } 1664 1665 idx = 0; 1666 for_each_uabi_engine(engine, i915) { 1667 int status; 1668 1669 if (!threads[idx].worker) 1670 break; 1671 1672 kthread_flush_work(&threads[idx].work); 1673 status = READ_ONCE(threads[idx].result); 1674 if (status && !err) 1675 err = status; 1676 1677 kthread_destroy_worker(threads[idx++].worker); 1678 } 1679 1680 if (igt_live_test_end(&t)) 1681 err = -EIO; 1682 } 1683 1684 kfree(threads); 1685 return err; 1686 } 1687 1688 static int 1689 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1690 { 1691 struct i915_request *rq; 1692 int ret; 1693 1694 /* 1695 * Before execlists, all contexts share the same ringbuffer. With 1696 * execlists, each context/engine has a separate ringbuffer and 1697 * for the purposes of this test, inexhaustible. 1698 * 1699 * For the global ringbuffer though, we have to be very careful 1700 * that we do not wrap while preventing the execution of requests 1701 * with a unsignaled fence. 
1702 */ 1703 if (HAS_EXECLISTS(ctx->i915)) 1704 return INT_MAX; 1705 1706 rq = igt_request_alloc(ctx, engine); 1707 if (IS_ERR(rq)) { 1708 ret = PTR_ERR(rq); 1709 } else { 1710 int sz; 1711 1712 ret = rq->ring->size - rq->reserved_space; 1713 i915_request_add(rq); 1714 1715 sz = rq->ring->emit - rq->head; 1716 if (sz < 0) 1717 sz += rq->ring->size; 1718 ret /= sz; 1719 ret /= 2; /* leave half spare, in case of emergency! */ 1720 } 1721 1722 return ret; 1723 } 1724 1725 static int live_breadcrumbs_smoketest(void *arg) 1726 { 1727 struct drm_i915_private *i915 = arg; 1728 const unsigned int nengines = num_uabi_engines(i915); 1729 const unsigned int ncpus = /* saturate with nengines * ncpus */ 1730 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines)); 1731 unsigned long num_waits, num_fences; 1732 struct intel_engine_cs *engine; 1733 struct smoke_thread *threads; 1734 struct igt_live_test live; 1735 intel_wakeref_t wakeref; 1736 struct smoketest *smoke; 1737 unsigned int n, idx; 1738 struct file *file; 1739 int ret = 0; 1740 1741 /* 1742 * Smoketest our breadcrumb/signal handling for requests across multiple 1743 * threads. A very simple test to only catch the most egregious of bugs. 1744 * See __igt_breadcrumbs_smoketest(); 1745 * 1746 * On real hardware this time. 1747 */ 1748 1749 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1750 1751 file = mock_file(i915); 1752 if (IS_ERR(file)) { 1753 ret = PTR_ERR(file); 1754 goto out_rpm; 1755 } 1756 1757 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1758 if (!smoke) { 1759 ret = -ENOMEM; 1760 goto out_file; 1761 } 1762 1763 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1764 if (!threads) { 1765 ret = -ENOMEM; 1766 goto out_smoke; 1767 } 1768 1769 smoke[0].request_alloc = __live_request_alloc; 1770 smoke[0].ncontexts = 64; 1771 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1772 sizeof(*smoke[0].contexts), 1773 GFP_KERNEL); 1774 if (!smoke[0].contexts) { 1775 ret = -ENOMEM; 1776 goto out_threads; 1777 } 1778 1779 for (n = 0; n < smoke[0].ncontexts; n++) { 1780 smoke[0].contexts[n] = live_context(i915, file); 1781 if (IS_ERR(smoke[0].contexts[n])) { 1782 ret = PTR_ERR(smoke[0].contexts[n]); 1783 goto out_contexts; 1784 } 1785 } 1786 1787 ret = igt_live_test_begin(&live, i915, __func__, ""); 1788 if (ret) 1789 goto out_contexts; 1790 1791 idx = 0; 1792 for_each_uabi_engine(engine, i915) { 1793 smoke[idx] = smoke[0]; 1794 smoke[idx].engine = engine; 1795 smoke[idx].max_batch = 1796 max_batches(smoke[0].contexts[0], engine); 1797 if (smoke[idx].max_batch < 0) { 1798 ret = smoke[idx].max_batch; 1799 goto out_flush; 1800 } 1801 /* One ring interleaved between requests from all cpus */ 1802 smoke[idx].max_batch /= ncpus + 1; 1803 pr_debug("Limiting batches to %d requests on %s\n", 1804 smoke[idx].max_batch, engine->name); 1805 1806 for (n = 0; n < ncpus; n++) { 1807 unsigned int i = idx * ncpus + n; 1808 struct kthread_worker *worker; 1809 1810 worker = kthread_run_worker(0, "igt/%d.%d", idx, n); 1811 if (IS_ERR(worker)) { 1812 ret = PTR_ERR(worker); 1813 goto out_flush; 1814 } 1815 1816 threads[i].worker = worker; 1817 threads[i].t = &smoke[idx]; 1818 1819 kthread_init_work(&threads[i].work, 1820 __igt_breadcrumbs_smoketest); 1821 kthread_queue_work(worker, &threads[i].work); 1822 } 1823 1824 idx++; 1825 } 1826 1827 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1828 1829 out_flush: 1830 idx = 0; 1831 num_waits = 0; 1832 num_fences = 0; 1833 for_each_uabi_engine(engine, i915) { 1834 for (n = 0; n < ncpus; 
n++) { 1835 unsigned int i = idx * ncpus + n; 1836 int err; 1837 1838 if (!threads[i].worker) 1839 continue; 1840 1841 WRITE_ONCE(threads[i].stop, true); 1842 kthread_flush_work(&threads[i].work); 1843 err = READ_ONCE(threads[i].result); 1844 if (err < 0 && !ret) 1845 ret = err; 1846 1847 kthread_destroy_worker(threads[i].worker); 1848 } 1849 1850 num_waits += atomic_long_read(&smoke[idx].num_waits); 1851 num_fences += atomic_long_read(&smoke[idx].num_fences); 1852 idx++; 1853 } 1854 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1855 num_waits, num_fences, idx, ncpus); 1856 1857 ret = igt_live_test_end(&live) ?: ret; 1858 out_contexts: 1859 kfree(smoke[0].contexts); 1860 out_threads: 1861 kfree(threads); 1862 out_smoke: 1863 kfree(smoke); 1864 out_file: 1865 fput(file); 1866 out_rpm: 1867 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1868 1869 return ret; 1870 } 1871 1872 int i915_request_live_selftests(struct drm_i915_private *i915) 1873 { 1874 static const struct i915_subtest tests[] = { 1875 SUBTEST(live_nop_request), 1876 SUBTEST(live_all_engines), 1877 SUBTEST(live_sequential_engines), 1878 SUBTEST(live_parallel_engines), 1879 SUBTEST(live_empty_request), 1880 SUBTEST(live_cancel_request), 1881 SUBTEST(live_breadcrumbs_smoketest), 1882 }; 1883 1884 if (intel_gt_is_wedged(to_gt(i915))) 1885 return 0; 1886 1887 return i915_live_subtests(tests, i915); 1888 } 1889 1890 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1891 { 1892 struct i915_request *rq; 1893 struct dma_fence *fence; 1894 1895 rq = intel_engine_create_kernel_request(ce->engine); 1896 if (IS_ERR(rq)) 1897 return PTR_ERR(rq); 1898 1899 fence = i915_active_fence_get(&ce->timeline->last_request); 1900 if (fence) { 1901 i915_request_await_dma_fence(rq, fence); 1902 dma_fence_put(fence); 1903 } 1904 1905 rq = i915_request_get(rq); 1906 i915_request_add(rq); 1907 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1908 err = -ETIME; 1909 i915_request_put(rq); 1910 1911 while (!err && !intel_engine_is_idle(ce->engine)) 1912 intel_engine_flush_submission(ce->engine); 1913 1914 return err; 1915 } 1916 1917 struct perf_stats { 1918 struct intel_engine_cs *engine; 1919 unsigned long count; 1920 ktime_t time; 1921 ktime_t busy; 1922 u64 runtime; 1923 }; 1924 1925 struct perf_series { 1926 struct drm_i915_private *i915; 1927 unsigned int nengines; 1928 struct intel_context *ce[] __counted_by(nengines); 1929 }; 1930 1931 static int cmp_u32(const void *A, const void *B) 1932 { 1933 const u32 *a = A, *b = B; 1934 1935 return *a - *b; 1936 } 1937 1938 static u32 trifilter(u32 *a) 1939 { 1940 u64 sum; 1941 1942 #define TF_COUNT 5 1943 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1944 1945 sum = mul_u32_u32(a[2], 2); 1946 sum += a[1]; 1947 sum += a[3]; 1948 1949 GEM_BUG_ON(sum > U32_MAX); 1950 return sum; 1951 #define TF_BIAS 2 1952 } 1953 1954 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1955 { 1956 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1957 1958 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1959 } 1960 1961 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1962 { 1963 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1964 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1965 *cs++ = offset; 1966 *cs++ = 0; 1967 1968 return cs; 1969 } 1970 1971 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1972 { 1973 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1974 *cs++ = offset; 1975 *cs++ = 0; 
1976 *cs++ = value; 1977 1978 return cs; 1979 } 1980 1981 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1982 { 1983 *cs++ = MI_SEMAPHORE_WAIT | 1984 MI_SEMAPHORE_GLOBAL_GTT | 1985 MI_SEMAPHORE_POLL | 1986 mode; 1987 *cs++ = value; 1988 *cs++ = offset; 1989 *cs++ = 0; 1990 1991 return cs; 1992 } 1993 1994 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1995 { 1996 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1997 } 1998 1999 static void semaphore_set(u32 *sema, u32 value) 2000 { 2001 WRITE_ONCE(*sema, value); 2002 wmb(); /* flush the update to the cache, and beyond */ 2003 } 2004 2005 static u32 *hwsp_scratch(const struct intel_context *ce) 2006 { 2007 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 2008 } 2009 2010 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 2011 { 2012 return (i915_ggtt_offset(ce->engine->status_page.vma) + 2013 offset_in_page(dw)); 2014 } 2015 2016 static int measure_semaphore_response(struct intel_context *ce) 2017 { 2018 u32 *sema = hwsp_scratch(ce); 2019 const u32 offset = hwsp_offset(ce, sema); 2020 u32 elapsed[TF_COUNT], cycles; 2021 struct i915_request *rq; 2022 u32 *cs; 2023 int err; 2024 int i; 2025 2026 /* 2027 * Measure how many cycles it takes for the HW to detect the change 2028 * in a semaphore value. 2029 * 2030 * A: read CS_TIMESTAMP from CPU 2031 * poke semaphore 2032 * B: read CS_TIMESTAMP on GPU 2033 * 2034 * Semaphore latency: B - A 2035 */ 2036 2037 semaphore_set(sema, -1); 2038 2039 rq = i915_request_create(ce); 2040 if (IS_ERR(rq)) 2041 return PTR_ERR(rq); 2042 2043 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 2044 if (IS_ERR(cs)) { 2045 i915_request_add(rq); 2046 err = PTR_ERR(cs); 2047 goto err; 2048 } 2049 2050 cs = emit_store_dw(cs, offset, 0); 2051 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2052 cs = emit_semaphore_poll_until(cs, offset, i); 2053 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2054 cs = emit_store_dw(cs, offset, 0); 2055 } 2056 2057 intel_ring_advance(rq, cs); 2058 i915_request_add(rq); 2059 2060 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2061 err = -EIO; 2062 goto err; 2063 } 2064 2065 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2066 preempt_disable(); 2067 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2068 semaphore_set(sema, i); 2069 preempt_enable(); 2070 2071 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2072 err = -EIO; 2073 goto err; 2074 } 2075 2076 elapsed[i - 1] = sema[i] - cycles; 2077 } 2078 2079 cycles = trifilter(elapsed); 2080 pr_info("%s: semaphore response %d cycles, %lluns\n", 2081 ce->engine->name, cycles >> TF_BIAS, 2082 cycles_to_ns(ce->engine, cycles)); 2083 2084 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2085 2086 err: 2087 intel_gt_set_wedged(ce->engine->gt); 2088 return err; 2089 } 2090 2091 static int measure_idle_dispatch(struct intel_context *ce) 2092 { 2093 u32 *sema = hwsp_scratch(ce); 2094 const u32 offset = hwsp_offset(ce, sema); 2095 u32 elapsed[TF_COUNT], cycles; 2096 u32 *cs; 2097 int err; 2098 int i; 2099 2100 /* 2101 * Measure how long it takes for us to submit a request while the 2102 * engine is idle, but is resting in our context. 
2103 * 2104 * A: read CS_TIMESTAMP from CPU 2105 * submit request 2106 * B: read CS_TIMESTAMP on GPU 2107 * 2108 * Submission latency: B - A 2109 */ 2110 2111 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2112 struct i915_request *rq; 2113 2114 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2115 if (err) 2116 return err; 2117 2118 rq = i915_request_create(ce); 2119 if (IS_ERR(rq)) { 2120 err = PTR_ERR(rq); 2121 goto err; 2122 } 2123 2124 cs = intel_ring_begin(rq, 4); 2125 if (IS_ERR(cs)) { 2126 i915_request_add(rq); 2127 err = PTR_ERR(cs); 2128 goto err; 2129 } 2130 2131 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2132 2133 intel_ring_advance(rq, cs); 2134 2135 preempt_disable(); 2136 local_bh_disable(); 2137 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2138 i915_request_add(rq); 2139 local_bh_enable(); 2140 preempt_enable(); 2141 } 2142 2143 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2144 if (err) 2145 goto err; 2146 2147 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2148 elapsed[i] = sema[i] - elapsed[i]; 2149 2150 cycles = trifilter(elapsed); 2151 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2152 ce->engine->name, cycles >> TF_BIAS, 2153 cycles_to_ns(ce->engine, cycles)); 2154 2155 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2156 2157 err: 2158 intel_gt_set_wedged(ce->engine->gt); 2159 return err; 2160 } 2161 2162 static int measure_busy_dispatch(struct intel_context *ce) 2163 { 2164 u32 *sema = hwsp_scratch(ce); 2165 const u32 offset = hwsp_offset(ce, sema); 2166 u32 elapsed[TF_COUNT + 1], cycles; 2167 u32 *cs; 2168 int err; 2169 int i; 2170 2171 /* 2172 * Measure how long it takes for us to submit a request while the 2173 * engine is busy, polling on a semaphore in our context. With 2174 * direct submission, this will include the cost of a lite restore. 
2175 * 2176 * A: read CS_TIMESTAMP from CPU 2177 * submit request 2178 * B: read CS_TIMESTAMP on GPU 2179 * 2180 * Submission latency: B - A 2181 */ 2182 2183 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2184 struct i915_request *rq; 2185 2186 rq = i915_request_create(ce); 2187 if (IS_ERR(rq)) { 2188 err = PTR_ERR(rq); 2189 goto err; 2190 } 2191 2192 cs = intel_ring_begin(rq, 12); 2193 if (IS_ERR(cs)) { 2194 i915_request_add(rq); 2195 err = PTR_ERR(cs); 2196 goto err; 2197 } 2198 2199 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2200 cs = emit_semaphore_poll_until(cs, offset, i); 2201 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2202 2203 intel_ring_advance(rq, cs); 2204 2205 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2206 err = -EIO; 2207 goto err; 2208 } 2209 2210 preempt_disable(); 2211 local_bh_disable(); 2212 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2213 i915_request_add(rq); 2214 local_bh_enable(); 2215 semaphore_set(sema, i - 1); 2216 preempt_enable(); 2217 } 2218 2219 wait_for(READ_ONCE(sema[i - 1]), 500); 2220 semaphore_set(sema, i - 1); 2221 2222 for (i = 1; i <= TF_COUNT; i++) { 2223 GEM_BUG_ON(sema[i] == -1); 2224 elapsed[i - 1] = sema[i] - elapsed[i]; 2225 } 2226 2227 cycles = trifilter(elapsed); 2228 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2229 ce->engine->name, cycles >> TF_BIAS, 2230 cycles_to_ns(ce->engine, cycles)); 2231 2232 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2233 2234 err: 2235 intel_gt_set_wedged(ce->engine->gt); 2236 return err; 2237 } 2238 2239 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2240 { 2241 const u32 offset = 2242 i915_ggtt_offset(engine->status_page.vma) + 2243 offset_in_page(sema); 2244 struct i915_request *rq; 2245 u32 *cs; 2246 2247 rq = i915_request_create(engine->kernel_context); 2248 if (IS_ERR(rq)) 2249 return PTR_ERR(rq); 2250 2251 cs = intel_ring_begin(rq, 4); 2252 if (IS_ERR(cs)) { 2253 i915_request_add(rq); 2254 return PTR_ERR(cs); 2255 } 2256 2257 cs = emit_semaphore_poll(cs, mode, value, offset); 2258 2259 intel_ring_advance(rq, cs); 2260 i915_request_add(rq); 2261 2262 return 0; 2263 } 2264 2265 static int measure_inter_request(struct intel_context *ce) 2266 { 2267 u32 *sema = hwsp_scratch(ce); 2268 const u32 offset = hwsp_offset(ce, sema); 2269 u32 elapsed[TF_COUNT + 1], cycles; 2270 struct i915_sw_fence *submit; 2271 int i, err; 2272 2273 /* 2274 * Measure how long it takes to advance from one request into the 2275 * next. Between each request we flush the GPU caches to memory, 2276 * update the breadcrumbs, and then invalidate those caches. 2277 * We queue up all the requests to be submitted in one batch so 2278 * it should be one set of contiguous measurements. 
2279 * 2280 * A: read CS_TIMESTAMP on GPU 2281 * advance request 2282 * B: read CS_TIMESTAMP on GPU 2283 * 2284 * Request latency: B - A 2285 */ 2286 2287 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2288 if (err) 2289 return err; 2290 2291 submit = heap_fence_create(GFP_KERNEL); 2292 if (!submit) { 2293 semaphore_set(sema, 1); 2294 return -ENOMEM; 2295 } 2296 2297 intel_engine_flush_submission(ce->engine); 2298 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2299 struct i915_request *rq; 2300 u32 *cs; 2301 2302 rq = i915_request_create(ce); 2303 if (IS_ERR(rq)) { 2304 err = PTR_ERR(rq); 2305 goto err_submit; 2306 } 2307 2308 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 2309 submit, 2310 GFP_KERNEL); 2311 if (err < 0) { 2312 i915_request_add(rq); 2313 goto err_submit; 2314 } 2315 2316 cs = intel_ring_begin(rq, 4); 2317 if (IS_ERR(cs)) { 2318 i915_request_add(rq); 2319 err = PTR_ERR(cs); 2320 goto err_submit; 2321 } 2322 2323 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2324 2325 intel_ring_advance(rq, cs); 2326 i915_request_add(rq); 2327 } 2328 i915_sw_fence_commit(submit); 2329 intel_engine_flush_submission(ce->engine); 2330 heap_fence_put(submit); 2331 2332 semaphore_set(sema, 1); 2333 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2334 if (err) 2335 goto err; 2336 2337 for (i = 1; i <= TF_COUNT; i++) 2338 elapsed[i - 1] = sema[i + 1] - sema[i]; 2339 2340 cycles = trifilter(elapsed); 2341 pr_info("%s: inter-request latency %d cycles, %lluns\n", 2342 ce->engine->name, cycles >> TF_BIAS, 2343 cycles_to_ns(ce->engine, cycles)); 2344 2345 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2346 2347 err_submit: 2348 i915_sw_fence_commit(submit); 2349 heap_fence_put(submit); 2350 semaphore_set(sema, 1); 2351 err: 2352 intel_gt_set_wedged(ce->engine->gt); 2353 return err; 2354 } 2355 2356 static int measure_context_switch(struct intel_context *ce) 2357 { 2358 u32 *sema = hwsp_scratch(ce); 2359 const u32 offset = hwsp_offset(ce, sema); 2360 struct i915_request *fence = NULL; 2361 u32 elapsed[TF_COUNT + 1], cycles; 2362 int i, j, err; 2363 u32 *cs; 2364 2365 /* 2366 * Measure how long it takes to advance from one request in one 2367 * context to a request in another context. This allows us to 2368 * measure how long the context save/restore take, along with all 2369 * the inter-context setup we require. 
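* (Each iteration emits one timestamp request on ce and one on the engine's kernel_context, chained with i915_request_await_dma_fence(), so consecutive timestamps straddle exactly one switch between the two contexts.)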
2370 * 2371 * A: read CS_TIMESTAMP on GPU 2372 * switch context 2373 * B: read CS_TIMESTAMP on GPU 2374 * 2375 * Context switch latency: B - A 2376 */ 2377 2378 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2379 if (err) 2380 return err; 2381 2382 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2383 struct intel_context *arr[] = { 2384 ce, ce->engine->kernel_context 2385 }; 2386 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32); 2387 2388 for (j = 0; j < ARRAY_SIZE(arr); j++) { 2389 struct i915_request *rq; 2390 2391 rq = i915_request_create(arr[j]); 2392 if (IS_ERR(rq)) { 2393 err = PTR_ERR(rq); 2394 goto err_fence; 2395 } 2396 2397 if (fence) { 2398 err = i915_request_await_dma_fence(rq, 2399 &fence->fence); 2400 if (err) { 2401 i915_request_add(rq); 2402 goto err_fence; 2403 } 2404 } 2405 2406 cs = intel_ring_begin(rq, 4); 2407 if (IS_ERR(cs)) { 2408 i915_request_add(rq); 2409 err = PTR_ERR(cs); 2410 goto err_fence; 2411 } 2412 2413 cs = emit_timestamp_store(cs, ce, addr); 2414 addr += sizeof(u32); 2415 2416 intel_ring_advance(rq, cs); 2417 2418 i915_request_put(fence); 2419 fence = i915_request_get(rq); 2420 2421 i915_request_add(rq); 2422 } 2423 } 2424 i915_request_put(fence); 2425 intel_engine_flush_submission(ce->engine); 2426 2427 semaphore_set(sema, 1); 2428 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2429 if (err) 2430 goto err; 2431 2432 for (i = 1; i <= TF_COUNT; i++) 2433 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1]; 2434 2435 cycles = trifilter(elapsed); 2436 pr_info("%s: context switch latency %d cycles, %lluns\n", 2437 ce->engine->name, cycles >> TF_BIAS, 2438 cycles_to_ns(ce->engine, cycles)); 2439 2440 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2441 2442 err_fence: 2443 i915_request_put(fence); 2444 semaphore_set(sema, 1); 2445 err: 2446 intel_gt_set_wedged(ce->engine->gt); 2447 return err; 2448 } 2449 2450 static int measure_preemption(struct intel_context *ce) 2451 { 2452 u32 *sema = hwsp_scratch(ce); 2453 const u32 offset = hwsp_offset(ce, sema); 2454 u32 elapsed[TF_COUNT], cycles; 2455 u32 *cs; 2456 int err; 2457 int i; 2458 2459 /* 2460 * We measure two latencies while triggering preemption. The first 2461 * latency is how long it takes for us to submit a preempting request. 2462 * The second latency is how long it takes for us to return from the 2463 * preemption back to the original context.
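* (The preempting request is submitted on the engine's kernel_context at I915_PRIORITY_BARRIER so that it displaces the spinning request rather than queueing behind it.)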
2464 * 2465 * A: read CS_TIMESTAMP from CPU 2466 * submit preemption 2467 * B: read CS_TIMESTAMP on GPU (in preempting context) 2468 * context switch 2469 * C: read CS_TIMESTAMP on GPU (in original context) 2470 * 2471 * Preemption dispatch latency: B - A 2472 * Preemption switch latency: C - B 2473 */ 2474 2475 if (!intel_engine_has_preemption(ce->engine)) 2476 return 0; 2477 2478 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2479 u32 addr = offset + 2 * i * sizeof(u32); 2480 struct i915_request *rq; 2481 2482 rq = i915_request_create(ce); 2483 if (IS_ERR(rq)) { 2484 err = PTR_ERR(rq); 2485 goto err; 2486 } 2487 2488 cs = intel_ring_begin(rq, 12); 2489 if (IS_ERR(cs)) { 2490 i915_request_add(rq); 2491 err = PTR_ERR(cs); 2492 goto err; 2493 } 2494 2495 cs = emit_store_dw(cs, addr, -1); 2496 cs = emit_semaphore_poll_until(cs, offset, i); 2497 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2498 2499 intel_ring_advance(rq, cs); 2500 i915_request_add(rq); 2501 2502 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2503 err = -EIO; 2504 goto err; 2505 } 2506 2507 rq = i915_request_create(ce->engine->kernel_context); 2508 if (IS_ERR(rq)) { 2509 err = PTR_ERR(rq); 2510 goto err; 2511 } 2512 2513 cs = intel_ring_begin(rq, 8); 2514 if (IS_ERR(cs)) { 2515 i915_request_add(rq); 2516 err = PTR_ERR(cs); 2517 goto err; 2518 } 2519 2520 cs = emit_timestamp_store(cs, ce, addr); 2521 cs = emit_store_dw(cs, offset, i); 2522 2523 intel_ring_advance(rq, cs); 2524 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2525 2526 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2527 i915_request_add(rq); 2528 } 2529 2530 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2531 err = -EIO; 2532 goto err; 2533 } 2534 2535 for (i = 1; i <= TF_COUNT; i++) 2536 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2537 2538 cycles = trifilter(elapsed); 2539 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2540 ce->engine->name, cycles >> TF_BIAS, 2541 cycles_to_ns(ce->engine, cycles)); 2542 2543 for (i = 1; i <= TF_COUNT; i++) 2544 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2545 2546 cycles = trifilter(elapsed); 2547 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2548 ce->engine->name, cycles >> TF_BIAS, 2549 cycles_to_ns(ce->engine, cycles)); 2550 2551 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2552 2553 err: 2554 intel_gt_set_wedged(ce->engine->gt); 2555 return err; 2556 } 2557 2558 struct signal_cb { 2559 struct dma_fence_cb base; 2560 bool seen; 2561 }; 2562 2563 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2564 { 2565 struct signal_cb *s = container_of(cb, typeof(*s), base); 2566 2567 smp_store_mb(s->seen, true); /* be safe, be strong */ 2568 } 2569 2570 static int measure_completion(struct intel_context *ce) 2571 { 2572 u32 *sema = hwsp_scratch(ce); 2573 const u32 offset = hwsp_offset(ce, sema); 2574 u32 elapsed[TF_COUNT], cycles; 2575 u32 *cs; 2576 int err; 2577 int i; 2578 2579 /* 2580 * Measure how long it takes for the signal (interrupt) to be 2581 * sent from the GPU and processed by the CPU.
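* (The CPU-side observation is a dma_fence callback: with preemption disabled we release the semaphore, busy-wait for the callback to fire and then sample RING_TIMESTAMP from the CPU.)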
2582 * 2583 * A: read CS_TIMESTAMP on GPU 2584 * signal 2585 * B: read CS_TIMESTAMP from CPU 2586 * 2587 * Completion latency: B - A 2588 */ 2589 2590 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2591 struct signal_cb cb = { .seen = false }; 2592 struct i915_request *rq; 2593 2594 rq = i915_request_create(ce); 2595 if (IS_ERR(rq)) { 2596 err = PTR_ERR(rq); 2597 goto err; 2598 } 2599 2600 cs = intel_ring_begin(rq, 12); 2601 if (IS_ERR(cs)) { 2602 i915_request_add(rq); 2603 err = PTR_ERR(cs); 2604 goto err; 2605 } 2606 2607 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2608 cs = emit_semaphore_poll_until(cs, offset, i); 2609 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2610 2611 intel_ring_advance(rq, cs); 2612 2613 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); 2614 i915_request_add(rq); 2615 2616 intel_engine_flush_submission(ce->engine); 2617 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { 2618 err = -EIO; 2619 goto err; 2620 } 2621 2622 preempt_disable(); 2623 semaphore_set(sema, i); 2624 while (!READ_ONCE(cb.seen)) 2625 cpu_relax(); 2626 2627 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2628 preempt_enable(); 2629 } 2630 2631 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2632 if (err) 2633 goto err; 2634 2635 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2636 GEM_BUG_ON(sema[i + 1] == -1); 2637 elapsed[i] = elapsed[i] - sema[i + 1]; 2638 } 2639 2640 cycles = trifilter(elapsed); 2641 pr_info("%s: completion latency %d cycles, %lluns\n", 2642 ce->engine->name, cycles >> TF_BIAS, 2643 cycles_to_ns(ce->engine, cycles)); 2644 2645 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2646 2647 err: 2648 intel_gt_set_wedged(ce->engine->gt); 2649 return err; 2650 } 2651 2652 static void rps_pin(struct intel_gt *gt) 2653 { 2654 /* Pin the frequency to max */ 2655 atomic_inc(&gt->rps.num_waiters); 2656 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 2657 2658 mutex_lock(&gt->rps.lock); 2659 intel_rps_set(&gt->rps, gt->rps.max_freq); 2660 mutex_unlock(&gt->rps.lock); 2661 } 2662 2663 static void rps_unpin(struct intel_gt *gt) 2664 { 2665 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 2666 atomic_dec(&gt->rps.num_waiters); 2667 } 2668 2669 static int perf_request_latency(void *arg) 2670 { 2671 struct drm_i915_private *i915 = arg; 2672 struct intel_engine_cs *engine; 2673 struct pm_qos_request qos; 2674 int err = 0; 2675 2676 if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */ 2677 return 0; 2678 2679 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2680 2681 for_each_uabi_engine(engine, i915) { 2682 struct intel_context *ce; 2683 2684 ce = intel_context_create(engine); 2685 if (IS_ERR(ce)) { 2686 err = PTR_ERR(ce); 2687 goto out; 2688 } 2689 2690 err = intel_context_pin(ce); 2691 if (err) { 2692 intel_context_put(ce); 2693 goto out; 2694 } 2695 2696 st_engine_heartbeat_disable(engine); 2697 rps_pin(engine->gt); 2698 2699 if (err == 0) 2700 err = measure_semaphore_response(ce); 2701 if (err == 0) 2702 err = measure_idle_dispatch(ce); 2703 if (err == 0) 2704 err = measure_busy_dispatch(ce); 2705 if (err == 0) 2706 err = measure_inter_request(ce); 2707 if (err == 0) 2708 err = measure_context_switch(ce); 2709 if (err == 0) 2710 err = measure_preemption(ce); 2711 if (err == 0) 2712 err = measure_completion(ce); 2713 2714 rps_unpin(engine->gt); 2715 st_engine_heartbeat_enable(engine); 2716 2717 intel_context_unpin(ce); 2718 intel_context_put(ce); 2719 if (err) 2720 goto out; 2721 } 2722 2723 out: 2724 if
(igt_flush_test(i915)) 2725 err = -EIO; 2726 2727 cpu_latency_qos_remove_request(&qos); 2728 return err; 2729 } 2730 2731 static int s_sync0(void *arg) 2732 { 2733 struct perf_series *ps = arg; 2734 IGT_TIMEOUT(end_time); 2735 unsigned int idx = 0; 2736 int err = 0; 2737 2738 GEM_BUG_ON(!ps->nengines); 2739 do { 2740 struct i915_request *rq; 2741 2742 rq = i915_request_create(ps->ce[idx]); 2743 if (IS_ERR(rq)) { 2744 err = PTR_ERR(rq); 2745 break; 2746 } 2747 2748 i915_request_get(rq); 2749 i915_request_add(rq); 2750 2751 if (i915_request_wait(rq, 0, HZ / 5) < 0) 2752 err = -ETIME; 2753 i915_request_put(rq); 2754 if (err) 2755 break; 2756 2757 if (++idx == ps->nengines) 2758 idx = 0; 2759 } while (!__igt_timeout(end_time, NULL)); 2760 2761 return err; 2762 } 2763 2764 static int s_sync1(void *arg) 2765 { 2766 struct perf_series *ps = arg; 2767 struct i915_request *prev = NULL; 2768 IGT_TIMEOUT(end_time); 2769 unsigned int idx = 0; 2770 int err = 0; 2771 2772 GEM_BUG_ON(!ps->nengines); 2773 do { 2774 struct i915_request *rq; 2775 2776 rq = i915_request_create(ps->ce[idx]); 2777 if (IS_ERR(rq)) { 2778 err = PTR_ERR(rq); 2779 break; 2780 } 2781 2782 i915_request_get(rq); 2783 i915_request_add(rq); 2784 2785 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0) 2786 err = -ETIME; 2787 i915_request_put(prev); 2788 prev = rq; 2789 if (err) 2790 break; 2791 2792 if (++idx == ps->nengines) 2793 idx = 0; 2794 } while (!__igt_timeout(end_time, NULL)); 2795 i915_request_put(prev); 2796 2797 return err; 2798 } 2799 2800 static int s_many(void *arg) 2801 { 2802 struct perf_series *ps = arg; 2803 IGT_TIMEOUT(end_time); 2804 unsigned int idx = 0; 2805 2806 GEM_BUG_ON(!ps->nengines); 2807 do { 2808 struct i915_request *rq; 2809 2810 rq = i915_request_create(ps->ce[idx]); 2811 if (IS_ERR(rq)) 2812 return PTR_ERR(rq); 2813 2814 i915_request_add(rq); 2815 2816 if (++idx == ps->nengines) 2817 idx = 0; 2818 } while (!__igt_timeout(end_time, NULL)); 2819 2820 return 0; 2821 } 2822 2823 static int perf_series_engines(void *arg) 2824 { 2825 struct drm_i915_private *i915 = arg; 2826 static int (* const func[])(void *arg) = { 2827 s_sync0, 2828 s_sync1, 2829 s_many, 2830 NULL, 2831 }; 2832 const unsigned int nengines = num_uabi_engines(i915); 2833 struct intel_engine_cs *engine; 2834 int (* const *fn)(void *arg); 2835 struct pm_qos_request qos; 2836 struct perf_stats *stats; 2837 struct perf_series *ps; 2838 unsigned int idx; 2839 int err = 0; 2840 2841 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL); 2842 if (!stats) 2843 return -ENOMEM; 2844 2845 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL); 2846 if (!ps) { 2847 kfree(stats); 2848 return -ENOMEM; 2849 } 2850 2851 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2852 2853 ps->i915 = i915; 2854 ps->nengines = nengines; 2855 2856 idx = 0; 2857 for_each_uabi_engine(engine, i915) { 2858 struct intel_context *ce; 2859 2860 ce = intel_context_create(engine); 2861 if (IS_ERR(ce)) { 2862 err = PTR_ERR(ce); 2863 goto out; 2864 } 2865 2866 err = intel_context_pin(ce); 2867 if (err) { 2868 intel_context_put(ce); 2869 goto out; 2870 } 2871 2872 ps->ce[idx++] = ce; 2873 } 2874 GEM_BUG_ON(idx != ps->nengines); 2875 2876 for (fn = func; *fn && !err; fn++) { 2877 char name[KSYM_NAME_LEN]; 2878 struct igt_live_test t; 2879 2880 snprintf(name, sizeof(name), "%ps", *fn); 2881 err = igt_live_test_begin(&t, i915, __func__, name); 2882 if (err) 2883 break; 2884 2885 for (idx = 0; idx < nengines; idx++) { 2886 struct perf_stats *p = 2887 memset(&stats[idx], 0, 
sizeof(stats[idx])); 2888 struct intel_context *ce = ps->ce[idx]; 2889 2890 p->engine = ps->ce[idx]->engine; 2891 intel_engine_pm_get(p->engine); 2892 2893 if (intel_engine_supports_stats(p->engine)) 2894 p->busy = intel_engine_get_busy_time(p->engine, 2895 &p->time) + 1; 2896 else 2897 p->time = ktime_get(); 2898 p->runtime = -intel_context_get_total_runtime_ns(ce); 2899 } 2900 2901 err = (*fn)(ps); 2902 if (igt_live_test_end(&t)) 2903 err = -EIO; 2904 2905 for (idx = 0; idx < nengines; idx++) { 2906 struct perf_stats *p = &stats[idx]; 2907 struct intel_context *ce = ps->ce[idx]; 2908 int integer, decimal; 2909 u64 busy, dt, now; 2910 2911 if (p->busy) 2912 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine, 2913 &now), 2914 p->busy - 1); 2915 else 2916 now = ktime_get(); 2917 p->time = ktime_sub(now, p->time); 2918 2919 err = switch_to_kernel_sync(ce, err); 2920 p->runtime += intel_context_get_total_runtime_ns(ce); 2921 intel_engine_pm_put(p->engine); 2922 2923 busy = 100 * ktime_to_ns(p->busy); 2924 dt = ktime_to_ns(p->time); 2925 if (dt) { 2926 integer = div64_u64(busy, dt); 2927 busy -= integer * dt; 2928 decimal = div64_u64(100 * busy, dt); 2929 } else { 2930 integer = 0; 2931 decimal = 0; 2932 } 2933 2934 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 2935 name, p->engine->name, ce->timeline->seqno, 2936 integer, decimal, 2937 div_u64(p->runtime, 1000 * 1000), 2938 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 2939 } 2940 } 2941 2942 out: 2943 for (idx = 0; idx < nengines; idx++) { 2944 if (IS_ERR_OR_NULL(ps->ce[idx])) 2945 break; 2946 2947 intel_context_unpin(ps->ce[idx]); 2948 intel_context_put(ps->ce[idx]); 2949 } 2950 kfree(ps); 2951 2952 cpu_latency_qos_remove_request(&qos); 2953 kfree(stats); 2954 return err; 2955 } 2956 2957 struct p_thread { 2958 struct perf_stats p; 2959 struct kthread_worker *worker; 2960 struct kthread_work work; 2961 struct intel_engine_cs *engine; 2962 int result; 2963 }; 2964 2965 static void p_sync0(struct kthread_work *work) 2966 { 2967 struct p_thread *thread = container_of(work, typeof(*thread), work); 2968 struct perf_stats *p = &thread->p; 2969 struct intel_engine_cs *engine = p->engine; 2970 struct intel_context *ce; 2971 IGT_TIMEOUT(end_time); 2972 unsigned long count; 2973 bool busy; 2974 int err = 0; 2975 2976 ce = intel_context_create(engine); 2977 if (IS_ERR(ce)) { 2978 thread->result = PTR_ERR(ce); 2979 return; 2980 } 2981 2982 err = intel_context_pin(ce); 2983 if (err) { 2984 intel_context_put(ce); 2985 thread->result = err; 2986 return; 2987 } 2988 2989 if (intel_engine_supports_stats(engine)) { 2990 p->busy = intel_engine_get_busy_time(engine, &p->time); 2991 busy = true; 2992 } else { 2993 p->time = ktime_get(); 2994 busy = false; 2995 } 2996 2997 count = 0; 2998 do { 2999 struct i915_request *rq; 3000 3001 rq = i915_request_create(ce); 3002 if (IS_ERR(rq)) { 3003 err = PTR_ERR(rq); 3004 break; 3005 } 3006 3007 i915_request_get(rq); 3008 i915_request_add(rq); 3009 3010 err = 0; 3011 if (i915_request_wait(rq, 0, HZ) < 0) 3012 err = -ETIME; 3013 i915_request_put(rq); 3014 if (err) 3015 break; 3016 3017 count++; 3018 } while (!__igt_timeout(end_time, NULL)); 3019 3020 if (busy) { 3021 ktime_t now; 3022 3023 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3024 p->busy); 3025 p->time = ktime_sub(now, p->time); 3026 } else { 3027 p->time = ktime_sub(ktime_get(), p->time); 3028 } 3029 3030 err = switch_to_kernel_sync(ce, err); 3031 p->runtime = intel_context_get_total_runtime_ns(ce); 3032 
p->count = count; 3033 3034 intel_context_unpin(ce); 3035 intel_context_put(ce); 3036 thread->result = err; 3037 } 3038 3039 static void p_sync1(struct kthread_work *work) 3040 { 3041 struct p_thread *thread = container_of(work, typeof(*thread), work); 3042 struct perf_stats *p = &thread->p; 3043 struct intel_engine_cs *engine = p->engine; 3044 struct i915_request *prev = NULL; 3045 struct intel_context *ce; 3046 IGT_TIMEOUT(end_time); 3047 unsigned long count; 3048 bool busy; 3049 int err = 0; 3050 3051 ce = intel_context_create(engine); 3052 if (IS_ERR(ce)) { 3053 thread->result = PTR_ERR(ce); 3054 return; 3055 } 3056 3057 err = intel_context_pin(ce); 3058 if (err) { 3059 intel_context_put(ce); 3060 thread->result = err; 3061 return; 3062 } 3063 3064 if (intel_engine_supports_stats(engine)) { 3065 p->busy = intel_engine_get_busy_time(engine, &p->time); 3066 busy = true; 3067 } else { 3068 p->time = ktime_get(); 3069 busy = false; 3070 } 3071 3072 count = 0; 3073 do { 3074 struct i915_request *rq; 3075 3076 rq = i915_request_create(ce); 3077 if (IS_ERR(rq)) { 3078 err = PTR_ERR(rq); 3079 break; 3080 } 3081 3082 i915_request_get(rq); 3083 i915_request_add(rq); 3084 3085 err = 0; 3086 if (prev && i915_request_wait(prev, 0, HZ) < 0) 3087 err = -ETIME; 3088 i915_request_put(prev); 3089 prev = rq; 3090 if (err) 3091 break; 3092 3093 count++; 3094 } while (!__igt_timeout(end_time, NULL)); 3095 i915_request_put(prev); 3096 3097 if (busy) { 3098 ktime_t now; 3099 3100 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3101 p->busy); 3102 p->time = ktime_sub(now, p->time); 3103 } else { 3104 p->time = ktime_sub(ktime_get(), p->time); 3105 } 3106 3107 err = switch_to_kernel_sync(ce, err); 3108 p->runtime = intel_context_get_total_runtime_ns(ce); 3109 p->count = count; 3110 3111 intel_context_unpin(ce); 3112 intel_context_put(ce); 3113 thread->result = err; 3114 } 3115 3116 static void p_many(struct kthread_work *work) 3117 { 3118 struct p_thread *thread = container_of(work, typeof(*thread), work); 3119 struct perf_stats *p = &thread->p; 3120 struct intel_engine_cs *engine = p->engine; 3121 struct intel_context *ce; 3122 IGT_TIMEOUT(end_time); 3123 unsigned long count; 3124 int err = 0; 3125 bool busy; 3126 3127 ce = intel_context_create(engine); 3128 if (IS_ERR(ce)) { 3129 thread->result = PTR_ERR(ce); 3130 return; 3131 } 3132 3133 err = intel_context_pin(ce); 3134 if (err) { 3135 intel_context_put(ce); 3136 thread->result = err; 3137 return; 3138 } 3139 3140 if (intel_engine_supports_stats(engine)) { 3141 p->busy = intel_engine_get_busy_time(engine, &p->time); 3142 busy = true; 3143 } else { 3144 p->time = ktime_get(); 3145 busy = false; 3146 } 3147 3148 count = 0; 3149 do { 3150 struct i915_request *rq; 3151 3152 rq = i915_request_create(ce); 3153 if (IS_ERR(rq)) { 3154 err = PTR_ERR(rq); 3155 break; 3156 } 3157 3158 i915_request_add(rq); 3159 count++; 3160 } while (!__igt_timeout(end_time, NULL)); 3161 3162 if (busy) { 3163 ktime_t now; 3164 3165 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3166 p->busy); 3167 p->time = ktime_sub(now, p->time); 3168 } else { 3169 p->time = ktime_sub(ktime_get(), p->time); 3170 } 3171 3172 err = switch_to_kernel_sync(ce, err); 3173 p->runtime = intel_context_get_total_runtime_ns(ce); 3174 p->count = count; 3175 3176 intel_context_unpin(ce); 3177 intel_context_put(ce); 3178 thread->result = err; 3179 } 3180 3181 static int perf_parallel_engines(void *arg) 3182 { 3183 struct drm_i915_private *i915 = arg; 3184 static void (* const 
func[])(struct kthread_work *) = { 3185 p_sync0, 3186 p_sync1, 3187 p_many, 3188 NULL, 3189 }; 3190 const unsigned int nengines = num_uabi_engines(i915); 3191 void (* const *fn)(struct kthread_work *); 3192 struct intel_engine_cs *engine; 3193 struct pm_qos_request qos; 3194 struct p_thread *engines; 3195 int err = 0; 3196 3197 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL); 3198 if (!engines) 3199 return -ENOMEM; 3200 3201 cpu_latency_qos_add_request(&qos, 0); 3202 3203 for (fn = func; *fn; fn++) { 3204 char name[KSYM_NAME_LEN]; 3205 struct igt_live_test t; 3206 unsigned int idx; 3207 3208 snprintf(name, sizeof(name), "%ps", *fn); 3209 err = igt_live_test_begin(&t, i915, __func__, name); 3210 if (err) 3211 break; 3212 3213 atomic_set(&i915->selftest.counter, nengines); 3214 3215 idx = 0; 3216 for_each_uabi_engine(engine, i915) { 3217 struct kthread_worker *worker; 3218 3219 intel_engine_pm_get(engine); 3220 3221 memset(&engines[idx].p, 0, sizeof(engines[idx].p)); 3222 3223 worker = kthread_run_worker(0, "igt:%s", 3224 engine->name); 3225 if (IS_ERR(worker)) { 3226 err = PTR_ERR(worker); 3227 intel_engine_pm_put(engine); 3228 break; 3229 } 3230 engines[idx].worker = worker; 3231 engines[idx].result = 0; 3232 engines[idx].p.engine = engine; 3233 engines[idx].engine = engine; 3234 3235 kthread_init_work(&engines[idx].work, *fn); 3236 kthread_queue_work(worker, &engines[idx].work); 3237 idx++; 3238 } 3239 3240 idx = 0; 3241 for_each_uabi_engine(engine, i915) { 3242 int status; 3243 3244 if (!engines[idx].worker) 3245 break; 3246 3247 kthread_flush_work(&engines[idx].work); 3248 status = READ_ONCE(engines[idx].result); 3249 if (status && !err) 3250 err = status; 3251 3252 intel_engine_pm_put(engine); 3253 3254 kthread_destroy_worker(engines[idx].worker); 3255 idx++; 3256 } 3257 3258 if (igt_live_test_end(&t)) 3259 err = -EIO; 3260 if (err) 3261 break; 3262 3263 idx = 0; 3264 for_each_uabi_engine(engine, i915) { 3265 struct perf_stats *p = &engines[idx].p; 3266 u64 busy = 100 * ktime_to_ns(p->busy); 3267 u64 dt = ktime_to_ns(p->time); 3268 int integer, decimal; 3269 3270 if (dt) { 3271 integer = div64_u64(busy, dt); 3272 busy -= integer * dt; 3273 decimal = div64_u64(100 * busy, dt); 3274 } else { 3275 integer = 0; 3276 decimal = 0; 3277 } 3278 3279 GEM_BUG_ON(engine != p->engine); 3280 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 3281 name, engine->name, p->count, integer, decimal, 3282 div_u64(p->runtime, 1000 * 1000), 3283 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 3284 idx++; 3285 } 3286 } 3287 3288 cpu_latency_qos_remove_request(&qos); 3289 kfree(engines); 3290 return err; 3291 } 3292 3293 int i915_request_perf_selftests(struct drm_i915_private *i915) 3294 { 3295 static const struct i915_subtest tests[] = { 3296 SUBTEST(perf_request_latency), 3297 SUBTEST(perf_series_engines), 3298 SUBTEST(perf_parallel_engines), 3299 }; 3300 3301 if (intel_gt_is_wedged(to_gt(i915))) 3302 return 0; 3303 3304 return i915_subtests(tests, i915); 3305 } 3306