1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_internal.h" 30 #include "gem/i915_gem_pm.h" 31 #include "gem/selftests/mock_context.h" 32 33 #include "gt/intel_engine_heartbeat.h" 34 #include "gt/intel_engine_pm.h" 35 #include "gt/intel_engine_user.h" 36 #include "gt/intel_gt.h" 37 #include "gt/intel_gt_clock_utils.h" 38 #include "gt/intel_gt_requests.h" 39 #include "gt/selftest_engine_heartbeat.h" 40 41 #include "i915_random.h" 42 #include "i915_selftest.h" 43 #include "igt_flush_test.h" 44 #include "igt_live_test.h" 45 #include "igt_spinner.h" 46 #include "lib_sw_fence.h" 47 48 #include "mock_drm.h" 49 #include "mock_gem_device.h" 50 51 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 52 { 53 struct intel_engine_cs *engine; 54 unsigned int count; 55 56 count = 0; 57 for_each_uabi_engine(engine, i915) 58 count++; 59 60 return count; 61 } 62 63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 64 { 65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 66 } 67 68 static int igt_add_request(void *arg) 69 { 70 struct drm_i915_private *i915 = arg; 71 struct i915_request *request; 72 73 /* Basic preliminary test to create a request and let it loose! 
*/ 74 75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 76 if (!request) 77 return -ENOMEM; 78 79 i915_request_add(request); 80 81 return 0; 82 } 83 84 static int igt_wait_request(void *arg) 85 { 86 const long T = HZ / 4; 87 struct drm_i915_private *i915 = arg; 88 struct i915_request *request; 89 int err = -EINVAL; 90 91 /* Submit a request, then wait upon it */ 92 93 request = mock_request(rcs0(i915)->kernel_context, T); 94 if (!request) 95 return -ENOMEM; 96 97 i915_request_get(request); 98 99 if (i915_request_wait(request, 0, 0) != -ETIME) { 100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 101 goto out_request; 102 } 103 104 if (i915_request_wait(request, 0, T) != -ETIME) { 105 pr_err("request wait succeeded (expected timeout before submit!)\n"); 106 goto out_request; 107 } 108 109 if (i915_request_completed(request)) { 110 pr_err("request completed before submit!!\n"); 111 goto out_request; 112 } 113 114 i915_request_add(request); 115 116 if (i915_request_wait(request, 0, 0) != -ETIME) { 117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 118 goto out_request; 119 } 120 121 if (i915_request_completed(request)) { 122 pr_err("request completed immediately!\n"); 123 goto out_request; 124 } 125 126 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 127 pr_err("request wait succeeded (expected timeout!)\n"); 128 goto out_request; 129 } 130 131 if (i915_request_wait(request, 0, T) == -ETIME) { 132 pr_err("request wait timed out!\n"); 133 goto out_request; 134 } 135 136 if (!i915_request_completed(request)) { 137 pr_err("request not complete after waiting!\n"); 138 goto out_request; 139 } 140 141 if (i915_request_wait(request, 0, T) == -ETIME) { 142 pr_err("request wait timed out when already complete!\n"); 143 goto out_request; 144 } 145 146 err = 0; 147 out_request: 148 i915_request_put(request); 149 mock_device_flush(i915); 150 return err; 151 } 152 153 static int igt_fence_wait(void *arg) 154 { 155 const long T = HZ / 4; 156 struct drm_i915_private *i915 = arg; 157 struct i915_request *request; 158 int err = -EINVAL; 159 160 /* Submit a request, treat it as a fence and wait upon it */ 161 162 request = mock_request(rcs0(i915)->kernel_context, T); 163 if (!request) 164 return -ENOMEM; 165 166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 167 pr_err("fence wait success before submit (expected timeout)!\n"); 168 goto out; 169 } 170 171 i915_request_add(request); 172 173 if (dma_fence_is_signaled(&request->fence)) { 174 pr_err("fence signaled immediately!\n"); 175 goto out; 176 } 177 178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 179 pr_err("fence wait success after submit (expected timeout)!\n"); 180 goto out; 181 } 182 183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 184 pr_err("fence wait timed out (expected success)!\n"); 185 goto out; 186 } 187 188 if (!dma_fence_is_signaled(&request->fence)) { 189 pr_err("fence unsignaled after waiting!\n"); 190 goto out; 191 } 192 193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 194 pr_err("fence wait timed out when complete (expected success)!\n"); 195 goto out; 196 } 197 198 err = 0; 199 out: 200 mock_device_flush(i915); 201 return err; 202 } 203 204 static int igt_request_rewind(void *arg) 205 { 206 struct drm_i915_private *i915 = arg; 207 struct i915_request *request, *vip; 208 struct i915_gem_context *ctx[2]; 209 struct intel_context *ce; 210 int err = -EINVAL; 211 212 ctx[0] = 
mock_context(i915, "A"); 213 if (!ctx[0]) { 214 err = -ENOMEM; 215 goto err_ctx_0; 216 } 217 218 ce = i915_gem_context_get_engine(ctx[0], RCS0); 219 GEM_BUG_ON(IS_ERR(ce)); 220 request = mock_request(ce, 2 * HZ); 221 intel_context_put(ce); 222 if (!request) { 223 err = -ENOMEM; 224 goto err_context_0; 225 } 226 227 i915_request_get(request); 228 i915_request_add(request); 229 230 ctx[1] = mock_context(i915, "B"); 231 if (!ctx[1]) { 232 err = -ENOMEM; 233 goto err_ctx_1; 234 } 235 236 ce = i915_gem_context_get_engine(ctx[1], RCS0); 237 GEM_BUG_ON(IS_ERR(ce)); 238 vip = mock_request(ce, 0); 239 intel_context_put(ce); 240 if (!vip) { 241 err = -ENOMEM; 242 goto err_context_1; 243 } 244 245 /* Simulate preemption by manual reordering */ 246 if (!mock_cancel_request(request)) { 247 pr_err("failed to cancel request (already executed)!\n"); 248 i915_request_add(vip); 249 goto err_context_1; 250 } 251 i915_request_get(vip); 252 i915_request_add(vip); 253 rcu_read_lock(); 254 request->engine->submit_request(request); 255 rcu_read_unlock(); 256 257 258 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 259 pr_err("timed out waiting for high priority request\n"); 260 goto err; 261 } 262 263 if (i915_request_completed(request)) { 264 pr_err("low priority request already completed\n"); 265 goto err; 266 } 267 268 err = 0; 269 err: 270 i915_request_put(vip); 271 err_context_1: 272 mock_context_close(ctx[1]); 273 err_ctx_1: 274 i915_request_put(request); 275 err_context_0: 276 mock_context_close(ctx[0]); 277 err_ctx_0: 278 mock_device_flush(i915); 279 return err; 280 } 281 282 struct smoketest { 283 struct intel_engine_cs *engine; 284 struct i915_gem_context **contexts; 285 atomic_long_t num_waits, num_fences; 286 int ncontexts, max_batch; 287 struct i915_request *(*request_alloc)(struct intel_context *ce); 288 }; 289 290 static struct i915_request * 291 __mock_request_alloc(struct intel_context *ce) 292 { 293 return mock_request(ce, 0); 294 } 295 296 static struct i915_request * 297 __live_request_alloc(struct intel_context *ce) 298 { 299 return intel_context_create_request(ce); 300 } 301 302 struct smoke_thread { 303 struct kthread_worker *worker; 304 struct kthread_work work; 305 struct smoketest *t; 306 bool stop; 307 int result; 308 }; 309 310 static void __igt_breadcrumbs_smoketest(struct kthread_work *work) 311 { 312 struct smoke_thread *thread = container_of(work, typeof(*thread), work); 313 struct smoketest *t = thread->t; 314 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 315 const unsigned int total = 4 * t->ncontexts + 1; 316 unsigned int num_waits = 0, num_fences = 0; 317 struct i915_request **requests; 318 I915_RND_STATE(prng); 319 unsigned int *order; 320 int err = 0; 321 322 /* 323 * A very simple test to catch the most egregious of list handling bugs. 324 * 325 * At its heart, we simply create oodles of requests running across 326 * multiple kthreads and enable signaling on them, for the sole purpose 327 * of stressing our breadcrumb handling. The only inspection we do is 328 * that the fences were marked as signaled. 
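	 *
	 * Each batch of requests is gated behind a single 'submit' fence, so
	 * they are all queued before any may execute, and a composite 'wait'
	 * fence tracks their completion, exercising both the submission and
	 * the completion signaling paths.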
329 */ 330 331 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 332 if (!requests) { 333 thread->result = -ENOMEM; 334 return; 335 } 336 337 order = i915_random_order(total, &prng); 338 if (!order) { 339 err = -ENOMEM; 340 goto out_requests; 341 } 342 343 while (!READ_ONCE(thread->stop)) { 344 struct i915_sw_fence *submit, *wait; 345 unsigned int n, count; 346 347 submit = heap_fence_create(GFP_KERNEL); 348 if (!submit) { 349 err = -ENOMEM; 350 break; 351 } 352 353 wait = heap_fence_create(GFP_KERNEL); 354 if (!wait) { 355 i915_sw_fence_commit(submit); 356 heap_fence_put(submit); 357 err = -ENOMEM; 358 break; 359 } 360 361 i915_random_reorder(order, total, &prng); 362 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 363 364 for (n = 0; n < count; n++) { 365 struct i915_gem_context *ctx = 366 t->contexts[order[n] % t->ncontexts]; 367 struct i915_request *rq; 368 struct intel_context *ce; 369 370 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 371 GEM_BUG_ON(IS_ERR(ce)); 372 rq = t->request_alloc(ce); 373 intel_context_put(ce); 374 if (IS_ERR(rq)) { 375 err = PTR_ERR(rq); 376 count = n; 377 break; 378 } 379 380 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 381 submit, 382 GFP_KERNEL); 383 384 requests[n] = i915_request_get(rq); 385 i915_request_add(rq); 386 387 if (err >= 0) 388 err = i915_sw_fence_await_dma_fence(wait, 389 &rq->fence, 390 0, 391 GFP_KERNEL); 392 393 if (err < 0) { 394 i915_request_put(rq); 395 count = n; 396 break; 397 } 398 } 399 400 i915_sw_fence_commit(submit); 401 i915_sw_fence_commit(wait); 402 403 if (!wait_event_timeout(wait->wait, 404 i915_sw_fence_done(wait), 405 5 * HZ)) { 406 struct i915_request *rq = requests[count - 1]; 407 408 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 409 atomic_read(&wait->pending), count, 410 rq->fence.context, rq->fence.seqno, 411 t->engine->name); 412 GEM_TRACE_DUMP(); 413 414 intel_gt_set_wedged(t->engine->gt); 415 GEM_BUG_ON(!i915_request_completed(rq)); 416 i915_sw_fence_wait(wait); 417 err = -EIO; 418 } 419 420 for (n = 0; n < count; n++) { 421 struct i915_request *rq = requests[n]; 422 423 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 424 &rq->fence.flags)) { 425 pr_err("%llu:%llu was not signaled!\n", 426 rq->fence.context, rq->fence.seqno); 427 err = -EINVAL; 428 } 429 430 i915_request_put(rq); 431 } 432 433 heap_fence_put(wait); 434 heap_fence_put(submit); 435 436 if (err < 0) 437 break; 438 439 num_fences += count; 440 num_waits++; 441 442 cond_resched(); 443 } 444 445 atomic_long_add(num_fences, &t->num_fences); 446 atomic_long_add(num_waits, &t->num_waits); 447 448 kfree(order); 449 out_requests: 450 kfree(requests); 451 thread->result = err; 452 } 453 454 static int mock_breadcrumbs_smoketest(void *arg) 455 { 456 struct drm_i915_private *i915 = arg; 457 struct smoketest t = { 458 .engine = rcs0(i915), 459 .ncontexts = 1024, 460 .max_batch = 1024, 461 .request_alloc = __mock_request_alloc 462 }; 463 unsigned int ncpus = num_online_cpus(); 464 struct smoke_thread *threads; 465 unsigned int n; 466 int ret = 0; 467 468 /* 469 * Smoketest our breadcrumb/signal handling for requests across multiple 470 * threads. A very simple test to only catch the most egregious of bugs. 
	 * See __igt_breadcrumbs_smoketest();
	 */

	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
	if (!t.contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < t.ncontexts; n++) {
		t.contexts[n] = mock_context(t.engine->i915, "mock");
		if (!t.contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	for (n = 0; n < ncpus; n++) {
		struct kthread_worker *worker;

		worker = kthread_create_worker(0, "igt/%d", n);
		if (IS_ERR(worker)) {
			ret = PTR_ERR(worker);
			ncpus = n;
			break;
		}

		threads[n].worker = worker;
		threads[n].t = &t;
		threads[n].stop = false;
		threads[n].result = 0;

		kthread_init_work(&threads[n].work,
				  __igt_breadcrumbs_smoketest);
		kthread_queue_work(worker, &threads[n].work);
	}

	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

	for (n = 0; n < ncpus; n++) {
		int err;

		WRITE_ONCE(threads[n].stop, true);
		kthread_flush_work(&threads[n].work);
		err = READ_ONCE(threads[n].result);
		if (err < 0 && !ret)
			ret = err;

		kthread_destroy_worker(threads[n].worker);
	}
	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
		atomic_long_read(&t.num_waits),
		atomic_long_read(&t.num_fences),
		ncpus);

out_contexts:
	for (n = 0; n < t.ncontexts; n++) {
		if (!t.contexts[n])
			break;
		mock_context_close(t.contexts[n]);
	}
	kfree(t.contexts);
out_threads:
	kfree(threads);
	return ret;
}

int i915_request_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_add_request),
		SUBTEST(igt_wait_request),
		SUBTEST(igt_fence_wait),
		SUBTEST(igt_request_rewind),
		SUBTEST(mock_breadcrumbs_smoketest),
	};
	struct drm_i915_private *i915;
	intel_wakeref_t wakeref;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		err = i915_subtests(tests, i915);

	mock_destroy_device(i915);

	return err;
}

static int live_nop_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	int err = -ENODEV;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	for_each_uabi_engine(engine, i915) {
		unsigned long n, prime;
		IGT_TIMEOUT(end_time);
		ktime_t times[2] = {};

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			return err;

		intel_engine_pm_get(engine);
		for_each_prime_number_from(prime, 1, 8192) {
			struct i915_request *request = NULL;

			times[1] = ktime_get_raw();

			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = i915_request_create(engine->kernel_context);
				if (IS_ERR(request))
					return PTR_ERR(request);

				/*
				 * This space is left intentionally blank.
				 *
				 * We do not actually want to perform any
				 * action with this request, we just want
				 * to measure the latency in allocation
				 * and submission of our breadcrumbs -
				 * ensuring that the bare request is sufficient
				 * for the system to work (i.e.
proper HEAD 610 * tracking of the rings, interrupt handling, 611 * etc). It also gives us the lowest bounds 612 * for latency. 613 */ 614 615 i915_request_get(request); 616 i915_request_add(request); 617 } 618 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 619 i915_request_put(request); 620 621 times[1] = ktime_sub(ktime_get_raw(), times[1]); 622 if (prime == 1) 623 times[0] = times[1]; 624 625 if (__igt_timeout(end_time, NULL)) 626 break; 627 } 628 intel_engine_pm_put(engine); 629 630 err = igt_live_test_end(&t); 631 if (err) 632 return err; 633 634 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 635 engine->name, 636 ktime_to_ns(times[0]), 637 prime, div64_u64(ktime_to_ns(times[1]), prime)); 638 } 639 640 return err; 641 } 642 643 static int __cancel_inactive(struct intel_engine_cs *engine) 644 { 645 struct intel_context *ce; 646 struct igt_spinner spin; 647 struct i915_request *rq; 648 int err = 0; 649 650 if (igt_spinner_init(&spin, engine->gt)) 651 return -ENOMEM; 652 653 ce = intel_context_create(engine); 654 if (IS_ERR(ce)) { 655 err = PTR_ERR(ce); 656 goto out_spin; 657 } 658 659 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 660 if (IS_ERR(rq)) { 661 err = PTR_ERR(rq); 662 goto out_ce; 663 } 664 665 pr_debug("%s: Cancelling inactive request\n", engine->name); 666 i915_request_cancel(rq, -EINTR); 667 i915_request_get(rq); 668 i915_request_add(rq); 669 670 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 671 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 672 673 pr_err("%s: Failed to cancel inactive request\n", engine->name); 674 intel_engine_dump(engine, &p, "%s\n", engine->name); 675 err = -ETIME; 676 goto out_rq; 677 } 678 679 if (rq->fence.error != -EINTR) { 680 pr_err("%s: fence not cancelled (%u)\n", 681 engine->name, rq->fence.error); 682 err = -EINVAL; 683 } 684 685 out_rq: 686 i915_request_put(rq); 687 out_ce: 688 intel_context_put(ce); 689 out_spin: 690 igt_spinner_fini(&spin); 691 if (err) 692 pr_err("%s: %s error %d\n", __func__, engine->name, err); 693 return err; 694 } 695 696 static int __cancel_active(struct intel_engine_cs *engine) 697 { 698 struct intel_context *ce; 699 struct igt_spinner spin; 700 struct i915_request *rq; 701 int err = 0; 702 703 if (igt_spinner_init(&spin, engine->gt)) 704 return -ENOMEM; 705 706 ce = intel_context_create(engine); 707 if (IS_ERR(ce)) { 708 err = PTR_ERR(ce); 709 goto out_spin; 710 } 711 712 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); 713 if (IS_ERR(rq)) { 714 err = PTR_ERR(rq); 715 goto out_ce; 716 } 717 718 pr_debug("%s: Cancelling active request\n", engine->name); 719 i915_request_get(rq); 720 i915_request_add(rq); 721 if (!igt_wait_for_spinner(&spin, rq)) { 722 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 723 724 pr_err("Failed to start spinner on %s\n", engine->name); 725 intel_engine_dump(engine, &p, "%s\n", engine->name); 726 err = -ETIME; 727 goto out_rq; 728 } 729 i915_request_cancel(rq, -EINTR); 730 731 if (i915_request_wait(rq, 0, HZ / 5) < 0) { 732 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 733 734 pr_err("%s: Failed to cancel active request\n", engine->name); 735 intel_engine_dump(engine, &p, "%s\n", engine->name); 736 err = -ETIME; 737 goto out_rq; 738 } 739 740 if (rq->fence.error != -EINTR) { 741 pr_err("%s: fence not cancelled (%u)\n", 742 engine->name, rq->fence.error); 743 err = -EINVAL; 744 } 745 746 out_rq: 747 i915_request_put(rq); 748 out_ce: 749 intel_context_put(ce); 750 out_spin: 751 igt_spinner_fini(&spin); 
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int __cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	igt_spinner_end(&spin);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	if (rq->fence.error) {
		pr_err("%s: cancel of completed request set fence error (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for the spinner to start, create a NOP request and submit it, cancel
 * the spinner, wait for the spinner to complete and verify it failed with an
 * error, and finally wait for the NOP request to complete and verify it
 * succeeded without an error. The preemption timeout is also reduced /
 * restored so the test runs in a timely manner.
 */
static int __cancel_reset(struct drm_i915_private *i915,
			  struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq, *nop;
	unsigned long preempt_timeout_ms;
	int err = 0;

	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
	    !intel_has_reset_engine(engine->gt))
		return 0;

	preempt_timeout_ms = engine->props.preempt_timeout_ms;
	engine->props.preempt_timeout_ms = 100;

	if (igt_spinner_init(&spin, engine->gt))
		goto out_restore;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	pr_debug("%s: Cancelling active non-preemptible request\n",
		 engine->name);
	i915_request_get(rq);
	i915_request_add(rq);
	if (!igt_wait_for_spinner(&spin, rq)) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("Failed to start spinner on %s\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	nop = intel_context_create_request(ce);
	if (IS_ERR(nop)) {
		err = PTR_ERR(nop);
		goto out_rq;
	}
	i915_request_get(nop);
	i915_request_add(nop);

	i915_request_cancel(rq, -EINTR);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel hung request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
		goto out_nop;
	}

	if (i915_request_wait(nop, 0, HZ) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to complete nop request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_nop;
	}

	if (nop->fence.error != 0) {
		pr_err("%s: Nop request errored (%u)\n",
		       engine->name, nop->fence.error);
		err = -EINVAL;
	}

out_nop:
	i915_request_put(nop);
out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
out_restore:
	engine->props.preempt_timeout_ms = preempt_timeout_ms;
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

static int live_cancel_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;

	/*
	 * Check cancellation of requests. We expect to be able to immediately
	 * cancel active requests, even if they are currently on the GPU.
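	 * We check cancellation while the request is still queued (inactive),
	 * while it is actively spinning on the GPU, and after it has already
	 * completed; __cancel_reset() additionally covers cancelling a
	 * non-preemptible request via engine reset.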
923 */ 924 925 for_each_uabi_engine(engine, i915) { 926 struct igt_live_test t; 927 int err, err2; 928 929 if (!intel_engine_has_preemption(engine)) 930 continue; 931 932 err = igt_live_test_begin(&t, i915, __func__, engine->name); 933 if (err) 934 return err; 935 936 err = __cancel_inactive(engine); 937 if (err == 0) 938 err = __cancel_active(engine); 939 if (err == 0) 940 err = __cancel_completed(engine); 941 942 err2 = igt_live_test_end(&t); 943 if (err) 944 return err; 945 if (err2) 946 return err2; 947 948 /* Expects reset so call outside of igt_live_test_* */ 949 err = __cancel_reset(i915, engine); 950 if (err) 951 return err; 952 953 if (igt_flush_test(i915)) 954 return -EIO; 955 } 956 957 return 0; 958 } 959 960 static struct i915_vma *empty_batch(struct intel_gt *gt) 961 { 962 struct drm_i915_gem_object *obj; 963 struct i915_vma *vma; 964 u32 *cmd; 965 int err; 966 967 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 968 if (IS_ERR(obj)) 969 return ERR_CAST(obj); 970 971 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 972 if (IS_ERR(cmd)) { 973 err = PTR_ERR(cmd); 974 goto err; 975 } 976 977 *cmd = MI_BATCH_BUFFER_END; 978 979 __i915_gem_object_flush_map(obj, 0, 64); 980 i915_gem_object_unpin_map(obj); 981 982 intel_gt_chipset_flush(gt); 983 984 vma = i915_vma_instance(obj, gt->vm, NULL); 985 if (IS_ERR(vma)) { 986 err = PTR_ERR(vma); 987 goto err; 988 } 989 990 err = i915_vma_pin(vma, 0, 0, PIN_USER); 991 if (err) 992 goto err; 993 994 /* Force the wait now to avoid including it in the benchmark */ 995 err = i915_vma_sync(vma); 996 if (err) 997 goto err_pin; 998 999 return vma; 1000 1001 err_pin: 1002 i915_vma_unpin(vma); 1003 err: 1004 i915_gem_object_put(obj); 1005 return ERR_PTR(err); 1006 } 1007 1008 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch) 1009 { 1010 return rq->engine->emit_bb_start(rq, 1011 i915_vma_offset(batch), 1012 i915_vma_size(batch), 1013 0); 1014 } 1015 1016 static struct i915_request * 1017 empty_request(struct intel_engine_cs *engine, 1018 struct i915_vma *batch) 1019 { 1020 struct i915_request *request; 1021 int err; 1022 1023 request = i915_request_create(engine->kernel_context); 1024 if (IS_ERR(request)) 1025 return request; 1026 1027 err = emit_bb_start(request, batch); 1028 if (err) 1029 goto out_request; 1030 1031 i915_request_get(request); 1032 out_request: 1033 i915_request_add(request); 1034 return err ? ERR_PTR(err) : request; 1035 } 1036 1037 static int live_empty_request(void *arg) 1038 { 1039 struct drm_i915_private *i915 = arg; 1040 struct intel_engine_cs *engine; 1041 struct igt_live_test t; 1042 int err; 1043 1044 /* 1045 * Submit various sized batches of empty requests, to each engine 1046 * (individually), and wait for the batch to complete. We can check 1047 * the overhead of submitting requests to the hardware. 
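	 *
	 * Unlike live_nop_request, each request here executes a real (if
	 * trivial) batch buffer, so the measured latency also includes
	 * emitting and executing the MI_BATCH_BUFFER_START.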
1048 */ 1049 1050 for_each_uabi_engine(engine, i915) { 1051 IGT_TIMEOUT(end_time); 1052 struct i915_request *request; 1053 struct i915_vma *batch; 1054 unsigned long n, prime; 1055 ktime_t times[2] = {}; 1056 1057 batch = empty_batch(engine->gt); 1058 if (IS_ERR(batch)) 1059 return PTR_ERR(batch); 1060 1061 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1062 if (err) 1063 goto out_batch; 1064 1065 intel_engine_pm_get(engine); 1066 1067 /* Warmup / preload */ 1068 request = empty_request(engine, batch); 1069 if (IS_ERR(request)) { 1070 err = PTR_ERR(request); 1071 intel_engine_pm_put(engine); 1072 goto out_batch; 1073 } 1074 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1075 1076 for_each_prime_number_from(prime, 1, 8192) { 1077 times[1] = ktime_get_raw(); 1078 1079 for (n = 0; n < prime; n++) { 1080 i915_request_put(request); 1081 request = empty_request(engine, batch); 1082 if (IS_ERR(request)) { 1083 err = PTR_ERR(request); 1084 intel_engine_pm_put(engine); 1085 goto out_batch; 1086 } 1087 } 1088 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1089 1090 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1091 if (prime == 1) 1092 times[0] = times[1]; 1093 1094 if (__igt_timeout(end_time, NULL)) 1095 break; 1096 } 1097 i915_request_put(request); 1098 intel_engine_pm_put(engine); 1099 1100 err = igt_live_test_end(&t); 1101 if (err) 1102 goto out_batch; 1103 1104 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1105 engine->name, 1106 ktime_to_ns(times[0]), 1107 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1108 out_batch: 1109 i915_vma_unpin(batch); 1110 i915_vma_put(batch); 1111 if (err) 1112 break; 1113 } 1114 1115 return err; 1116 } 1117 1118 static struct i915_vma *recursive_batch(struct intel_gt *gt) 1119 { 1120 struct drm_i915_gem_object *obj; 1121 const int ver = GRAPHICS_VER(gt->i915); 1122 struct i915_vma *vma; 1123 u32 *cmd; 1124 int err; 1125 1126 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 1127 if (IS_ERR(obj)) 1128 return ERR_CAST(obj); 1129 1130 vma = i915_vma_instance(obj, gt->vm, NULL); 1131 if (IS_ERR(vma)) { 1132 err = PTR_ERR(vma); 1133 goto err; 1134 } 1135 1136 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1137 if (err) 1138 goto err; 1139 1140 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1141 if (IS_ERR(cmd)) { 1142 err = PTR_ERR(cmd); 1143 goto err; 1144 } 1145 1146 if (ver >= 8) { 1147 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1148 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1149 *cmd++ = upper_32_bits(i915_vma_offset(vma)); 1150 } else if (ver >= 6) { 1151 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1152 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1153 } else { 1154 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1155 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1156 } 1157 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1158 1159 __i915_gem_object_flush_map(obj, 0, 64); 1160 i915_gem_object_unpin_map(obj); 1161 1162 intel_gt_chipset_flush(gt); 1163 1164 return vma; 1165 1166 err: 1167 i915_gem_object_put(obj); 1168 return ERR_PTR(err); 1169 } 1170 1171 static int recursive_batch_resolve(struct i915_vma *batch) 1172 { 1173 u32 *cmd; 1174 1175 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1176 if (IS_ERR(cmd)) 1177 return PTR_ERR(cmd); 1178 1179 *cmd = MI_BATCH_BUFFER_END; 1180 1181 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1182 i915_gem_object_unpin_map(batch->obj); 1183 1184 intel_gt_chipset_flush(batch->vm->gt); 1185 1186 return 0; 1187 } 1188 
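/*
 * live_all_engines() and live_sequential_engines() below rely on
 * recursive_batch(): a batch whose MI_BATCH_BUFFER_START points back at
 * itself, keeping the request busy on the GPU indefinitely until
 * recursive_batch_resolve() overwrites that first instruction with
 * MI_BATCH_BUFFER_END and lets the request complete.
 */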
1189 static int live_all_engines(void *arg) 1190 { 1191 struct drm_i915_private *i915 = arg; 1192 const unsigned int nengines = num_uabi_engines(i915); 1193 struct intel_engine_cs *engine; 1194 struct i915_request **request; 1195 struct igt_live_test t; 1196 unsigned int idx; 1197 int err; 1198 1199 /* 1200 * Check we can submit requests to all engines simultaneously. We 1201 * send a recursive batch to each engine - checking that we don't 1202 * block doing so, and that they don't complete too soon. 1203 */ 1204 1205 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1206 if (!request) 1207 return -ENOMEM; 1208 1209 err = igt_live_test_begin(&t, i915, __func__, ""); 1210 if (err) 1211 goto out_free; 1212 1213 idx = 0; 1214 for_each_uabi_engine(engine, i915) { 1215 struct i915_vma *batch; 1216 1217 batch = recursive_batch(engine->gt); 1218 if (IS_ERR(batch)) { 1219 err = PTR_ERR(batch); 1220 pr_err("%s: Unable to create batch, err=%d\n", 1221 __func__, err); 1222 goto out_free; 1223 } 1224 1225 i915_vma_lock(batch); 1226 request[idx] = intel_engine_create_kernel_request(engine); 1227 if (IS_ERR(request[idx])) { 1228 err = PTR_ERR(request[idx]); 1229 pr_err("%s: Request allocation failed with err=%d\n", 1230 __func__, err); 1231 goto out_unlock; 1232 } 1233 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1234 1235 err = i915_vma_move_to_active(batch, request[idx], 0); 1236 GEM_BUG_ON(err); 1237 1238 err = emit_bb_start(request[idx], batch); 1239 GEM_BUG_ON(err); 1240 request[idx]->batch = batch; 1241 1242 i915_request_get(request[idx]); 1243 i915_request_add(request[idx]); 1244 idx++; 1245 out_unlock: 1246 i915_vma_unlock(batch); 1247 if (err) 1248 goto out_request; 1249 } 1250 1251 idx = 0; 1252 for_each_uabi_engine(engine, i915) { 1253 if (i915_request_completed(request[idx])) { 1254 pr_err("%s(%s): request completed too early!\n", 1255 __func__, engine->name); 1256 err = -EINVAL; 1257 goto out_request; 1258 } 1259 idx++; 1260 } 1261 1262 idx = 0; 1263 for_each_uabi_engine(engine, i915) { 1264 err = recursive_batch_resolve(request[idx]->batch); 1265 if (err) { 1266 pr_err("%s: failed to resolve batch, err=%d\n", 1267 __func__, err); 1268 goto out_request; 1269 } 1270 idx++; 1271 } 1272 1273 idx = 0; 1274 for_each_uabi_engine(engine, i915) { 1275 struct i915_request *rq = request[idx]; 1276 long timeout; 1277 1278 timeout = i915_request_wait(rq, 0, 1279 MAX_SCHEDULE_TIMEOUT); 1280 if (timeout < 0) { 1281 err = timeout; 1282 pr_err("%s: error waiting for request on %s, err=%d\n", 1283 __func__, engine->name, err); 1284 goto out_request; 1285 } 1286 1287 GEM_BUG_ON(!i915_request_completed(rq)); 1288 i915_vma_unpin(rq->batch); 1289 i915_vma_put(rq->batch); 1290 i915_request_put(rq); 1291 request[idx] = NULL; 1292 idx++; 1293 } 1294 1295 err = igt_live_test_end(&t); 1296 1297 out_request: 1298 idx = 0; 1299 for_each_uabi_engine(engine, i915) { 1300 struct i915_request *rq = request[idx]; 1301 1302 if (!rq) 1303 continue; 1304 1305 if (rq->batch) { 1306 i915_vma_unpin(rq->batch); 1307 i915_vma_put(rq->batch); 1308 } 1309 i915_request_put(rq); 1310 idx++; 1311 } 1312 out_free: 1313 kfree(request); 1314 return err; 1315 } 1316 1317 static int live_sequential_engines(void *arg) 1318 { 1319 struct drm_i915_private *i915 = arg; 1320 const unsigned int nengines = num_uabi_engines(i915); 1321 struct i915_request **request; 1322 struct i915_request *prev = NULL; 1323 struct intel_engine_cs *engine; 1324 struct igt_live_test t; 1325 unsigned int idx; 1326 int err; 1327 1328 /* 1329 * Check 
we can submit requests to all engines sequentially, such 1330 * that each successive request waits for the earlier ones. This 1331 * tests that we don't execute requests out of order, even though 1332 * they are running on independent engines. 1333 */ 1334 1335 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 1336 if (!request) 1337 return -ENOMEM; 1338 1339 err = igt_live_test_begin(&t, i915, __func__, ""); 1340 if (err) 1341 goto out_free; 1342 1343 idx = 0; 1344 for_each_uabi_engine(engine, i915) { 1345 struct i915_vma *batch; 1346 1347 batch = recursive_batch(engine->gt); 1348 if (IS_ERR(batch)) { 1349 err = PTR_ERR(batch); 1350 pr_err("%s: Unable to create batch for %s, err=%d\n", 1351 __func__, engine->name, err); 1352 goto out_free; 1353 } 1354 1355 i915_vma_lock(batch); 1356 request[idx] = intel_engine_create_kernel_request(engine); 1357 if (IS_ERR(request[idx])) { 1358 err = PTR_ERR(request[idx]); 1359 pr_err("%s: Request allocation failed for %s with err=%d\n", 1360 __func__, engine->name, err); 1361 goto out_unlock; 1362 } 1363 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1364 1365 if (prev) { 1366 err = i915_request_await_dma_fence(request[idx], 1367 &prev->fence); 1368 if (err) { 1369 i915_request_add(request[idx]); 1370 pr_err("%s: Request await failed for %s with err=%d\n", 1371 __func__, engine->name, err); 1372 goto out_unlock; 1373 } 1374 } 1375 1376 err = i915_vma_move_to_active(batch, request[idx], 0); 1377 GEM_BUG_ON(err); 1378 1379 err = emit_bb_start(request[idx], batch); 1380 GEM_BUG_ON(err); 1381 request[idx]->batch = batch; 1382 1383 i915_request_get(request[idx]); 1384 i915_request_add(request[idx]); 1385 1386 prev = request[idx]; 1387 idx++; 1388 1389 out_unlock: 1390 i915_vma_unlock(batch); 1391 if (err) 1392 goto out_request; 1393 } 1394 1395 idx = 0; 1396 for_each_uabi_engine(engine, i915) { 1397 long timeout; 1398 1399 if (i915_request_completed(request[idx])) { 1400 pr_err("%s(%s): request completed too early!\n", 1401 __func__, engine->name); 1402 err = -EINVAL; 1403 goto out_request; 1404 } 1405 1406 err = recursive_batch_resolve(request[idx]->batch); 1407 if (err) { 1408 pr_err("%s: failed to resolve batch, err=%d\n", 1409 __func__, err); 1410 goto out_request; 1411 } 1412 1413 timeout = i915_request_wait(request[idx], 0, 1414 MAX_SCHEDULE_TIMEOUT); 1415 if (timeout < 0) { 1416 err = timeout; 1417 pr_err("%s: error waiting for request on %s, err=%d\n", 1418 __func__, engine->name, err); 1419 goto out_request; 1420 } 1421 1422 GEM_BUG_ON(!i915_request_completed(request[idx])); 1423 idx++; 1424 } 1425 1426 err = igt_live_test_end(&t); 1427 1428 out_request: 1429 idx = 0; 1430 for_each_uabi_engine(engine, i915) { 1431 u32 *cmd; 1432 1433 if (!request[idx]) 1434 break; 1435 1436 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj, 1437 I915_MAP_WC); 1438 if (!IS_ERR(cmd)) { 1439 *cmd = MI_BATCH_BUFFER_END; 1440 1441 __i915_gem_object_flush_map(request[idx]->batch->obj, 1442 0, sizeof(*cmd)); 1443 i915_gem_object_unpin_map(request[idx]->batch->obj); 1444 1445 intel_gt_chipset_flush(engine->gt); 1446 } 1447 1448 i915_vma_put(request[idx]->batch); 1449 i915_request_put(request[idx]); 1450 idx++; 1451 } 1452 out_free: 1453 kfree(request); 1454 return err; 1455 } 1456 1457 struct parallel_thread { 1458 struct kthread_worker *worker; 1459 struct kthread_work work; 1460 struct intel_engine_cs *engine; 1461 int result; 1462 }; 1463 1464 static void __live_parallel_engine1(struct kthread_work *work) 1465 { 1466 struct parallel_thread 
*thread = 1467 container_of(work, typeof(*thread), work); 1468 struct intel_engine_cs *engine = thread->engine; 1469 IGT_TIMEOUT(end_time); 1470 unsigned long count; 1471 int err = 0; 1472 1473 count = 0; 1474 intel_engine_pm_get(engine); 1475 do { 1476 struct i915_request *rq; 1477 1478 rq = i915_request_create(engine->kernel_context); 1479 if (IS_ERR(rq)) { 1480 err = PTR_ERR(rq); 1481 break; 1482 } 1483 1484 i915_request_get(rq); 1485 i915_request_add(rq); 1486 1487 err = 0; 1488 if (i915_request_wait(rq, 0, HZ) < 0) 1489 err = -ETIME; 1490 i915_request_put(rq); 1491 if (err) 1492 break; 1493 1494 count++; 1495 } while (!__igt_timeout(end_time, NULL)); 1496 intel_engine_pm_put(engine); 1497 1498 pr_info("%s: %lu request + sync\n", engine->name, count); 1499 thread->result = err; 1500 } 1501 1502 static void __live_parallel_engineN(struct kthread_work *work) 1503 { 1504 struct parallel_thread *thread = 1505 container_of(work, typeof(*thread), work); 1506 struct intel_engine_cs *engine = thread->engine; 1507 IGT_TIMEOUT(end_time); 1508 unsigned long count; 1509 int err = 0; 1510 1511 count = 0; 1512 intel_engine_pm_get(engine); 1513 do { 1514 struct i915_request *rq; 1515 1516 rq = i915_request_create(engine->kernel_context); 1517 if (IS_ERR(rq)) { 1518 err = PTR_ERR(rq); 1519 break; 1520 } 1521 1522 i915_request_add(rq); 1523 count++; 1524 } while (!__igt_timeout(end_time, NULL)); 1525 intel_engine_pm_put(engine); 1526 1527 pr_info("%s: %lu requests\n", engine->name, count); 1528 thread->result = err; 1529 } 1530 1531 static bool wake_all(struct drm_i915_private *i915) 1532 { 1533 if (atomic_dec_and_test(&i915->selftest.counter)) { 1534 wake_up_var(&i915->selftest.counter); 1535 return true; 1536 } 1537 1538 return false; 1539 } 1540 1541 static int wait_for_all(struct drm_i915_private *i915) 1542 { 1543 if (wake_all(i915)) 1544 return 0; 1545 1546 if (wait_var_event_timeout(&i915->selftest.counter, 1547 !atomic_read(&i915->selftest.counter), 1548 i915_selftest.timeout_jiffies)) 1549 return 0; 1550 1551 return -ETIME; 1552 } 1553 1554 static void __live_parallel_spin(struct kthread_work *work) 1555 { 1556 struct parallel_thread *thread = 1557 container_of(work, typeof(*thread), work); 1558 struct intel_engine_cs *engine = thread->engine; 1559 struct igt_spinner spin; 1560 struct i915_request *rq; 1561 int err = 0; 1562 1563 /* 1564 * Create a spinner running for eternity on each engine. If a second 1565 * spinner is incorrectly placed on the same engine, it will not be 1566 * able to start in time. 
1567 */ 1568 1569 if (igt_spinner_init(&spin, engine->gt)) { 1570 wake_all(engine->i915); 1571 thread->result = -ENOMEM; 1572 return; 1573 } 1574 1575 intel_engine_pm_get(engine); 1576 rq = igt_spinner_create_request(&spin, 1577 engine->kernel_context, 1578 MI_NOOP); /* no preemption */ 1579 intel_engine_pm_put(engine); 1580 if (IS_ERR(rq)) { 1581 err = PTR_ERR(rq); 1582 if (err == -ENODEV) 1583 err = 0; 1584 wake_all(engine->i915); 1585 goto out_spin; 1586 } 1587 1588 i915_request_get(rq); 1589 i915_request_add(rq); 1590 if (igt_wait_for_spinner(&spin, rq)) { 1591 /* Occupy this engine for the whole test */ 1592 err = wait_for_all(engine->i915); 1593 } else { 1594 pr_err("Failed to start spinner on %s\n", engine->name); 1595 err = -EINVAL; 1596 } 1597 igt_spinner_end(&spin); 1598 1599 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1600 err = -EIO; 1601 i915_request_put(rq); 1602 1603 out_spin: 1604 igt_spinner_fini(&spin); 1605 thread->result = err; 1606 } 1607 1608 static int live_parallel_engines(void *arg) 1609 { 1610 struct drm_i915_private *i915 = arg; 1611 static void (* const func[])(struct kthread_work *) = { 1612 __live_parallel_engine1, 1613 __live_parallel_engineN, 1614 __live_parallel_spin, 1615 NULL, 1616 }; 1617 const unsigned int nengines = num_uabi_engines(i915); 1618 struct parallel_thread *threads; 1619 struct intel_engine_cs *engine; 1620 void (* const *fn)(struct kthread_work *); 1621 int err = 0; 1622 1623 /* 1624 * Check we can submit requests to all engines concurrently. This 1625 * tests that we load up the system maximally. 1626 */ 1627 1628 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL); 1629 if (!threads) 1630 return -ENOMEM; 1631 1632 for (fn = func; !err && *fn; fn++) { 1633 char name[KSYM_NAME_LEN]; 1634 struct igt_live_test t; 1635 unsigned int idx; 1636 1637 snprintf(name, sizeof(name), "%ps", *fn); 1638 err = igt_live_test_begin(&t, i915, __func__, name); 1639 if (err) 1640 break; 1641 1642 atomic_set(&i915->selftest.counter, nengines); 1643 1644 idx = 0; 1645 for_each_uabi_engine(engine, i915) { 1646 struct kthread_worker *worker; 1647 1648 worker = kthread_create_worker(0, "igt/parallel:%s", 1649 engine->name); 1650 if (IS_ERR(worker)) { 1651 err = PTR_ERR(worker); 1652 break; 1653 } 1654 1655 threads[idx].worker = worker; 1656 threads[idx].result = 0; 1657 threads[idx].engine = engine; 1658 1659 kthread_init_work(&threads[idx].work, *fn); 1660 kthread_queue_work(worker, &threads[idx].work); 1661 idx++; 1662 } 1663 1664 idx = 0; 1665 for_each_uabi_engine(engine, i915) { 1666 int status; 1667 1668 if (!threads[idx].worker) 1669 break; 1670 1671 kthread_flush_work(&threads[idx].work); 1672 status = READ_ONCE(threads[idx].result); 1673 if (status && !err) 1674 err = status; 1675 1676 kthread_destroy_worker(threads[idx++].worker); 1677 } 1678 1679 if (igt_live_test_end(&t)) 1680 err = -EIO; 1681 } 1682 1683 kfree(threads); 1684 return err; 1685 } 1686 1687 static int 1688 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1689 { 1690 struct i915_request *rq; 1691 int ret; 1692 1693 /* 1694 * Before execlists, all contexts share the same ringbuffer. With 1695 * execlists, each context/engine has a separate ringbuffer and 1696 * for the purposes of this test, inexhaustible. 1697 * 1698 * For the global ringbuffer though, we have to be very careful 1699 * that we do not wrap while preventing the execution of requests 1700 * with a unsignaled fence. 
1701 */ 1702 if (HAS_EXECLISTS(ctx->i915)) 1703 return INT_MAX; 1704 1705 rq = igt_request_alloc(ctx, engine); 1706 if (IS_ERR(rq)) { 1707 ret = PTR_ERR(rq); 1708 } else { 1709 int sz; 1710 1711 ret = rq->ring->size - rq->reserved_space; 1712 i915_request_add(rq); 1713 1714 sz = rq->ring->emit - rq->head; 1715 if (sz < 0) 1716 sz += rq->ring->size; 1717 ret /= sz; 1718 ret /= 2; /* leave half spare, in case of emergency! */ 1719 } 1720 1721 return ret; 1722 } 1723 1724 static int live_breadcrumbs_smoketest(void *arg) 1725 { 1726 struct drm_i915_private *i915 = arg; 1727 const unsigned int nengines = num_uabi_engines(i915); 1728 const unsigned int ncpus = /* saturate with nengines * ncpus */ 1729 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines)); 1730 unsigned long num_waits, num_fences; 1731 struct intel_engine_cs *engine; 1732 struct smoke_thread *threads; 1733 struct igt_live_test live; 1734 intel_wakeref_t wakeref; 1735 struct smoketest *smoke; 1736 unsigned int n, idx; 1737 struct file *file; 1738 int ret = 0; 1739 1740 /* 1741 * Smoketest our breadcrumb/signal handling for requests across multiple 1742 * threads. A very simple test to only catch the most egregious of bugs. 1743 * See __igt_breadcrumbs_smoketest(); 1744 * 1745 * On real hardware this time. 1746 */ 1747 1748 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1749 1750 file = mock_file(i915); 1751 if (IS_ERR(file)) { 1752 ret = PTR_ERR(file); 1753 goto out_rpm; 1754 } 1755 1756 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1757 if (!smoke) { 1758 ret = -ENOMEM; 1759 goto out_file; 1760 } 1761 1762 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1763 if (!threads) { 1764 ret = -ENOMEM; 1765 goto out_smoke; 1766 } 1767 1768 smoke[0].request_alloc = __live_request_alloc; 1769 smoke[0].ncontexts = 64; 1770 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1771 sizeof(*smoke[0].contexts), 1772 GFP_KERNEL); 1773 if (!smoke[0].contexts) { 1774 ret = -ENOMEM; 1775 goto out_threads; 1776 } 1777 1778 for (n = 0; n < smoke[0].ncontexts; n++) { 1779 smoke[0].contexts[n] = live_context(i915, file); 1780 if (IS_ERR(smoke[0].contexts[n])) { 1781 ret = PTR_ERR(smoke[0].contexts[n]); 1782 goto out_contexts; 1783 } 1784 } 1785 1786 ret = igt_live_test_begin(&live, i915, __func__, ""); 1787 if (ret) 1788 goto out_contexts; 1789 1790 idx = 0; 1791 for_each_uabi_engine(engine, i915) { 1792 smoke[idx] = smoke[0]; 1793 smoke[idx].engine = engine; 1794 smoke[idx].max_batch = 1795 max_batches(smoke[0].contexts[0], engine); 1796 if (smoke[idx].max_batch < 0) { 1797 ret = smoke[idx].max_batch; 1798 goto out_flush; 1799 } 1800 /* One ring interleaved between requests from all cpus */ 1801 smoke[idx].max_batch /= ncpus + 1; 1802 pr_debug("Limiting batches to %d requests on %s\n", 1803 smoke[idx].max_batch, engine->name); 1804 1805 for (n = 0; n < ncpus; n++) { 1806 unsigned int i = idx * ncpus + n; 1807 struct kthread_worker *worker; 1808 1809 worker = kthread_create_worker(0, "igt/%d.%d", idx, n); 1810 if (IS_ERR(worker)) { 1811 ret = PTR_ERR(worker); 1812 goto out_flush; 1813 } 1814 1815 threads[i].worker = worker; 1816 threads[i].t = &smoke[idx]; 1817 1818 kthread_init_work(&threads[i].work, 1819 __igt_breadcrumbs_smoketest); 1820 kthread_queue_work(worker, &threads[i].work); 1821 } 1822 1823 idx++; 1824 } 1825 1826 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1827 1828 out_flush: 1829 idx = 0; 1830 num_waits = 0; 1831 num_fences = 0; 1832 for_each_uabi_engine(engine, i915) { 1833 for (n = 0; n < 
ncpus; n++) { 1834 unsigned int i = idx * ncpus + n; 1835 int err; 1836 1837 if (!threads[i].worker) 1838 continue; 1839 1840 WRITE_ONCE(threads[i].stop, true); 1841 kthread_flush_work(&threads[i].work); 1842 err = READ_ONCE(threads[i].result); 1843 if (err < 0 && !ret) 1844 ret = err; 1845 1846 kthread_destroy_worker(threads[i].worker); 1847 } 1848 1849 num_waits += atomic_long_read(&smoke[idx].num_waits); 1850 num_fences += atomic_long_read(&smoke[idx].num_fences); 1851 idx++; 1852 } 1853 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1854 num_waits, num_fences, idx, ncpus); 1855 1856 ret = igt_live_test_end(&live) ?: ret; 1857 out_contexts: 1858 kfree(smoke[0].contexts); 1859 out_threads: 1860 kfree(threads); 1861 out_smoke: 1862 kfree(smoke); 1863 out_file: 1864 fput(file); 1865 out_rpm: 1866 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1867 1868 return ret; 1869 } 1870 1871 int i915_request_live_selftests(struct drm_i915_private *i915) 1872 { 1873 static const struct i915_subtest tests[] = { 1874 SUBTEST(live_nop_request), 1875 SUBTEST(live_all_engines), 1876 SUBTEST(live_sequential_engines), 1877 SUBTEST(live_parallel_engines), 1878 SUBTEST(live_empty_request), 1879 SUBTEST(live_cancel_request), 1880 SUBTEST(live_breadcrumbs_smoketest), 1881 }; 1882 1883 if (intel_gt_is_wedged(to_gt(i915))) 1884 return 0; 1885 1886 return i915_live_subtests(tests, i915); 1887 } 1888 1889 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1890 { 1891 struct i915_request *rq; 1892 struct dma_fence *fence; 1893 1894 rq = intel_engine_create_kernel_request(ce->engine); 1895 if (IS_ERR(rq)) 1896 return PTR_ERR(rq); 1897 1898 fence = i915_active_fence_get(&ce->timeline->last_request); 1899 if (fence) { 1900 i915_request_await_dma_fence(rq, fence); 1901 dma_fence_put(fence); 1902 } 1903 1904 rq = i915_request_get(rq); 1905 i915_request_add(rq); 1906 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1907 err = -ETIME; 1908 i915_request_put(rq); 1909 1910 while (!err && !intel_engine_is_idle(ce->engine)) 1911 intel_engine_flush_submission(ce->engine); 1912 1913 return err; 1914 } 1915 1916 struct perf_stats { 1917 struct intel_engine_cs *engine; 1918 unsigned long count; 1919 ktime_t time; 1920 ktime_t busy; 1921 u64 runtime; 1922 }; 1923 1924 struct perf_series { 1925 struct drm_i915_private *i915; 1926 unsigned int nengines; 1927 struct intel_context *ce[]; 1928 }; 1929 1930 static int cmp_u32(const void *A, const void *B) 1931 { 1932 const u32 *a = A, *b = B; 1933 1934 return *a - *b; 1935 } 1936 1937 static u32 trifilter(u32 *a) 1938 { 1939 u64 sum; 1940 1941 #define TF_COUNT 5 1942 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1943 1944 sum = mul_u32_u32(a[2], 2); 1945 sum += a[1]; 1946 sum += a[3]; 1947 1948 GEM_BUG_ON(sum > U32_MAX); 1949 return sum; 1950 #define TF_BIAS 2 1951 } 1952 1953 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1954 { 1955 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1956 1957 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1958 } 1959 1960 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1961 { 1962 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1963 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1964 *cs++ = offset; 1965 *cs++ = 0; 1966 1967 return cs; 1968 } 1969 1970 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1971 { 1972 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1973 *cs++ = offset; 1974 *cs++ = 0; 1975 *cs++ = 
value; 1976 1977 return cs; 1978 } 1979 1980 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1981 { 1982 *cs++ = MI_SEMAPHORE_WAIT | 1983 MI_SEMAPHORE_GLOBAL_GTT | 1984 MI_SEMAPHORE_POLL | 1985 mode; 1986 *cs++ = value; 1987 *cs++ = offset; 1988 *cs++ = 0; 1989 1990 return cs; 1991 } 1992 1993 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1994 { 1995 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1996 } 1997 1998 static void semaphore_set(u32 *sema, u32 value) 1999 { 2000 WRITE_ONCE(*sema, value); 2001 wmb(); /* flush the update to the cache, and beyond */ 2002 } 2003 2004 static u32 *hwsp_scratch(const struct intel_context *ce) 2005 { 2006 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 2007 } 2008 2009 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 2010 { 2011 return (i915_ggtt_offset(ce->engine->status_page.vma) + 2012 offset_in_page(dw)); 2013 } 2014 2015 static int measure_semaphore_response(struct intel_context *ce) 2016 { 2017 u32 *sema = hwsp_scratch(ce); 2018 const u32 offset = hwsp_offset(ce, sema); 2019 u32 elapsed[TF_COUNT], cycles; 2020 struct i915_request *rq; 2021 u32 *cs; 2022 int err; 2023 int i; 2024 2025 /* 2026 * Measure how many cycles it takes for the HW to detect the change 2027 * in a semaphore value. 2028 * 2029 * A: read CS_TIMESTAMP from CPU 2030 * poke semaphore 2031 * B: read CS_TIMESTAMP on GPU 2032 * 2033 * Semaphore latency: B - A 2034 */ 2035 2036 semaphore_set(sema, -1); 2037 2038 rq = i915_request_create(ce); 2039 if (IS_ERR(rq)) 2040 return PTR_ERR(rq); 2041 2042 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 2043 if (IS_ERR(cs)) { 2044 i915_request_add(rq); 2045 err = PTR_ERR(cs); 2046 goto err; 2047 } 2048 2049 cs = emit_store_dw(cs, offset, 0); 2050 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2051 cs = emit_semaphore_poll_until(cs, offset, i); 2052 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2053 cs = emit_store_dw(cs, offset, 0); 2054 } 2055 2056 intel_ring_advance(rq, cs); 2057 i915_request_add(rq); 2058 2059 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2060 err = -EIO; 2061 goto err; 2062 } 2063 2064 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2065 preempt_disable(); 2066 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2067 semaphore_set(sema, i); 2068 preempt_enable(); 2069 2070 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 2071 err = -EIO; 2072 goto err; 2073 } 2074 2075 elapsed[i - 1] = sema[i] - cycles; 2076 } 2077 2078 cycles = trifilter(elapsed); 2079 pr_info("%s: semaphore response %d cycles, %lluns\n", 2080 ce->engine->name, cycles >> TF_BIAS, 2081 cycles_to_ns(ce->engine, cycles)); 2082 2083 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2084 2085 err: 2086 intel_gt_set_wedged(ce->engine->gt); 2087 return err; 2088 } 2089 2090 static int measure_idle_dispatch(struct intel_context *ce) 2091 { 2092 u32 *sema = hwsp_scratch(ce); 2093 const u32 offset = hwsp_offset(ce, sema); 2094 u32 elapsed[TF_COUNT], cycles; 2095 u32 *cs; 2096 int err; 2097 int i; 2098 2099 /* 2100 * Measure how long it takes for us to submit a request while the 2101 * engine is idle, but is resting in our context. 
2102 * 2103 * A: read CS_TIMESTAMP from CPU 2104 * submit request 2105 * B: read CS_TIMESTAMP on GPU 2106 * 2107 * Submission latency: B - A 2108 */ 2109 2110 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2111 struct i915_request *rq; 2112 2113 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2114 if (err) 2115 return err; 2116 2117 rq = i915_request_create(ce); 2118 if (IS_ERR(rq)) { 2119 err = PTR_ERR(rq); 2120 goto err; 2121 } 2122 2123 cs = intel_ring_begin(rq, 4); 2124 if (IS_ERR(cs)) { 2125 i915_request_add(rq); 2126 err = PTR_ERR(cs); 2127 goto err; 2128 } 2129 2130 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2131 2132 intel_ring_advance(rq, cs); 2133 2134 preempt_disable(); 2135 local_bh_disable(); 2136 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2137 i915_request_add(rq); 2138 local_bh_enable(); 2139 preempt_enable(); 2140 } 2141 2142 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2143 if (err) 2144 goto err; 2145 2146 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2147 elapsed[i] = sema[i] - elapsed[i]; 2148 2149 cycles = trifilter(elapsed); 2150 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2151 ce->engine->name, cycles >> TF_BIAS, 2152 cycles_to_ns(ce->engine, cycles)); 2153 2154 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2155 2156 err: 2157 intel_gt_set_wedged(ce->engine->gt); 2158 return err; 2159 } 2160 2161 static int measure_busy_dispatch(struct intel_context *ce) 2162 { 2163 u32 *sema = hwsp_scratch(ce); 2164 const u32 offset = hwsp_offset(ce, sema); 2165 u32 elapsed[TF_COUNT + 1], cycles; 2166 u32 *cs; 2167 int err; 2168 int i; 2169 2170 /* 2171 * Measure how long it takes for us to submit a request while the 2172 * engine is busy, polling on a semaphore in our context. With 2173 * direct submission, this will include the cost of a lite restore. 
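	 * (A lite restore resubmits the already-running context with an
	 * updated ring tail rather than performing a full context save and
	 * restore.)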
2174 * 2175 * A: read CS_TIMESTAMP from CPU 2176 * submit request 2177 * B: read CS_TIMESTAMP on GPU 2178 * 2179 * Submission latency: B - A 2180 */ 2181 2182 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2183 struct i915_request *rq; 2184 2185 rq = i915_request_create(ce); 2186 if (IS_ERR(rq)) { 2187 err = PTR_ERR(rq); 2188 goto err; 2189 } 2190 2191 cs = intel_ring_begin(rq, 12); 2192 if (IS_ERR(cs)) { 2193 i915_request_add(rq); 2194 err = PTR_ERR(cs); 2195 goto err; 2196 } 2197 2198 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2199 cs = emit_semaphore_poll_until(cs, offset, i); 2200 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2201 2202 intel_ring_advance(rq, cs); 2203 2204 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2205 err = -EIO; 2206 goto err; 2207 } 2208 2209 preempt_disable(); 2210 local_bh_disable(); 2211 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2212 i915_request_add(rq); 2213 local_bh_enable(); 2214 semaphore_set(sema, i - 1); 2215 preempt_enable(); 2216 } 2217 2218 wait_for(READ_ONCE(sema[i - 1]), 500); 2219 semaphore_set(sema, i - 1); 2220 2221 for (i = 1; i <= TF_COUNT; i++) { 2222 GEM_BUG_ON(sema[i] == -1); 2223 elapsed[i - 1] = sema[i] - elapsed[i]; 2224 } 2225 2226 cycles = trifilter(elapsed); 2227 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2228 ce->engine->name, cycles >> TF_BIAS, 2229 cycles_to_ns(ce->engine, cycles)); 2230 2231 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2232 2233 err: 2234 intel_gt_set_wedged(ce->engine->gt); 2235 return err; 2236 } 2237 2238 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2239 { 2240 const u32 offset = 2241 i915_ggtt_offset(engine->status_page.vma) + 2242 offset_in_page(sema); 2243 struct i915_request *rq; 2244 u32 *cs; 2245 2246 rq = i915_request_create(engine->kernel_context); 2247 if (IS_ERR(rq)) 2248 return PTR_ERR(rq); 2249 2250 cs = intel_ring_begin(rq, 4); 2251 if (IS_ERR(cs)) { 2252 i915_request_add(rq); 2253 return PTR_ERR(cs); 2254 } 2255 2256 cs = emit_semaphore_poll(cs, mode, value, offset); 2257 2258 intel_ring_advance(rq, cs); 2259 i915_request_add(rq); 2260 2261 return 0; 2262 } 2263 2264 static int measure_inter_request(struct intel_context *ce) 2265 { 2266 u32 *sema = hwsp_scratch(ce); 2267 const u32 offset = hwsp_offset(ce, sema); 2268 u32 elapsed[TF_COUNT + 1], cycles; 2269 struct i915_sw_fence *submit; 2270 int i, err; 2271 2272 /* 2273 * Measure how long it takes to advance from one request into the 2274 * next. Between each request we flush the GPU caches to memory, 2275 * update the breadcrumbs, and then invalidate those caches. 2276 * We queue up all the requests to be submitted in one batch so 2277 * it should be one set of contiguous measurements. 
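	 * The engine is first plugged by a kernel-context request spinning on
	 * a semaphore, so all the measured requests can be queued before any
	 * of them executes; setting the semaphore afterwards releases the
	 * plug and lets the chain run back-to-back.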
2278 * 2279 * A: read CS_TIMESTAMP on GPU 2280 * advance request 2281 * B: read CS_TIMESTAMP on GPU 2282 * 2283 * Request latency: B - A 2284 */ 2285 2286 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2287 if (err) 2288 return err; 2289 2290 submit = heap_fence_create(GFP_KERNEL); 2291 if (!submit) { 2292 semaphore_set(sema, 1); 2293 return -ENOMEM; 2294 } 2295 2296 intel_engine_flush_submission(ce->engine); 2297 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2298 struct i915_request *rq; 2299 u32 *cs; 2300 2301 rq = i915_request_create(ce); 2302 if (IS_ERR(rq)) { 2303 err = PTR_ERR(rq); 2304 goto err_submit; 2305 } 2306 2307 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 2308 submit, 2309 GFP_KERNEL); 2310 if (err < 0) { 2311 i915_request_add(rq); 2312 goto err_submit; 2313 } 2314 2315 cs = intel_ring_begin(rq, 4); 2316 if (IS_ERR(cs)) { 2317 i915_request_add(rq); 2318 err = PTR_ERR(cs); 2319 goto err_submit; 2320 } 2321 2322 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2323 2324 intel_ring_advance(rq, cs); 2325 i915_request_add(rq); 2326 } 2327 i915_sw_fence_commit(submit); 2328 intel_engine_flush_submission(ce->engine); 2329 heap_fence_put(submit); 2330 2331 semaphore_set(sema, 1); 2332 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2333 if (err) 2334 goto err; 2335 2336 for (i = 1; i <= TF_COUNT; i++) 2337 elapsed[i - 1] = sema[i + 1] - sema[i]; 2338 2339 cycles = trifilter(elapsed); 2340 pr_info("%s: inter-request latency %d cycles, %lluns\n", 2341 ce->engine->name, cycles >> TF_BIAS, 2342 cycles_to_ns(ce->engine, cycles)); 2343 2344 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2345 2346 err_submit: 2347 i915_sw_fence_commit(submit); 2348 heap_fence_put(submit); 2349 semaphore_set(sema, 1); 2350 err: 2351 intel_gt_set_wedged(ce->engine->gt); 2352 return err; 2353 } 2354 2355 static int measure_context_switch(struct intel_context *ce) 2356 { 2357 u32 *sema = hwsp_scratch(ce); 2358 const u32 offset = hwsp_offset(ce, sema); 2359 struct i915_request *fence = NULL; 2360 u32 elapsed[TF_COUNT + 1], cycles; 2361 int i, j, err; 2362 u32 *cs; 2363 2364 /* 2365 * Measure how long it takes to advance from one request in one 2366 * context to a request in another context. This allows us to 2367 * measure how long the context save/restore take, along with all 2368 * the inter-context setup we require. 
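	 * (Requests alternate between this context and the engine's kernel
	 * context, each awaiting the previous request, so the switches occur
	 * back to back.)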
	 *
	 * A: read CS_TIMESTAMP on GPU
	 * switch context
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how long it takes for us to return from the
	 * preemption back to the original context.
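	 * (The preempting request is submitted on the kernel context at
	 * I915_PRIORITY_BARRIER so that it jumps ahead of the semaphore-polling
	 * request in the measured context.)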
2463 * 2464 * A: read CS_TIMESTAMP from CPU 2465 * submit preemption 2466 * B: read CS_TIMESTAMP on GPU (in preempting context) 2467 * context switch 2468 * C: read CS_TIMESTAMP on GPU (in original context) 2469 * 2470 * Preemption dispatch latency: B - A 2471 * Preemption switch latency: C - B 2472 */ 2473 2474 if (!intel_engine_has_preemption(ce->engine)) 2475 return 0; 2476 2477 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2478 u32 addr = offset + 2 * i * sizeof(u32); 2479 struct i915_request *rq; 2480 2481 rq = i915_request_create(ce); 2482 if (IS_ERR(rq)) { 2483 err = PTR_ERR(rq); 2484 goto err; 2485 } 2486 2487 cs = intel_ring_begin(rq, 12); 2488 if (IS_ERR(cs)) { 2489 i915_request_add(rq); 2490 err = PTR_ERR(cs); 2491 goto err; 2492 } 2493 2494 cs = emit_store_dw(cs, addr, -1); 2495 cs = emit_semaphore_poll_until(cs, offset, i); 2496 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2497 2498 intel_ring_advance(rq, cs); 2499 i915_request_add(rq); 2500 2501 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2502 err = -EIO; 2503 goto err; 2504 } 2505 2506 rq = i915_request_create(ce->engine->kernel_context); 2507 if (IS_ERR(rq)) { 2508 err = PTR_ERR(rq); 2509 goto err; 2510 } 2511 2512 cs = intel_ring_begin(rq, 8); 2513 if (IS_ERR(cs)) { 2514 i915_request_add(rq); 2515 err = PTR_ERR(cs); 2516 goto err; 2517 } 2518 2519 cs = emit_timestamp_store(cs, ce, addr); 2520 cs = emit_store_dw(cs, offset, i); 2521 2522 intel_ring_advance(rq, cs); 2523 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2524 2525 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2526 i915_request_add(rq); 2527 } 2528 2529 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2530 err = -EIO; 2531 goto err; 2532 } 2533 2534 for (i = 1; i <= TF_COUNT; i++) 2535 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2536 2537 cycles = trifilter(elapsed); 2538 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2539 ce->engine->name, cycles >> TF_BIAS, 2540 cycles_to_ns(ce->engine, cycles)); 2541 2542 for (i = 1; i <= TF_COUNT; i++) 2543 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2544 2545 cycles = trifilter(elapsed); 2546 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2547 ce->engine->name, cycles >> TF_BIAS, 2548 cycles_to_ns(ce->engine, cycles)); 2549 2550 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2551 2552 err: 2553 intel_gt_set_wedged(ce->engine->gt); 2554 return err; 2555 } 2556 2557 struct signal_cb { 2558 struct dma_fence_cb base; 2559 bool seen; 2560 }; 2561 2562 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2563 { 2564 struct signal_cb *s = container_of(cb, typeof(*s), base); 2565 2566 smp_store_mb(s->seen, true); /* be safe, be strong */ 2567 } 2568 2569 static int measure_completion(struct intel_context *ce) 2570 { 2571 u32 *sema = hwsp_scratch(ce); 2572 const u32 offset = hwsp_offset(ce, sema); 2573 u32 elapsed[TF_COUNT], cycles; 2574 u32 *cs; 2575 int err; 2576 int i; 2577 2578 /* 2579 * Measure how long it takes for the signal (interrupt) to be 2580 * sent from the GPU to be processed by the CPU. 
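	 * (CPU-side completion is detected via a dma_fence callback; once the
	 * callback is seen, RING_TIMESTAMP is sampled from the CPU and compared
	 * against the timestamp the GPU wrote before signalling.)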
	 *
	 * A: read CS_TIMESTAMP on GPU
	 * signal
	 * B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps, gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if
(igt_flush_test(i915)) 2724 err = -EIO; 2725 2726 cpu_latency_qos_remove_request(&qos); 2727 return err; 2728 } 2729 2730 static int s_sync0(void *arg) 2731 { 2732 struct perf_series *ps = arg; 2733 IGT_TIMEOUT(end_time); 2734 unsigned int idx = 0; 2735 int err = 0; 2736 2737 GEM_BUG_ON(!ps->nengines); 2738 do { 2739 struct i915_request *rq; 2740 2741 rq = i915_request_create(ps->ce[idx]); 2742 if (IS_ERR(rq)) { 2743 err = PTR_ERR(rq); 2744 break; 2745 } 2746 2747 i915_request_get(rq); 2748 i915_request_add(rq); 2749 2750 if (i915_request_wait(rq, 0, HZ / 5) < 0) 2751 err = -ETIME; 2752 i915_request_put(rq); 2753 if (err) 2754 break; 2755 2756 if (++idx == ps->nengines) 2757 idx = 0; 2758 } while (!__igt_timeout(end_time, NULL)); 2759 2760 return err; 2761 } 2762 2763 static int s_sync1(void *arg) 2764 { 2765 struct perf_series *ps = arg; 2766 struct i915_request *prev = NULL; 2767 IGT_TIMEOUT(end_time); 2768 unsigned int idx = 0; 2769 int err = 0; 2770 2771 GEM_BUG_ON(!ps->nengines); 2772 do { 2773 struct i915_request *rq; 2774 2775 rq = i915_request_create(ps->ce[idx]); 2776 if (IS_ERR(rq)) { 2777 err = PTR_ERR(rq); 2778 break; 2779 } 2780 2781 i915_request_get(rq); 2782 i915_request_add(rq); 2783 2784 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0) 2785 err = -ETIME; 2786 i915_request_put(prev); 2787 prev = rq; 2788 if (err) 2789 break; 2790 2791 if (++idx == ps->nengines) 2792 idx = 0; 2793 } while (!__igt_timeout(end_time, NULL)); 2794 i915_request_put(prev); 2795 2796 return err; 2797 } 2798 2799 static int s_many(void *arg) 2800 { 2801 struct perf_series *ps = arg; 2802 IGT_TIMEOUT(end_time); 2803 unsigned int idx = 0; 2804 2805 GEM_BUG_ON(!ps->nengines); 2806 do { 2807 struct i915_request *rq; 2808 2809 rq = i915_request_create(ps->ce[idx]); 2810 if (IS_ERR(rq)) 2811 return PTR_ERR(rq); 2812 2813 i915_request_add(rq); 2814 2815 if (++idx == ps->nengines) 2816 idx = 0; 2817 } while (!__igt_timeout(end_time, NULL)); 2818 2819 return 0; 2820 } 2821 2822 static int perf_series_engines(void *arg) 2823 { 2824 struct drm_i915_private *i915 = arg; 2825 static int (* const func[])(void *arg) = { 2826 s_sync0, 2827 s_sync1, 2828 s_many, 2829 NULL, 2830 }; 2831 const unsigned int nengines = num_uabi_engines(i915); 2832 struct intel_engine_cs *engine; 2833 int (* const *fn)(void *arg); 2834 struct pm_qos_request qos; 2835 struct perf_stats *stats; 2836 struct perf_series *ps; 2837 unsigned int idx; 2838 int err = 0; 2839 2840 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL); 2841 if (!stats) 2842 return -ENOMEM; 2843 2844 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL); 2845 if (!ps) { 2846 kfree(stats); 2847 return -ENOMEM; 2848 } 2849 2850 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2851 2852 ps->i915 = i915; 2853 ps->nengines = nengines; 2854 2855 idx = 0; 2856 for_each_uabi_engine(engine, i915) { 2857 struct intel_context *ce; 2858 2859 ce = intel_context_create(engine); 2860 if (IS_ERR(ce)) { 2861 err = PTR_ERR(ce); 2862 goto out; 2863 } 2864 2865 err = intel_context_pin(ce); 2866 if (err) { 2867 intel_context_put(ce); 2868 goto out; 2869 } 2870 2871 ps->ce[idx++] = ce; 2872 } 2873 GEM_BUG_ON(idx != ps->nengines); 2874 2875 for (fn = func; *fn && !err; fn++) { 2876 char name[KSYM_NAME_LEN]; 2877 struct igt_live_test t; 2878 2879 snprintf(name, sizeof(name), "%ps", *fn); 2880 err = igt_live_test_begin(&t, i915, __func__, name); 2881 if (err) 2882 break; 2883 2884 for (idx = 0; idx < nengines; idx++) { 2885 struct perf_stats *p = 2886 memset(&stats[idx], 0, 
sizeof(stats[idx])); 2887 struct intel_context *ce = ps->ce[idx]; 2888 2889 p->engine = ps->ce[idx]->engine; 2890 intel_engine_pm_get(p->engine); 2891 2892 if (intel_engine_supports_stats(p->engine)) 2893 p->busy = intel_engine_get_busy_time(p->engine, 2894 &p->time) + 1; 2895 else 2896 p->time = ktime_get(); 2897 p->runtime = -intel_context_get_total_runtime_ns(ce); 2898 } 2899 2900 err = (*fn)(ps); 2901 if (igt_live_test_end(&t)) 2902 err = -EIO; 2903 2904 for (idx = 0; idx < nengines; idx++) { 2905 struct perf_stats *p = &stats[idx]; 2906 struct intel_context *ce = ps->ce[idx]; 2907 int integer, decimal; 2908 u64 busy, dt, now; 2909 2910 if (p->busy) 2911 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine, 2912 &now), 2913 p->busy - 1); 2914 else 2915 now = ktime_get(); 2916 p->time = ktime_sub(now, p->time); 2917 2918 err = switch_to_kernel_sync(ce, err); 2919 p->runtime += intel_context_get_total_runtime_ns(ce); 2920 intel_engine_pm_put(p->engine); 2921 2922 busy = 100 * ktime_to_ns(p->busy); 2923 dt = ktime_to_ns(p->time); 2924 if (dt) { 2925 integer = div64_u64(busy, dt); 2926 busy -= integer * dt; 2927 decimal = div64_u64(100 * busy, dt); 2928 } else { 2929 integer = 0; 2930 decimal = 0; 2931 } 2932 2933 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 2934 name, p->engine->name, ce->timeline->seqno, 2935 integer, decimal, 2936 div_u64(p->runtime, 1000 * 1000), 2937 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 2938 } 2939 } 2940 2941 out: 2942 for (idx = 0; idx < nengines; idx++) { 2943 if (IS_ERR_OR_NULL(ps->ce[idx])) 2944 break; 2945 2946 intel_context_unpin(ps->ce[idx]); 2947 intel_context_put(ps->ce[idx]); 2948 } 2949 kfree(ps); 2950 2951 cpu_latency_qos_remove_request(&qos); 2952 kfree(stats); 2953 return err; 2954 } 2955 2956 struct p_thread { 2957 struct perf_stats p; 2958 struct kthread_worker *worker; 2959 struct kthread_work work; 2960 struct intel_engine_cs *engine; 2961 int result; 2962 }; 2963 2964 static void p_sync0(struct kthread_work *work) 2965 { 2966 struct p_thread *thread = container_of(work, typeof(*thread), work); 2967 struct perf_stats *p = &thread->p; 2968 struct intel_engine_cs *engine = p->engine; 2969 struct intel_context *ce; 2970 IGT_TIMEOUT(end_time); 2971 unsigned long count; 2972 bool busy; 2973 int err = 0; 2974 2975 ce = intel_context_create(engine); 2976 if (IS_ERR(ce)) { 2977 thread->result = PTR_ERR(ce); 2978 return; 2979 } 2980 2981 err = intel_context_pin(ce); 2982 if (err) { 2983 intel_context_put(ce); 2984 thread->result = err; 2985 return; 2986 } 2987 2988 if (intel_engine_supports_stats(engine)) { 2989 p->busy = intel_engine_get_busy_time(engine, &p->time); 2990 busy = true; 2991 } else { 2992 p->time = ktime_get(); 2993 busy = false; 2994 } 2995 2996 count = 0; 2997 do { 2998 struct i915_request *rq; 2999 3000 rq = i915_request_create(ce); 3001 if (IS_ERR(rq)) { 3002 err = PTR_ERR(rq); 3003 break; 3004 } 3005 3006 i915_request_get(rq); 3007 i915_request_add(rq); 3008 3009 err = 0; 3010 if (i915_request_wait(rq, 0, HZ) < 0) 3011 err = -ETIME; 3012 i915_request_put(rq); 3013 if (err) 3014 break; 3015 3016 count++; 3017 } while (!__igt_timeout(end_time, NULL)); 3018 3019 if (busy) { 3020 ktime_t now; 3021 3022 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3023 p->busy); 3024 p->time = ktime_sub(now, p->time); 3025 } else { 3026 p->time = ktime_sub(ktime_get(), p->time); 3027 } 3028 3029 err = switch_to_kernel_sync(ce, err); 3030 p->runtime = intel_context_get_total_runtime_ns(ce); 3031 
p->count = count; 3032 3033 intel_context_unpin(ce); 3034 intel_context_put(ce); 3035 thread->result = err; 3036 } 3037 3038 static void p_sync1(struct kthread_work *work) 3039 { 3040 struct p_thread *thread = container_of(work, typeof(*thread), work); 3041 struct perf_stats *p = &thread->p; 3042 struct intel_engine_cs *engine = p->engine; 3043 struct i915_request *prev = NULL; 3044 struct intel_context *ce; 3045 IGT_TIMEOUT(end_time); 3046 unsigned long count; 3047 bool busy; 3048 int err = 0; 3049 3050 ce = intel_context_create(engine); 3051 if (IS_ERR(ce)) { 3052 thread->result = PTR_ERR(ce); 3053 return; 3054 } 3055 3056 err = intel_context_pin(ce); 3057 if (err) { 3058 intel_context_put(ce); 3059 thread->result = err; 3060 return; 3061 } 3062 3063 if (intel_engine_supports_stats(engine)) { 3064 p->busy = intel_engine_get_busy_time(engine, &p->time); 3065 busy = true; 3066 } else { 3067 p->time = ktime_get(); 3068 busy = false; 3069 } 3070 3071 count = 0; 3072 do { 3073 struct i915_request *rq; 3074 3075 rq = i915_request_create(ce); 3076 if (IS_ERR(rq)) { 3077 err = PTR_ERR(rq); 3078 break; 3079 } 3080 3081 i915_request_get(rq); 3082 i915_request_add(rq); 3083 3084 err = 0; 3085 if (prev && i915_request_wait(prev, 0, HZ) < 0) 3086 err = -ETIME; 3087 i915_request_put(prev); 3088 prev = rq; 3089 if (err) 3090 break; 3091 3092 count++; 3093 } while (!__igt_timeout(end_time, NULL)); 3094 i915_request_put(prev); 3095 3096 if (busy) { 3097 ktime_t now; 3098 3099 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3100 p->busy); 3101 p->time = ktime_sub(now, p->time); 3102 } else { 3103 p->time = ktime_sub(ktime_get(), p->time); 3104 } 3105 3106 err = switch_to_kernel_sync(ce, err); 3107 p->runtime = intel_context_get_total_runtime_ns(ce); 3108 p->count = count; 3109 3110 intel_context_unpin(ce); 3111 intel_context_put(ce); 3112 thread->result = err; 3113 } 3114 3115 static void p_many(struct kthread_work *work) 3116 { 3117 struct p_thread *thread = container_of(work, typeof(*thread), work); 3118 struct perf_stats *p = &thread->p; 3119 struct intel_engine_cs *engine = p->engine; 3120 struct intel_context *ce; 3121 IGT_TIMEOUT(end_time); 3122 unsigned long count; 3123 int err = 0; 3124 bool busy; 3125 3126 ce = intel_context_create(engine); 3127 if (IS_ERR(ce)) { 3128 thread->result = PTR_ERR(ce); 3129 return; 3130 } 3131 3132 err = intel_context_pin(ce); 3133 if (err) { 3134 intel_context_put(ce); 3135 thread->result = err; 3136 return; 3137 } 3138 3139 if (intel_engine_supports_stats(engine)) { 3140 p->busy = intel_engine_get_busy_time(engine, &p->time); 3141 busy = true; 3142 } else { 3143 p->time = ktime_get(); 3144 busy = false; 3145 } 3146 3147 count = 0; 3148 do { 3149 struct i915_request *rq; 3150 3151 rq = i915_request_create(ce); 3152 if (IS_ERR(rq)) { 3153 err = PTR_ERR(rq); 3154 break; 3155 } 3156 3157 i915_request_add(rq); 3158 count++; 3159 } while (!__igt_timeout(end_time, NULL)); 3160 3161 if (busy) { 3162 ktime_t now; 3163 3164 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now), 3165 p->busy); 3166 p->time = ktime_sub(now, p->time); 3167 } else { 3168 p->time = ktime_sub(ktime_get(), p->time); 3169 } 3170 3171 err = switch_to_kernel_sync(ce, err); 3172 p->runtime = intel_context_get_total_runtime_ns(ce); 3173 p->count = count; 3174 3175 intel_context_unpin(ce); 3176 intel_context_put(ce); 3177 thread->result = err; 3178 } 3179 3180 static int perf_parallel_engines(void *arg) 3181 { 3182 struct drm_i915_private *i915 = arg; 3183 static void (* const 
func[])(struct kthread_work *) = { 3184 p_sync0, 3185 p_sync1, 3186 p_many, 3187 NULL, 3188 }; 3189 const unsigned int nengines = num_uabi_engines(i915); 3190 void (* const *fn)(struct kthread_work *); 3191 struct intel_engine_cs *engine; 3192 struct pm_qos_request qos; 3193 struct p_thread *engines; 3194 int err = 0; 3195 3196 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL); 3197 if (!engines) 3198 return -ENOMEM; 3199 3200 cpu_latency_qos_add_request(&qos, 0); 3201 3202 for (fn = func; *fn; fn++) { 3203 char name[KSYM_NAME_LEN]; 3204 struct igt_live_test t; 3205 unsigned int idx; 3206 3207 snprintf(name, sizeof(name), "%ps", *fn); 3208 err = igt_live_test_begin(&t, i915, __func__, name); 3209 if (err) 3210 break; 3211 3212 atomic_set(&i915->selftest.counter, nengines); 3213 3214 idx = 0; 3215 for_each_uabi_engine(engine, i915) { 3216 struct kthread_worker *worker; 3217 3218 intel_engine_pm_get(engine); 3219 3220 memset(&engines[idx].p, 0, sizeof(engines[idx].p)); 3221 3222 worker = kthread_create_worker(0, "igt:%s", 3223 engine->name); 3224 if (IS_ERR(worker)) { 3225 err = PTR_ERR(worker); 3226 intel_engine_pm_put(engine); 3227 break; 3228 } 3229 engines[idx].worker = worker; 3230 engines[idx].result = 0; 3231 engines[idx].p.engine = engine; 3232 engines[idx].engine = engine; 3233 3234 kthread_init_work(&engines[idx].work, *fn); 3235 kthread_queue_work(worker, &engines[idx].work); 3236 idx++; 3237 } 3238 3239 idx = 0; 3240 for_each_uabi_engine(engine, i915) { 3241 int status; 3242 3243 if (!engines[idx].worker) 3244 break; 3245 3246 kthread_flush_work(&engines[idx].work); 3247 status = READ_ONCE(engines[idx].result); 3248 if (status && !err) 3249 err = status; 3250 3251 intel_engine_pm_put(engine); 3252 3253 kthread_destroy_worker(engines[idx].worker); 3254 idx++; 3255 } 3256 3257 if (igt_live_test_end(&t)) 3258 err = -EIO; 3259 if (err) 3260 break; 3261 3262 idx = 0; 3263 for_each_uabi_engine(engine, i915) { 3264 struct perf_stats *p = &engines[idx].p; 3265 u64 busy = 100 * ktime_to_ns(p->busy); 3266 u64 dt = ktime_to_ns(p->time); 3267 int integer, decimal; 3268 3269 if (dt) { 3270 integer = div64_u64(busy, dt); 3271 busy -= integer * dt; 3272 decimal = div64_u64(100 * busy, dt); 3273 } else { 3274 integer = 0; 3275 decimal = 0; 3276 } 3277 3278 GEM_BUG_ON(engine != p->engine); 3279 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n", 3280 name, engine->name, p->count, integer, decimal, 3281 div_u64(p->runtime, 1000 * 1000), 3282 div_u64(ktime_to_ns(p->time), 1000 * 1000)); 3283 idx++; 3284 } 3285 } 3286 3287 cpu_latency_qos_remove_request(&qos); 3288 kfree(engines); 3289 return err; 3290 } 3291 3292 int i915_request_perf_selftests(struct drm_i915_private *i915) 3293 { 3294 static const struct i915_subtest tests[] = { 3295 SUBTEST(perf_request_latency), 3296 SUBTEST(perf_series_engines), 3297 SUBTEST(perf_parallel_engines), 3298 }; 3299 3300 if (intel_gt_is_wedged(to_gt(i915))) 3301 return 0; 3302 3303 return i915_subtests(tests, i915); 3304 } 3305