1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 *
 */

#include <linux/pm_qos.h>
#include <linux/prime_numbers.h>
#include <linux/sort.h>

#include <drm/drm_print.h>

#include "gem/i915_gem_internal.h"
#include "gem/i915_gem_pm.h"
#include "gem/selftests/mock_context.h"
#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_clock_utils.h"
#include "gt/intel_gt_requests.h"
#include "gt/selftest_engine_heartbeat.h"

#include "i915_random.h"
#include "i915_selftest.h"
#include "i915_wait_util.h"
#include "igt_flush_test.h"
#include "igt_live_test.h"
#include "igt_spinner.h"
#include "lib_sw_fence.h"
#include "mock_drm.h"
#include "mock_gem_device.h"

/* Count the engines exposed to userspace, for sizing per-engine arrays. */
static unsigned int num_uabi_engines(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	unsigned int count;

	count = 0;
	for_each_uabi_engine(engine, i915)
		count++;

	return count;
}

/* Shorthand for the first render engine as seen by userspace. */
static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
{
	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
}

static int igt_add_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;

	/* Basic preliminary test to create a request and let it loose! */

	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
	if (IS_ERR(request))
		return PTR_ERR(request);

	i915_request_add(request);

	return 0;
}

static int igt_wait_request(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, then wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (IS_ERR(request))
		return PTR_ERR(request);

	i915_request_get(request);

	/* Before submission, both a busy query and a timed wait must time out. */
	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) != -ETIME) {
		pr_err("request wait succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed before submit!!\n");
		goto out_request;
	}

	i915_request_add(request);

	/* The mock request signals after a delay of T; shorter waits must fail... */
	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed immediately!\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
		pr_err("request wait succeeded (expected timeout!)\n");
		goto out_request;
	}

	/* ...while waiting for the full delay must succeed. */
	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out!\n");
		goto out_request;
	}

	if (!i915_request_completed(request)) {
		pr_err("request not complete after waiting!\n");
		goto out_request;
	}

	/* Waiting on an already completed request must return immediately. */
	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out when already complete!\n");
		goto out_request;
	}

	err = 0;
out_request:
	i915_request_put(request);
	mock_device_flush(i915);
	return err;
}

static int igt_fence_wait(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, treat it as a fence and wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (IS_ERR(request))
		return PTR_ERR(request);

	/* An unsubmitted fence can never signal, so the wait must time out. */
	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
		pr_err("fence wait success before submit (expected timeout)!\n");
		goto out;
	}

	i915_request_add(request);

	if (dma_fence_is_signaled(&request->fence)) {
		pr_err("fence signaled immediately!\n");
		goto out;
	}

	/* The mock request completes after T, so T/2 must still time out... */
	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
		pr_err("fence wait success after submit (expected timeout)!\n");
		goto out;
	}

	/* ...while waiting for the full delay must succeed. */
	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out (expected success)!\n");
		goto out;
	}

	if (!dma_fence_is_signaled(&request->fence)) {
		pr_err("fence unsignaled after waiting!\n");
		goto out;
	}

	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out when complete (expected success)!\n");
		goto out;
	}

	err = 0;
out:
	mock_device_flush(i915);
	return err;
}

/*
 * Emulate a high-priority request overtaking an earlier slow request by
 * manually rewinding the submission queue: cancel the slow request, submit
 * the vip, then resubmit the slow one behind it.
 */
static int igt_request_rewind(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request, *vip;
	struct i915_gem_context *ctx[2];
	struct intel_context *ce;
	int err = -EINVAL;

	ctx[0] = mock_context(i915, "A");
	if (!ctx[0]) {
		err = -ENOMEM;
		goto err_ctx_0;
	}

	ce = i915_gem_context_get_engine(ctx[0], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	request = mock_request(ce, 2 * HZ); /* slow: completes after 2s */
	intel_context_put(ce);
	if (IS_ERR(request)) {
		err = PTR_ERR(request);
		goto err_context_0;
	}

	i915_request_get(request);
	i915_request_add(request);

	ctx[1] = mock_context(i915, "B");
	if (!ctx[1]) {
		err = -ENOMEM;
		goto err_ctx_1;
	}

	ce = i915_gem_context_get_engine(ctx[1], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	vip = mock_request(ce, 0); /* no delay: should complete immediately */
	intel_context_put(ce);
	if (IS_ERR(vip)) {
		err = PTR_ERR(vip);
		goto err_context_1;
	}

	/* Simulate preemption by manual reordering */
	if (!mock_cancel_request(request)) {
		pr_err("failed to cancel request (already executed)!\n");
		i915_request_add(vip);
		goto err_context_1;
	}
	i915_request_get(vip);
	i915_request_add(vip);
	rcu_read_lock();
	request->engine->submit_request(request); /* resubmit behind the vip */
	rcu_read_unlock();


	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
		pr_err("timed out waiting for high priority request\n");
		goto err;
	}

	if (i915_request_completed(request)) {
		pr_err("low priority request already completed\n");
		goto err;
	}

	err = 0;
err:
	i915_request_put(vip);
err_context_1:
	mock_context_close(ctx[1]);
err_ctx_1:
	i915_request_put(request);
err_context_0:
	mock_context_close(ctx[0]);
err_ctx_0:
	mock_device_flush(i915);
	return err;
}

/* Shared parameters for the breadcrumbs smoketest workers. */
struct smoketest {
	struct intel_engine_cs *engine;
	struct i915_gem_context **contexts;
	atomic_long_t num_waits, num_fences; /* totals accumulated by workers */
	int ncontexts, max_batch;
	struct i915_request *(*request_alloc)(struct intel_context *ce);
};

static struct i915_request *
__mock_request_alloc(struct intel_context *ce)
{
	return mock_request(ce, 0);
}

static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	return intel_context_create_request(ce);
}

/* Per-worker state for the breadcrumbs smoketest. */
struct smoke_thread {
	struct kthread_worker *worker;
	struct kthread_work work;
	struct smoketest *t;
	bool stop;   /* set by the parent to request termination */
	int result;  /* 0 or first error seen by this worker */
};

static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
{
	struct smoke_thread *thread = container_of(work, typeof(*thread), work);
	struct smoketest *t = thread->t;
	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
	const unsigned int total = 4 * t->ncontexts + 1;
	unsigned int num_waits = 0, num_fences = 0;
	struct i915_request **requests;
	I915_RND_STATE(prng);
	unsigned int *order;
	int err = 0;

	/*
	 * A very simple test to catch the most egregious of list handling bugs.
	 *
	 * At its heart, we simply create oodles of requests running across
	 * multiple kthreads and enable signaling on them, for the sole purpose
	 * of stressing our breadcrumb handling. The only inspection we do is
	 * that the fences were marked as signaled.
	 */

	requests = kzalloc_objs(*requests, total);
	if (!requests) {
		thread->result = -ENOMEM;
		return;
	}

	order = i915_random_order(total, &prng);
	if (!order) {
		err = -ENOMEM;
		goto out_requests;
	}

	while (!READ_ONCE(thread->stop)) {
		struct i915_sw_fence *submit, *wait;
		unsigned int n, count;

		/* Gate submission so all requests in a batch start together. */
		submit = heap_fence_create(GFP_KERNEL);
		if (!submit) {
			err = -ENOMEM;
			break;
		}

		wait = heap_fence_create(GFP_KERNEL);
		if (!wait) {
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			err = -ENOMEM;
			break;
		}

		i915_random_reorder(order, total, &prng);
		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);

		for (n = 0; n < count; n++) {
			struct i915_gem_context *ctx =
				t->contexts[order[n] % t->ncontexts];
			struct i915_request *rq;
			struct intel_context *ce;

			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
			GEM_BUG_ON(IS_ERR(ce));
			rq = t->request_alloc(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				count = n; /* only clean up requests we made */
				break;
			}

			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
							       submit,
							       GFP_KERNEL);

			requests[n] = i915_request_get(rq);
			i915_request_add(rq);

			if (err >= 0)
				err = i915_sw_fence_await_dma_fence(wait,
								    &rq->fence,
								    0,
								    GFP_KERNEL);

			if (err < 0) {
				i915_request_put(rq);
				count = n;
				break;
			}
		}

		/* Release the whole batch and wait for every fence to signal. */
		i915_sw_fence_commit(submit);
		i915_sw_fence_commit(wait);

		if (!wait_event_timeout(wait->wait,
					i915_sw_fence_done(wait),
					5 * HZ)) {
			struct i915_request *rq = requests[count - 1];

			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
			       atomic_read(&wait->pending), count,
			       rq->fence.context, rq->fence.seqno,
			       t->engine->name);
			GEM_TRACE_DUMP();

			/* Wedge the GT to force-complete the stuck requests. */
			intel_gt_set_wedged(t->engine->gt);
			GEM_BUG_ON(!i915_request_completed(rq));
			i915_sw_fence_wait(wait);
			err = -EIO;
		}

		for (n = 0; n < count; n++) {
			struct i915_request *rq = requests[n];

			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				      &rq->fence.flags)) {
				pr_err("%llu:%llu was not signaled!\n",
				       rq->fence.context, rq->fence.seqno);
				err = -EINVAL;
			}

			i915_request_put(rq);
		}

		heap_fence_put(wait);
		heap_fence_put(submit);

		if (err < 0)
			break;

		num_fences += count;
		num_waits++;

		cond_resched();
	}

	atomic_long_add(num_fences, &t->num_fences);
	atomic_long_add(num_waits, &t->num_waits);

	kfree(order);
out_requests:
	kfree(requests);
	thread->result = err;
}

static int mock_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct smoketest t = {
		.engine = rcs0(i915),
		.ncontexts = 1024,
		.max_batch = 1024,
		.request_alloc = __mock_request_alloc
	};
	unsigned int ncpus = num_online_cpus();
	struct smoke_thread *threads;
	unsigned int n;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 */

	threads = kzalloc_objs(*threads, ncpus);
	if (!threads)
		return -ENOMEM;

	t.contexts = kzalloc_objs(*t.contexts, t.ncontexts);
	if (!t.contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < t.ncontexts; n++) {
		t.contexts[n] = mock_context(t.engine->i915, "mock");
		if (!t.contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	/* One smoketest worker per online cpu. */
	for (n = 0; n < ncpus; n++) {
		struct kthread_worker *worker;

		worker = kthread_run_worker(0, "igt/%d", n);
		if (IS_ERR(worker)) {
			ret = PTR_ERR(worker);
			ncpus = n; /* only stop/destroy the workers we started */
			break;
		}

		threads[n].worker = worker;
		threads[n].t = &t;
		threads[n].stop = false;
		threads[n].result = 0;

		kthread_init_work(&threads[n].work,
				  __igt_breadcrumbs_smoketest);
		kthread_queue_work(worker, &threads[n].work);
	}

	/* Let the workers hammer breadcrumbs for the selftest timeout. */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

	for (n = 0; n < ncpus; n++) {
		int err;

		WRITE_ONCE(threads[n].stop, true);
		kthread_flush_work(&threads[n].work);
		err = READ_ONCE(threads[n].result);
		if (err < 0 && !ret)
			ret = err; /* report the first failure only */

		kthread_destroy_worker(threads[n].worker);
	}
	pr_info("Completed %lu waits for %lu fence across %d cpus\n",
		atomic_long_read(&t.num_waits),
		atomic_long_read(&t.num_fences),
		ncpus);

out_contexts:
	for (n = 0; n < t.ncontexts; n++) {
		if (!t.contexts[n])
			break; /* contexts are created in order, NULL = end */
		mock_context_close(t.contexts[n]);
	}
	kfree(t.contexts);
out_threads:
	kfree(threads);
	return ret;
}

/* Entry point for the mock (no hardware) i915_request selftests. */
int i915_request_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_add_request),
		SUBTEST(igt_wait_request),
		SUBTEST(igt_fence_wait),
		SUBTEST(igt_request_rewind),
		SUBTEST(mock_breadcrumbs_smoketest),
	};
	struct drm_i915_private *i915;
	intel_wakeref_t wakeref;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		err = i915_subtests(tests, i915);

	mock_destroy_device(i915);

	return err;
}

static int live_nop_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	int err = -ENODEV;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	for_each_uabi_engine(engine, i915) {
		unsigned long n, prime;
		IGT_TIMEOUT(end_time);
		ktime_t times[2] = {};

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			return err;

		intel_engine_pm_get(engine);
		for_each_prime_number_from(prime, 1, 8192) {
			struct i915_request *request = NULL;

			times[1] = ktime_get_raw();

			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = i915_request_create(engine->kernel_context);
				if (IS_ERR(request))
					/*
					 * NOTE(review): returning here leaks the
					 * intel_engine_pm_get() wakeref taken
					 * above — confirm and route through a
					 * cleanup path instead.
					 */
					return PTR_ERR(request);

				/*
				 * This space is left intentionally blank.
				 *
				 * We do not actually want to perform any
				 * action with this request, we just want
				 * to measure the latency in allocation
				 * and submission of our breadcrumbs -
				 * ensuring that the bare request is sufficient
				 * for the system to work (i.e. proper HEAD
				 * tracking of the rings, interrupt handling,
				 * etc). It also gives us the lowest bounds
				 * for latency.
				 */

				i915_request_get(request);
				i915_request_add(request);
			}
			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
			i915_request_put(request);

			times[1] = ktime_sub(ktime_get_raw(), times[1]);
			if (prime == 1)
				times[0] = times[1];

			if (__igt_timeout(end_time, NULL))
				break;
		}
		intel_engine_pm_put(engine);

		err = igt_live_test_end(&t);
		if (err)
			return err;

		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
			engine->name,
			ktime_to_ns(times[0]),
			prime, div64_u64(ktime_to_ns(times[1]), prime));
	}

	return err;
}

/* Cancel a request before it is submitted: the fence must carry -EINTR. */
static int __cancel_inactive(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	/* Cancel before i915_request_add(), i.e. while still inactive. */
	pr_debug("%s: Cancelling inactive request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel inactive request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/* Cancel a request while it is actively spinning on the GPU. */
static int __cancel_active(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	pr_debug("%s: Cancelling active request\n", engine->name);
	i915_request_get(rq);
	i915_request_add(rq);
	/* Only cancel once the spinner is confirmed running on the GPU. */
	if (!igt_wait_for_spinner(&spin, rq)) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("Failed to start spinner on %s\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}
	i915_request_cancel(rq, -EINTR);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel active request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Cancel a request that has already completed: the fence error must be left
 * untouched (no -EINTR after the fact).
 */
static int __cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	igt_spinner_end(&spin); /* let the request complete immediately */
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	if (rq->fence.error) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}

/*
 * Test to prove a non-preemptable request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for spinner to start, create a NOP request and submit it, cancel the
 * spinner, wait for spinner to complete and verify it failed with an error,
 * finally wait for NOP request to complete verify it succeeded without an
 * error. Preemption timeout also reduced / restored so test runs in a timely
 * manner.
817 */ 818 static int __cancel_reset(struct drm_i915_private *i915, 819 struct intel_engine_cs *engine) 820 { 821 struct intel_context *ce; 822 struct igt_spinner spin; 823 struct i915_request *rq, *nop; 824 unsigned long preempt_timeout_ms; 825 int err = 0; 826 827 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT || 828 !intel_has_reset_engine(engine->gt)) 829 return 0; 830 831 preempt_timeout_ms = engine->props.preempt_timeout_ms; 832 engine->props.preempt_timeout_ms = 100; 833 834 if (igt_spinner_init(&spin, engine->gt)) 835 goto out_restore; 836 837 ce = intel_context_create(engine); 838 if (IS_ERR(ce)) { 839 err = PTR_ERR(ce); 840 goto out_spin; 841 } 842 843 rq = igt_spinner_create_request(&spin, ce, MI_NOOP); 844 if (IS_ERR(rq)) { 845 err = PTR_ERR(rq); 846 goto out_ce; 847 } 848 849 pr_debug("%s: Cancelling active non-preemptable request\n", 850 engine->name); 851 i915_request_get(rq); 852 i915_request_add(rq); 853 if (!igt_wait_for_spinner(&spin, rq)) { 854 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 855 856 pr_err("Failed to start spinner on %s\n", engine->name); 857 intel_engine_dump(engine, &p, "%s\n", engine->name); 858 err = -ETIME; 859 goto out_rq; 860 } 861 862 nop = intel_context_create_request(ce); 863 if (IS_ERR(nop)) 864 goto out_rq; 865 i915_request_get(nop); 866 i915_request_add(nop); 867 868 i915_request_cancel(rq, -EINTR); 869 870 if (i915_request_wait(rq, 0, HZ) < 0) { 871 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 872 873 pr_err("%s: Failed to cancel hung request\n", engine->name); 874 intel_engine_dump(engine, &p, "%s\n", engine->name); 875 err = -ETIME; 876 goto out_nop; 877 } 878 879 if (rq->fence.error != -EINTR) { 880 pr_err("%s: fence not cancelled (%u)\n", 881 engine->name, rq->fence.error); 882 err = -EINVAL; 883 goto out_nop; 884 } 885 886 if (i915_request_wait(nop, 0, HZ) < 0) { 887 struct drm_printer p = drm_info_printer(engine->i915->drm.dev); 888 889 pr_err("%s: Failed to complete nop request\n", 
engine->name); 890 intel_engine_dump(engine, &p, "%s\n", engine->name); 891 err = -ETIME; 892 goto out_nop; 893 } 894 895 if (nop->fence.error != 0) { 896 pr_err("%s: Nop request errored (%u)\n", 897 engine->name, nop->fence.error); 898 err = -EINVAL; 899 } 900 901 out_nop: 902 i915_request_put(nop); 903 out_rq: 904 i915_request_put(rq); 905 out_ce: 906 intel_context_put(ce); 907 out_spin: 908 igt_spinner_fini(&spin); 909 out_restore: 910 engine->props.preempt_timeout_ms = preempt_timeout_ms; 911 if (err) 912 pr_err("%s: %s error %d\n", __func__, engine->name, err); 913 return err; 914 } 915 916 static int live_cancel_request(void *arg) 917 { 918 struct drm_i915_private *i915 = arg; 919 struct intel_engine_cs *engine; 920 921 /* 922 * Check cancellation of requests. We expect to be able to immediately 923 * cancel active requests, even if they are currently on the GPU. 924 */ 925 926 for_each_uabi_engine(engine, i915) { 927 struct igt_live_test t; 928 int err, err2; 929 930 if (!intel_engine_has_preemption(engine)) 931 continue; 932 933 err = igt_live_test_begin(&t, i915, __func__, engine->name); 934 if (err) 935 return err; 936 937 err = __cancel_inactive(engine); 938 if (err == 0) 939 err = __cancel_active(engine); 940 if (err == 0) 941 err = __cancel_completed(engine); 942 943 err2 = igt_live_test_end(&t); 944 if (err) 945 return err; 946 if (err2) 947 return err2; 948 949 /* Expects reset so call outside of igt_live_test_* */ 950 err = __cancel_reset(i915, engine); 951 if (err) 952 return err; 953 954 if (igt_flush_test(i915)) 955 return -EIO; 956 } 957 958 return 0; 959 } 960 961 static struct i915_vma *empty_batch(struct intel_gt *gt) 962 { 963 struct drm_i915_gem_object *obj; 964 struct i915_vma *vma; 965 u32 *cmd; 966 int err; 967 968 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 969 if (IS_ERR(obj)) 970 return ERR_CAST(obj); 971 972 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 973 if (IS_ERR(cmd)) { 974 err = PTR_ERR(cmd); 975 
goto err; 976 } 977 978 *cmd = MI_BATCH_BUFFER_END; 979 980 __i915_gem_object_flush_map(obj, 0, 64); 981 i915_gem_object_unpin_map(obj); 982 983 intel_gt_chipset_flush(gt); 984 985 vma = i915_vma_instance(obj, gt->vm, NULL); 986 if (IS_ERR(vma)) { 987 err = PTR_ERR(vma); 988 goto err; 989 } 990 991 err = i915_vma_pin(vma, 0, 0, PIN_USER); 992 if (err) 993 goto err; 994 995 /* Force the wait now to avoid including it in the benchmark */ 996 err = i915_vma_sync(vma); 997 if (err) 998 goto err_pin; 999 1000 return vma; 1001 1002 err_pin: 1003 i915_vma_unpin(vma); 1004 err: 1005 i915_gem_object_put(obj); 1006 return ERR_PTR(err); 1007 } 1008 1009 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch) 1010 { 1011 return rq->engine->emit_bb_start(rq, 1012 i915_vma_offset(batch), 1013 i915_vma_size(batch), 1014 0); 1015 } 1016 1017 static struct i915_request * 1018 empty_request(struct intel_engine_cs *engine, 1019 struct i915_vma *batch) 1020 { 1021 struct i915_request *request; 1022 int err; 1023 1024 request = i915_request_create(engine->kernel_context); 1025 if (IS_ERR(request)) 1026 return request; 1027 1028 err = emit_bb_start(request, batch); 1029 if (err) 1030 goto out_request; 1031 1032 i915_request_get(request); 1033 out_request: 1034 i915_request_add(request); 1035 return err ? ERR_PTR(err) : request; 1036 } 1037 1038 static int live_empty_request(void *arg) 1039 { 1040 struct drm_i915_private *i915 = arg; 1041 struct intel_engine_cs *engine; 1042 struct igt_live_test t; 1043 int err; 1044 1045 /* 1046 * Submit various sized batches of empty requests, to each engine 1047 * (individually), and wait for the batch to complete. We can check 1048 * the overhead of submitting requests to the hardware. 
1049 */ 1050 1051 for_each_uabi_engine(engine, i915) { 1052 IGT_TIMEOUT(end_time); 1053 struct i915_request *request; 1054 struct i915_vma *batch; 1055 unsigned long n, prime; 1056 ktime_t times[2] = {}; 1057 1058 batch = empty_batch(engine->gt); 1059 if (IS_ERR(batch)) 1060 return PTR_ERR(batch); 1061 1062 err = igt_live_test_begin(&t, i915, __func__, engine->name); 1063 if (err) 1064 goto out_batch; 1065 1066 intel_engine_pm_get(engine); 1067 1068 /* Warmup / preload */ 1069 request = empty_request(engine, batch); 1070 if (IS_ERR(request)) { 1071 err = PTR_ERR(request); 1072 intel_engine_pm_put(engine); 1073 goto out_batch; 1074 } 1075 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1076 1077 for_each_prime_number_from(prime, 1, 8192) { 1078 times[1] = ktime_get_raw(); 1079 1080 for (n = 0; n < prime; n++) { 1081 i915_request_put(request); 1082 request = empty_request(engine, batch); 1083 if (IS_ERR(request)) { 1084 err = PTR_ERR(request); 1085 intel_engine_pm_put(engine); 1086 goto out_batch; 1087 } 1088 } 1089 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 1090 1091 times[1] = ktime_sub(ktime_get_raw(), times[1]); 1092 if (prime == 1) 1093 times[0] = times[1]; 1094 1095 if (__igt_timeout(end_time, NULL)) 1096 break; 1097 } 1098 i915_request_put(request); 1099 intel_engine_pm_put(engine); 1100 1101 err = igt_live_test_end(&t); 1102 if (err) 1103 goto out_batch; 1104 1105 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 1106 engine->name, 1107 ktime_to_ns(times[0]), 1108 prime, div64_u64(ktime_to_ns(times[1]), prime)); 1109 out_batch: 1110 i915_vma_unpin(batch); 1111 i915_vma_put(batch); 1112 if (err) 1113 break; 1114 } 1115 1116 return err; 1117 } 1118 1119 static struct i915_vma *recursive_batch(struct intel_gt *gt) 1120 { 1121 struct drm_i915_gem_object *obj; 1122 const int ver = GRAPHICS_VER(gt->i915); 1123 struct i915_vma *vma; 1124 u32 *cmd; 1125 int err; 1126 1127 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE); 1128 if 
(IS_ERR(obj)) 1129 return ERR_CAST(obj); 1130 1131 vma = i915_vma_instance(obj, gt->vm, NULL); 1132 if (IS_ERR(vma)) { 1133 err = PTR_ERR(vma); 1134 goto err; 1135 } 1136 1137 err = i915_vma_pin(vma, 0, 0, PIN_USER); 1138 if (err) 1139 goto err; 1140 1141 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC); 1142 if (IS_ERR(cmd)) { 1143 err = PTR_ERR(cmd); 1144 goto err; 1145 } 1146 1147 if (ver >= 8) { 1148 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 1149 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1150 *cmd++ = upper_32_bits(i915_vma_offset(vma)); 1151 } else if (ver >= 6) { 1152 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 1153 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1154 } else { 1155 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 1156 *cmd++ = lower_32_bits(i915_vma_offset(vma)); 1157 } 1158 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 1159 1160 __i915_gem_object_flush_map(obj, 0, 64); 1161 i915_gem_object_unpin_map(obj); 1162 1163 intel_gt_chipset_flush(gt); 1164 1165 return vma; 1166 1167 err: 1168 i915_gem_object_put(obj); 1169 return ERR_PTR(err); 1170 } 1171 1172 static int recursive_batch_resolve(struct i915_vma *batch) 1173 { 1174 u32 *cmd; 1175 1176 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC); 1177 if (IS_ERR(cmd)) 1178 return PTR_ERR(cmd); 1179 1180 *cmd = MI_BATCH_BUFFER_END; 1181 1182 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 1183 i915_gem_object_unpin_map(batch->obj); 1184 1185 intel_gt_chipset_flush(batch->vm->gt); 1186 1187 return 0; 1188 } 1189 1190 static int live_all_engines(void *arg) 1191 { 1192 struct drm_i915_private *i915 = arg; 1193 const unsigned int nengines = num_uabi_engines(i915); 1194 struct intel_engine_cs *engine; 1195 struct i915_request **request; 1196 struct igt_live_test t; 1197 unsigned int idx; 1198 int err; 1199 1200 /* 1201 * Check we can submit requests to all engines simultaneously. 
We 1202 * send a recursive batch to each engine - checking that we don't 1203 * block doing so, and that they don't complete too soon. 1204 */ 1205 1206 request = kzalloc_objs(*request, nengines); 1207 if (!request) 1208 return -ENOMEM; 1209 1210 err = igt_live_test_begin(&t, i915, __func__, ""); 1211 if (err) 1212 goto out_free; 1213 1214 idx = 0; 1215 for_each_uabi_engine(engine, i915) { 1216 struct i915_vma *batch; 1217 1218 batch = recursive_batch(engine->gt); 1219 if (IS_ERR(batch)) { 1220 err = PTR_ERR(batch); 1221 pr_err("%s: Unable to create batch, err=%d\n", 1222 __func__, err); 1223 goto out_free; 1224 } 1225 1226 i915_vma_lock(batch); 1227 request[idx] = intel_engine_create_kernel_request(engine); 1228 if (IS_ERR(request[idx])) { 1229 err = PTR_ERR(request[idx]); 1230 pr_err("%s: Request allocation failed with err=%d\n", 1231 __func__, err); 1232 goto out_unlock; 1233 } 1234 GEM_BUG_ON(request[idx]->context->vm != batch->vm); 1235 1236 err = i915_vma_move_to_active(batch, request[idx], 0); 1237 GEM_BUG_ON(err); 1238 1239 err = emit_bb_start(request[idx], batch); 1240 GEM_BUG_ON(err); 1241 request[idx]->batch = batch; 1242 1243 i915_request_get(request[idx]); 1244 i915_request_add(request[idx]); 1245 idx++; 1246 out_unlock: 1247 i915_vma_unlock(batch); 1248 if (err) 1249 goto out_request; 1250 } 1251 1252 idx = 0; 1253 for_each_uabi_engine(engine, i915) { 1254 if (i915_request_completed(request[idx])) { 1255 pr_err("%s(%s): request completed too early!\n", 1256 __func__, engine->name); 1257 err = -EINVAL; 1258 goto out_request; 1259 } 1260 idx++; 1261 } 1262 1263 idx = 0; 1264 for_each_uabi_engine(engine, i915) { 1265 err = recursive_batch_resolve(request[idx]->batch); 1266 if (err) { 1267 pr_err("%s: failed to resolve batch, err=%d\n", 1268 __func__, err); 1269 goto out_request; 1270 } 1271 idx++; 1272 } 1273 1274 idx = 0; 1275 for_each_uabi_engine(engine, i915) { 1276 struct i915_request *rq = request[idx]; 1277 long timeout; 1278 1279 timeout = 
i915_request_wait(rq, 0, 1280 MAX_SCHEDULE_TIMEOUT); 1281 if (timeout < 0) { 1282 err = timeout; 1283 pr_err("%s: error waiting for request on %s, err=%d\n", 1284 __func__, engine->name, err); 1285 goto out_request; 1286 } 1287 1288 GEM_BUG_ON(!i915_request_completed(rq)); 1289 i915_vma_unpin(rq->batch); 1290 i915_vma_put(rq->batch); 1291 i915_request_put(rq); 1292 request[idx] = NULL; 1293 idx++; 1294 } 1295 1296 err = igt_live_test_end(&t); 1297 1298 out_request: 1299 idx = 0; 1300 for_each_uabi_engine(engine, i915) { 1301 struct i915_request *rq = request[idx]; 1302 1303 if (!rq) 1304 continue; 1305 1306 if (rq->batch) { 1307 i915_vma_unpin(rq->batch); 1308 i915_vma_put(rq->batch); 1309 } 1310 i915_request_put(rq); 1311 idx++; 1312 } 1313 out_free: 1314 kfree(request); 1315 return err; 1316 } 1317 1318 static int live_sequential_engines(void *arg) 1319 { 1320 struct drm_i915_private *i915 = arg; 1321 const unsigned int nengines = num_uabi_engines(i915); 1322 struct i915_request **request; 1323 struct i915_request *prev = NULL; 1324 struct intel_engine_cs *engine; 1325 struct igt_live_test t; 1326 unsigned int idx; 1327 int err; 1328 1329 /* 1330 * Check we can submit requests to all engines sequentially, such 1331 * that each successive request waits for the earlier ones. This 1332 * tests that we don't execute requests out of order, even though 1333 * they are running on independent engines. 
static int live_sequential_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	struct i915_request **request;
	struct i915_request *prev = NULL;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines sequentially, such
	 * that each successive request waits for the earlier ones. This
	 * tests that we don't execute requests out of order, even though
	 * they are running on independent engines.
	 */

	request = kzalloc_objs(*request, nengines);
	if (!request)
		return -ENOMEM;

	err = igt_live_test_begin(&t, i915, __func__, "");
	if (err)
		goto out_free;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct i915_vma *batch;

		/* A self-referencing batch that spins until resolved. */
		batch = recursive_batch(engine->gt);
		if (IS_ERR(batch)) {
			err = PTR_ERR(batch);
			pr_err("%s: Unable to create batch for %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_free;
		}

		i915_vma_lock(batch);
		request[idx] = intel_engine_create_kernel_request(engine);
		if (IS_ERR(request[idx])) {
			err = PTR_ERR(request[idx]);
			pr_err("%s: Request allocation failed for %s with err=%d\n",
			       __func__, engine->name, err);
			goto out_unlock;
		}
		GEM_BUG_ON(request[idx]->context->vm != batch->vm);

		if (prev) {
			/* Chain this request behind the previous engine's. */
			err = i915_request_await_dma_fence(request[idx],
							   &prev->fence);
			if (err) {
				i915_request_add(request[idx]);
				pr_err("%s: Request await failed for %s with err=%d\n",
				       __func__, engine->name, err);
				goto out_unlock;
			}
		}

		err = i915_vma_move_to_active(batch, request[idx], 0);
		GEM_BUG_ON(err);

		err = emit_bb_start(request[idx], batch);
		GEM_BUG_ON(err);
		request[idx]->batch = batch;

		i915_request_get(request[idx]);
		i915_request_add(request[idx]);

		prev = request[idx];
		idx++;

out_unlock:
		i915_vma_unlock(batch);
		if (err)
			goto out_request;
	}

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		long timeout;

		/* The batch spins until resolved below; completion is early. */
		if (i915_request_completed(request[idx])) {
			pr_err("%s(%s): request completed too early!\n",
			       __func__, engine->name);
			err = -EINVAL;
			goto out_request;
		}

		/* Rewrite the spinning batch to MI_BATCH_BUFFER_END. */
		err = recursive_batch_resolve(request[idx]->batch);
		if (err) {
			pr_err("%s: failed to resolve batch, err=%d\n",
			       __func__, err);
			goto out_request;
		}

		timeout = i915_request_wait(request[idx], 0,
					    MAX_SCHEDULE_TIMEOUT);
		if (timeout < 0) {
			err = timeout;
			pr_err("%s: error waiting for request on %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_request;
		}

		GEM_BUG_ON(!i915_request_completed(request[idx]));
		idx++;
	}

	err = igt_live_test_end(&t);

out_request:
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		u32 *cmd;

		/* Slots fill front to back, so stop at the first hole. */
		if (!request[idx])
			break;

		/* Terminate any still-spinning batch before releasing it. */
		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
						       I915_MAP_WC);
		if (!IS_ERR(cmd)) {
			*cmd = MI_BATCH_BUFFER_END;

			__i915_gem_object_flush_map(request[idx]->batch->obj,
						    0, sizeof(*cmd));
			i915_gem_object_unpin_map(request[idx]->batch->obj);

			intel_gt_chipset_flush(engine->gt);
		}

		i915_vma_put(request[idx]->batch);
		i915_request_put(request[idx]);
		idx++;
	}
out_free:
	kfree(request);
	return err;
}
pr_info("%s: %lu request + sync\n", engine->name, count); 1500 thread->result = err; 1501 } 1502 1503 static void __live_parallel_engineN(struct kthread_work *work) 1504 { 1505 struct parallel_thread *thread = 1506 container_of(work, typeof(*thread), work); 1507 struct intel_engine_cs *engine = thread->engine; 1508 IGT_TIMEOUT(end_time); 1509 unsigned long count; 1510 int err = 0; 1511 1512 count = 0; 1513 intel_engine_pm_get(engine); 1514 do { 1515 struct i915_request *rq; 1516 1517 rq = i915_request_create(engine->kernel_context); 1518 if (IS_ERR(rq)) { 1519 err = PTR_ERR(rq); 1520 break; 1521 } 1522 1523 i915_request_add(rq); 1524 count++; 1525 } while (!__igt_timeout(end_time, NULL)); 1526 intel_engine_pm_put(engine); 1527 1528 pr_info("%s: %lu requests\n", engine->name, count); 1529 thread->result = err; 1530 } 1531 1532 static bool wake_all(struct drm_i915_private *i915) 1533 { 1534 if (atomic_dec_and_test(&i915->selftest.counter)) { 1535 wake_up_var(&i915->selftest.counter); 1536 return true; 1537 } 1538 1539 return false; 1540 } 1541 1542 static int wait_for_all(struct drm_i915_private *i915) 1543 { 1544 if (wake_all(i915)) 1545 return 0; 1546 1547 if (wait_var_event_timeout(&i915->selftest.counter, 1548 !atomic_read(&i915->selftest.counter), 1549 i915_selftest.timeout_jiffies)) 1550 return 0; 1551 1552 return -ETIME; 1553 } 1554 1555 static void __live_parallel_spin(struct kthread_work *work) 1556 { 1557 struct parallel_thread *thread = 1558 container_of(work, typeof(*thread), work); 1559 struct intel_engine_cs *engine = thread->engine; 1560 struct igt_spinner spin; 1561 struct i915_request *rq; 1562 int err = 0; 1563 1564 /* 1565 * Create a spinner running for eternity on each engine. If a second 1566 * spinner is incorrectly placed on the same engine, it will not be 1567 * able to start in time. 
1568 */ 1569 1570 if (igt_spinner_init(&spin, engine->gt)) { 1571 wake_all(engine->i915); 1572 thread->result = -ENOMEM; 1573 return; 1574 } 1575 1576 intel_engine_pm_get(engine); 1577 rq = igt_spinner_create_request(&spin, 1578 engine->kernel_context, 1579 MI_NOOP); /* no preemption */ 1580 intel_engine_pm_put(engine); 1581 if (IS_ERR(rq)) { 1582 err = PTR_ERR(rq); 1583 if (err == -ENODEV) 1584 err = 0; 1585 wake_all(engine->i915); 1586 goto out_spin; 1587 } 1588 1589 i915_request_get(rq); 1590 i915_request_add(rq); 1591 if (igt_wait_for_spinner(&spin, rq)) { 1592 /* Occupy this engine for the whole test */ 1593 err = wait_for_all(engine->i915); 1594 } else { 1595 pr_err("Failed to start spinner on %s\n", engine->name); 1596 err = -EINVAL; 1597 } 1598 igt_spinner_end(&spin); 1599 1600 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0) 1601 err = -EIO; 1602 i915_request_put(rq); 1603 1604 out_spin: 1605 igt_spinner_fini(&spin); 1606 thread->result = err; 1607 } 1608 1609 static int live_parallel_engines(void *arg) 1610 { 1611 struct drm_i915_private *i915 = arg; 1612 static void (* const func[])(struct kthread_work *) = { 1613 __live_parallel_engine1, 1614 __live_parallel_engineN, 1615 __live_parallel_spin, 1616 NULL, 1617 }; 1618 const unsigned int nengines = num_uabi_engines(i915); 1619 struct parallel_thread *threads; 1620 struct intel_engine_cs *engine; 1621 void (* const *fn)(struct kthread_work *); 1622 int err = 0; 1623 1624 /* 1625 * Check we can submit requests to all engines concurrently. This 1626 * tests that we load up the system maximally. 
1627 */ 1628 1629 threads = kzalloc_objs(*threads, nengines); 1630 if (!threads) 1631 return -ENOMEM; 1632 1633 for (fn = func; !err && *fn; fn++) { 1634 char name[KSYM_NAME_LEN]; 1635 struct igt_live_test t; 1636 unsigned int idx; 1637 1638 snprintf(name, sizeof(name), "%ps", *fn); 1639 err = igt_live_test_begin(&t, i915, __func__, name); 1640 if (err) 1641 break; 1642 1643 atomic_set(&i915->selftest.counter, nengines); 1644 1645 idx = 0; 1646 for_each_uabi_engine(engine, i915) { 1647 struct kthread_worker *worker; 1648 1649 worker = kthread_run_worker(0, "igt/parallel:%s", 1650 engine->name); 1651 if (IS_ERR(worker)) { 1652 err = PTR_ERR(worker); 1653 break; 1654 } 1655 1656 threads[idx].worker = worker; 1657 threads[idx].result = 0; 1658 threads[idx].engine = engine; 1659 1660 kthread_init_work(&threads[idx].work, *fn); 1661 kthread_queue_work(worker, &threads[idx].work); 1662 idx++; 1663 } 1664 1665 idx = 0; 1666 for_each_uabi_engine(engine, i915) { 1667 int status; 1668 1669 if (!threads[idx].worker) 1670 break; 1671 1672 kthread_flush_work(&threads[idx].work); 1673 status = READ_ONCE(threads[idx].result); 1674 if (status && !err) 1675 err = status; 1676 1677 kthread_destroy_worker(threads[idx++].worker); 1678 } 1679 1680 if (igt_live_test_end(&t)) 1681 err = -EIO; 1682 } 1683 1684 kfree(threads); 1685 return err; 1686 } 1687 1688 static int 1689 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1690 { 1691 struct i915_request *rq; 1692 int ret; 1693 1694 /* 1695 * Before execlists, all contexts share the same ringbuffer. With 1696 * execlists, each context/engine has a separate ringbuffer and 1697 * for the purposes of this test, inexhaustible. 1698 * 1699 * For the global ringbuffer though, we have to be very careful 1700 * that we do not wrap while preventing the execution of requests 1701 * with a unsignaled fence. 
1702 */ 1703 if (HAS_EXECLISTS(ctx->i915)) 1704 return INT_MAX; 1705 1706 rq = igt_request_alloc(ctx, engine); 1707 if (IS_ERR(rq)) { 1708 ret = PTR_ERR(rq); 1709 } else { 1710 int sz; 1711 1712 ret = rq->ring->size - rq->reserved_space; 1713 i915_request_add(rq); 1714 1715 sz = rq->ring->emit - rq->head; 1716 if (sz < 0) 1717 sz += rq->ring->size; 1718 ret /= sz; 1719 ret /= 2; /* leave half spare, in case of emergency! */ 1720 } 1721 1722 return ret; 1723 } 1724 1725 static int live_breadcrumbs_smoketest(void *arg) 1726 { 1727 struct drm_i915_private *i915 = arg; 1728 const unsigned int nengines = num_uabi_engines(i915); 1729 const unsigned int ncpus = /* saturate with nengines * ncpus */ 1730 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines)); 1731 unsigned long num_waits, num_fences; 1732 struct intel_engine_cs *engine; 1733 struct smoke_thread *threads; 1734 struct igt_live_test live; 1735 intel_wakeref_t wakeref; 1736 struct smoketest *smoke; 1737 unsigned int n, idx; 1738 struct file *file; 1739 int ret = 0; 1740 1741 /* 1742 * Smoketest our breadcrumb/signal handling for requests across multiple 1743 * threads. A very simple test to only catch the most egregious of bugs. 1744 * See __igt_breadcrumbs_smoketest(); 1745 * 1746 * On real hardware this time. 
1747 */ 1748 1749 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1750 1751 file = mock_file(i915); 1752 if (IS_ERR(file)) { 1753 ret = PTR_ERR(file); 1754 goto out_rpm; 1755 } 1756 1757 smoke = kzalloc_objs(*smoke, nengines); 1758 if (!smoke) { 1759 ret = -ENOMEM; 1760 goto out_file; 1761 } 1762 1763 threads = kzalloc_objs(*threads, ncpus * nengines); 1764 if (!threads) { 1765 ret = -ENOMEM; 1766 goto out_smoke; 1767 } 1768 1769 smoke[0].request_alloc = __live_request_alloc; 1770 smoke[0].ncontexts = 64; 1771 smoke[0].contexts = kzalloc_objs(*smoke[0].contexts, smoke[0].ncontexts); 1772 if (!smoke[0].contexts) { 1773 ret = -ENOMEM; 1774 goto out_threads; 1775 } 1776 1777 for (n = 0; n < smoke[0].ncontexts; n++) { 1778 smoke[0].contexts[n] = live_context(i915, file); 1779 if (IS_ERR(smoke[0].contexts[n])) { 1780 ret = PTR_ERR(smoke[0].contexts[n]); 1781 goto out_contexts; 1782 } 1783 } 1784 1785 ret = igt_live_test_begin(&live, i915, __func__, ""); 1786 if (ret) 1787 goto out_contexts; 1788 1789 idx = 0; 1790 for_each_uabi_engine(engine, i915) { 1791 smoke[idx] = smoke[0]; 1792 smoke[idx].engine = engine; 1793 smoke[idx].max_batch = 1794 max_batches(smoke[0].contexts[0], engine); 1795 if (smoke[idx].max_batch < 0) { 1796 ret = smoke[idx].max_batch; 1797 goto out_flush; 1798 } 1799 /* One ring interleaved between requests from all cpus */ 1800 smoke[idx].max_batch /= ncpus + 1; 1801 pr_debug("Limiting batches to %d requests on %s\n", 1802 smoke[idx].max_batch, engine->name); 1803 1804 for (n = 0; n < ncpus; n++) { 1805 unsigned int i = idx * ncpus + n; 1806 struct kthread_worker *worker; 1807 1808 worker = kthread_run_worker(0, "igt/%d.%d", idx, n); 1809 if (IS_ERR(worker)) { 1810 ret = PTR_ERR(worker); 1811 goto out_flush; 1812 } 1813 1814 threads[i].worker = worker; 1815 threads[i].t = &smoke[idx]; 1816 1817 kthread_init_work(&threads[i].work, 1818 __igt_breadcrumbs_smoketest); 1819 kthread_queue_work(worker, &threads[i].work); 1820 } 1821 1822 idx++; 1823 } 
/* Entry point: run the request selftests against real hardware. */
int i915_request_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_nop_request),
		SUBTEST(live_all_engines),
		SUBTEST(live_sequential_engines),
		SUBTEST(live_parallel_engines),
		SUBTEST(live_empty_request),
		SUBTEST(live_cancel_request),
		SUBTEST(live_breadcrumbs_smoketest),
	};

	/* Nothing to exercise if the GPU is already terminally wedged. */
	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_live_subtests(tests, i915);
}
i915_request_await_dma_fence(rq, fence); 1900 dma_fence_put(fence); 1901 } 1902 1903 rq = i915_request_get(rq); 1904 i915_request_add(rq); 1905 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1906 err = -ETIME; 1907 i915_request_put(rq); 1908 1909 while (!err && !intel_engine_is_idle(ce->engine)) 1910 intel_engine_flush_submission(ce->engine); 1911 1912 return err; 1913 } 1914 1915 struct perf_stats { 1916 struct intel_engine_cs *engine; 1917 unsigned long count; 1918 ktime_t time; 1919 ktime_t busy; 1920 u64 runtime; 1921 }; 1922 1923 struct perf_series { 1924 struct drm_i915_private *i915; 1925 unsigned int nengines; 1926 struct intel_context *ce[] __counted_by(nengines); 1927 }; 1928 1929 static int cmp_u32(const void *A, const void *B) 1930 { 1931 const u32 *a = A, *b = B; 1932 1933 return *a - *b; 1934 } 1935 1936 static u32 trifilter(u32 *a) 1937 { 1938 u64 sum; 1939 1940 #define TF_COUNT 5 1941 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1942 1943 sum = mul_u32_u32(a[2], 2); 1944 sum += a[1]; 1945 sum += a[3]; 1946 1947 GEM_BUG_ON(sum > U32_MAX); 1948 return sum; 1949 #define TF_BIAS 2 1950 } 1951 1952 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1953 { 1954 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles); 1955 1956 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1957 } 1958 1959 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1960 { 1961 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1962 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1963 *cs++ = offset; 1964 *cs++ = 0; 1965 1966 return cs; 1967 } 1968 1969 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1970 { 1971 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1972 *cs++ = offset; 1973 *cs++ = 0; 1974 *cs++ = value; 1975 1976 return cs; 1977 } 1978 1979 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1980 { 1981 *cs++ = MI_SEMAPHORE_WAIT | 1982 MI_SEMAPHORE_GLOBAL_GTT | 1983 
static int measure_semaphore_response(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	struct i915_request *rq;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how many cycles it takes for the HW to detect the change
	 * in a semaphore value.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    poke semaphore
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Semaphore latency: B - A
	 */

	semaphore_set(sema, -1);

	rq = i915_request_create(ce);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* 4 dwords for the initial store + 12 per sample iteration. */
	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		err = PTR_ERR(cs);
		goto err;
	}

	/* Per sample: poll until sema == i, timestamp slot i, reset sema. */
	cs = emit_store_dw(cs, offset, 0);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
		cs = emit_store_dw(cs, offset, 0);
	}

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	/* Wait for the batch to start executing (it zeroes the semaphore). */
	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		/* Sample A and poke the semaphore without being scheduled out. */
		preempt_disable();
		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		semaphore_set(sema, i);
		preempt_enable();

		/* The GPU resets the semaphore once it has seen the poke. */
		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
			err = -EIO;
			goto err;
		}

		/* B - A: GPU timestamp at detection minus CPU timestamp. */
		elapsed[i - 1] = sema[i] - cycles;
	}

	cycles = trifilter(elapsed);
	pr_info("%s: semaphore response %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2101 * 2102 * A: read CS_TIMESTAMP from CPU 2103 * submit request 2104 * B: read CS_TIMESTAMP on GPU 2105 * 2106 * Submission latency: B - A 2107 */ 2108 2109 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2110 struct i915_request *rq; 2111 2112 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2113 if (err) 2114 return err; 2115 2116 rq = i915_request_create(ce); 2117 if (IS_ERR(rq)) { 2118 err = PTR_ERR(rq); 2119 goto err; 2120 } 2121 2122 cs = intel_ring_begin(rq, 4); 2123 if (IS_ERR(cs)) { 2124 i915_request_add(rq); 2125 err = PTR_ERR(cs); 2126 goto err; 2127 } 2128 2129 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2130 2131 intel_ring_advance(rq, cs); 2132 2133 preempt_disable(); 2134 local_bh_disable(); 2135 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2136 i915_request_add(rq); 2137 local_bh_enable(); 2138 preempt_enable(); 2139 } 2140 2141 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2142 if (err) 2143 goto err; 2144 2145 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 2146 elapsed[i] = sema[i] - elapsed[i]; 2147 2148 cycles = trifilter(elapsed); 2149 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 2150 ce->engine->name, cycles >> TF_BIAS, 2151 cycles_to_ns(ce->engine, cycles)); 2152 2153 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2154 2155 err: 2156 intel_gt_set_wedged(ce->engine->gt); 2157 return err; 2158 } 2159 2160 static int measure_busy_dispatch(struct intel_context *ce) 2161 { 2162 u32 *sema = hwsp_scratch(ce); 2163 const u32 offset = hwsp_offset(ce, sema); 2164 u32 elapsed[TF_COUNT + 1], cycles; 2165 u32 *cs; 2166 int err; 2167 int i; 2168 2169 /* 2170 * Measure how long it takes for us to submit a request while the 2171 * engine is busy, polling on a semaphore in our context. With 2172 * direct submission, this will include the cost of a lite restore. 
2173 * 2174 * A: read CS_TIMESTAMP from CPU 2175 * submit request 2176 * B: read CS_TIMESTAMP on GPU 2177 * 2178 * Submission latency: B - A 2179 */ 2180 2181 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2182 struct i915_request *rq; 2183 2184 rq = i915_request_create(ce); 2185 if (IS_ERR(rq)) { 2186 err = PTR_ERR(rq); 2187 goto err; 2188 } 2189 2190 cs = intel_ring_begin(rq, 12); 2191 if (IS_ERR(cs)) { 2192 i915_request_add(rq); 2193 err = PTR_ERR(cs); 2194 goto err; 2195 } 2196 2197 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2198 cs = emit_semaphore_poll_until(cs, offset, i); 2199 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2200 2201 intel_ring_advance(rq, cs); 2202 2203 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 2204 err = -EIO; 2205 goto err; 2206 } 2207 2208 preempt_disable(); 2209 local_bh_disable(); 2210 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2211 i915_request_add(rq); 2212 local_bh_enable(); 2213 semaphore_set(sema, i - 1); 2214 preempt_enable(); 2215 } 2216 2217 wait_for(READ_ONCE(sema[i - 1]), 500); 2218 semaphore_set(sema, i - 1); 2219 2220 for (i = 1; i <= TF_COUNT; i++) { 2221 GEM_BUG_ON(sema[i] == -1); 2222 elapsed[i - 1] = sema[i] - elapsed[i]; 2223 } 2224 2225 cycles = trifilter(elapsed); 2226 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 2227 ce->engine->name, cycles >> TF_BIAS, 2228 cycles_to_ns(ce->engine, cycles)); 2229 2230 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2231 2232 err: 2233 intel_gt_set_wedged(ce->engine->gt); 2234 return err; 2235 } 2236 2237 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 2238 { 2239 const u32 offset = 2240 i915_ggtt_offset(engine->status_page.vma) + 2241 offset_in_page(sema); 2242 struct i915_request *rq; 2243 u32 *cs; 2244 2245 rq = i915_request_create(engine->kernel_context); 2246 if (IS_ERR(rq)) 2247 return PTR_ERR(rq); 2248 2249 cs = intel_ring_begin(rq, 4); 2250 if (IS_ERR(cs)) { 2251 
i915_request_add(rq); 2252 return PTR_ERR(cs); 2253 } 2254 2255 cs = emit_semaphore_poll(cs, mode, value, offset); 2256 2257 intel_ring_advance(rq, cs); 2258 i915_request_add(rq); 2259 2260 return 0; 2261 } 2262 2263 static int measure_inter_request(struct intel_context *ce) 2264 { 2265 u32 *sema = hwsp_scratch(ce); 2266 const u32 offset = hwsp_offset(ce, sema); 2267 u32 elapsed[TF_COUNT + 1], cycles; 2268 struct i915_sw_fence *submit; 2269 int i, err; 2270 2271 /* 2272 * Measure how long it takes to advance from one request into the 2273 * next. Between each request we flush the GPU caches to memory, 2274 * update the breadcrumbs, and then invalidate those caches. 2275 * We queue up all the requests to be submitted in one batch so 2276 * it should be one set of contiguous measurements. 2277 * 2278 * A: read CS_TIMESTAMP on GPU 2279 * advance request 2280 * B: read CS_TIMESTAMP on GPU 2281 * 2282 * Request latency: B - A 2283 */ 2284 2285 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 2286 if (err) 2287 return err; 2288 2289 submit = heap_fence_create(GFP_KERNEL); 2290 if (!submit) { 2291 semaphore_set(sema, 1); 2292 return -ENOMEM; 2293 } 2294 2295 intel_engine_flush_submission(ce->engine); 2296 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2297 struct i915_request *rq; 2298 u32 *cs; 2299 2300 rq = i915_request_create(ce); 2301 if (IS_ERR(rq)) { 2302 err = PTR_ERR(rq); 2303 goto err_submit; 2304 } 2305 2306 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 2307 submit, 2308 GFP_KERNEL); 2309 if (err < 0) { 2310 i915_request_add(rq); 2311 goto err_submit; 2312 } 2313 2314 cs = intel_ring_begin(rq, 4); 2315 if (IS_ERR(cs)) { 2316 i915_request_add(rq); 2317 err = PTR_ERR(cs); 2318 goto err_submit; 2319 } 2320 2321 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2322 2323 intel_ring_advance(rq, cs); 2324 i915_request_add(rq); 2325 } 2326 i915_sw_fence_commit(submit); 2327 intel_engine_flush_submission(ce->engine); 2328 heap_fence_put(submit); 
static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    switch context
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	/* Hold back execution until all the requests are queued. */
	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		/* Alternate between the two contexts, timestamping each hop. */
		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				/* Serialise with the previous request. */
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			/* Keep a reference to chain the next request behind. */
			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	/* Unplug and let the whole chain run back-to-back. */
	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/* Timestamp delta across each context-switch boundary. */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how it takes for us to return from the
	 * preemption back to the original context.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    submit preemption
	 * B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    context switch
	 * C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		/* Each round uses a pair of HWSP slots: [2i] = B, [2i+1] = C. */
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		/*
		 * Victim request on ce: mark slot B busy (-1), then spin on
		 * the semaphore until released by the preempting request,
		 * and finally stamp C.
		 */
		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		/* Wait until the victim is actually running on the GPU. */
		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		/*
		 * Preempting request on the kernel context: stamp B on entry,
		 * then release the victim's semaphore spin.
		 */
		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		/* Maximum priority forces immediate preemption of the victim. */
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		/* A: CPU-side timestamp sampled just before submission. */
		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	/* Wait for the last round's -1 marker to be overwritten with B. */
	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	/* Dispatch latency: B (GPU, preempting context) - A (CPU). */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	/* Switch latency: C (back in original context) - B. */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

/* Completion-notification marker shared between GPU fence and CPU waiter. */
struct signal_cb {
	struct dma_fence_cb base;
	bool seen;	/* set by the fence callback, polled by the CPU */
};

static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct signal_cb *s = container_of(cb, typeof(*s), base);

	/* smp_store_mb() publishes the flag to the busy-waiting CPU. */
	smp_store_mb(s->seen, true); /* be safe, be strong */
}

static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) to be
	 * sent from the GPU to be processed by the CPU.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    signal
	 * B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		/*
		 * Request: mark its HWSP slot busy (-1), spin on the
		 * semaphore until the CPU releases it, then stamp A into
		 * the same slot and complete (signalling the fence).
		 */
		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		intel_engine_flush_submission(ce->engine);
		/* Wait until the request is spinning on the GPU. */
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		/*
		 * Release the semaphore and busy-wait for the fence callback;
		 * preemption is disabled so B is sampled immediately after
		 * the interrupt is seen.
		 */
		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		/* B: CPU-side timestamp at completion. */
		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		/* The GPU must have replaced the -1 marker with timestamp A. */
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}

static void rps_pin(struct intel_gt *gt)
{
	/* Pin the frequency to max */
	atomic_inc(&gt->rps.num_waiters);
	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

	mutex_lock(&gt->rps.lock);
	intel_rps_set(&gt->rps,
		      gt->rps.max_freq);
	mutex_unlock(&gt->rps.lock);
}

/* Undo rps_pin(): release forcewake and drop the waiter boost. */
static void rps_unpin(struct intel_gt *gt)
{
	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
	atomic_dec(&gt->rps.num_waiters);
}

/*
 * Run the latency probes on every user-visible engine, with the GPU
 * frequency pinned to max, heartbeats disabled and CPU C-states off,
 * so the reported cycle counts are stable.
 */
static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

/*
 * Fully synchronous baseline: submit one request at a time, round-robin
 * across the series' contexts, waiting for each before issuing the next.
 */
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		/* Round-robin over the series' contexts. */
		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

/*
 * Like s_sync0, but keep one request in flight: wait on the previous
 * request while the next is already queued.
 */
static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	/* Drop the reference to the final in-flight request. */
	i915_request_put(prev);

	return err;
}

/* Unthrottled submission: queue requests round-robin without waiting. */
static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}

/*
 * Run each submission pattern (s_sync0, s_sync1, s_many) across all
 * engines in series, reporting per-engine busyness, runtime and walltime.
 */
static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int
	err = 0;

	stats = kzalloc_objs(*stats, nengines);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc_flex(*ps, ce, nengines);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		/* Snapshot the starting stats for every engine. */
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				/*
				 * +1 so a valid zero busy-time is
				 * distinguishable from "unsupported"
				 * (tested via p->busy below).
				 */
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			/* Negated start value; the end value is added later. */
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		/* Collect the deltas and report per-engine results. */
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime +=
			intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			/* Busy percentage with two decimal places: busy/dt * 100. */
			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	/* ps->ce[] is filled in order; stop at the first unused slot. */
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

/* Per-engine worker state for the parallel submission tests. */
struct p_thread {
	struct perf_stats p;		/* results filled in by the work fn */
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	int result;			/* 0 or -errno from the work fn */
};

/*
 * Worker body: fully synchronous submit/wait loop on one engine,
 * counting how many requests complete before the timeout.
 */
static void p_sync0(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if
		   (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	/* Convert the start snapshots into elapsed busy/wall time. */
	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

/*
 * Worker body: like p_sync0 but keeps one request in flight, waiting on
 * the previous request while the next is already queued.
 */
static void p_sync1(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	/* Drop the reference to the final in-flight request. */
	i915_request_put(prev);

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

/*
 * Worker body: unthrottled submission on one engine — queue requests as
 * fast as possible until the timeout, without waiting on any of them.
 */
static void p_many(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}

/*
 * Run each submission pattern concurrently on every engine, one kthread
 * worker per engine, then report per-engine throughput and busyness.
 */
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static void (* const func[])(struct kthread_work *) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	void (* const *fn)(struct kthread_work *);
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	struct p_thread *engines;
	int err = 0;

	engines = kzalloc_objs(*engines, nengines);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		/* Spawn one worker per engine, all running *fn in parallel. */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct kthread_worker *worker;

			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));

			worker = kthread_run_worker(0, "igt:%s",
						    engine->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				intel_engine_pm_put(engine);
				break;
			}
			engines[idx].worker = worker;
			engines[idx].result = 0;
			engines[idx].p.engine = engine;
			engines[idx].engine = engine;

			kthread_init_work(&engines[idx].work, *fn);
			kthread_queue_work(worker, &engines[idx].work);
			idx++;
		}

		/* Wait for every worker, keeping the first reported error. */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (!engines[idx].worker)
				break;

			kthread_flush_work(&engines[idx].work);
			status = READ_ONCE(engines[idx].result);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);

			kthread_destroy_worker(engines[idx].worker);
			idx++;
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		/* Report per-engine results for this submission pattern. */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

/*
 * Entry point for the i915_request perf selftests; skipped entirely if
 * the GT is already wedged.
 */
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_subtests(tests, i915);
}