1 /* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25 #include <linux/prime_numbers.h> 26 #include <linux/pm_qos.h> 27 #include <linux/sort.h> 28 29 #include "gem/i915_gem_pm.h" 30 #include "gem/selftests/mock_context.h" 31 32 #include "gt/intel_engine_heartbeat.h" 33 #include "gt/intel_engine_pm.h" 34 #include "gt/intel_engine_user.h" 35 #include "gt/intel_gt.h" 36 #include "gt/intel_gt_requests.h" 37 38 #include "i915_random.h" 39 #include "i915_selftest.h" 40 #include "igt_flush_test.h" 41 #include "igt_live_test.h" 42 #include "igt_spinner.h" 43 #include "lib_sw_fence.h" 44 45 #include "mock_drm.h" 46 #include "mock_gem_device.h" 47 48 static unsigned int num_uabi_engines(struct drm_i915_private *i915) 49 { 50 struct intel_engine_cs *engine; 51 unsigned int count; 52 53 count = 0; 54 for_each_uabi_engine(engine, i915) 55 count++; 56 57 return count; 58 } 59 60 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915) 61 { 62 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0); 63 } 64 65 static int igt_add_request(void *arg) 66 { 67 struct drm_i915_private *i915 = arg; 68 struct i915_request *request; 69 70 /* Basic preliminary test to create a request and let it loose! 
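	 *
	 * Note that this runs on the mock device: mock_request() only
	 * allocates a request on the mock engine, and the HZ / 10 argument
	 * is the artificial delay the mock backend waits before treating the
	 * request as complete, so no real hardware is ever touched.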
*/ 71 72 request = mock_request(rcs0(i915)->kernel_context, HZ / 10); 73 if (!request) 74 return -ENOMEM; 75 76 i915_request_add(request); 77 78 return 0; 79 } 80 81 static int igt_wait_request(void *arg) 82 { 83 const long T = HZ / 4; 84 struct drm_i915_private *i915 = arg; 85 struct i915_request *request; 86 int err = -EINVAL; 87 88 /* Submit a request, then wait upon it */ 89 90 request = mock_request(rcs0(i915)->kernel_context, T); 91 if (!request) 92 return -ENOMEM; 93 94 i915_request_get(request); 95 96 if (i915_request_wait(request, 0, 0) != -ETIME) { 97 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n"); 98 goto out_request; 99 } 100 101 if (i915_request_wait(request, 0, T) != -ETIME) { 102 pr_err("request wait succeeded (expected timeout before submit!)\n"); 103 goto out_request; 104 } 105 106 if (i915_request_completed(request)) { 107 pr_err("request completed before submit!!\n"); 108 goto out_request; 109 } 110 111 i915_request_add(request); 112 113 if (i915_request_wait(request, 0, 0) != -ETIME) { 114 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n"); 115 goto out_request; 116 } 117 118 if (i915_request_completed(request)) { 119 pr_err("request completed immediately!\n"); 120 goto out_request; 121 } 122 123 if (i915_request_wait(request, 0, T / 2) != -ETIME) { 124 pr_err("request wait succeeded (expected timeout!)\n"); 125 goto out_request; 126 } 127 128 if (i915_request_wait(request, 0, T) == -ETIME) { 129 pr_err("request wait timed out!\n"); 130 goto out_request; 131 } 132 133 if (!i915_request_completed(request)) { 134 pr_err("request not complete after waiting!\n"); 135 goto out_request; 136 } 137 138 if (i915_request_wait(request, 0, T) == -ETIME) { 139 pr_err("request wait timed out when already complete!\n"); 140 goto out_request; 141 } 142 143 err = 0; 144 out_request: 145 i915_request_put(request); 146 mock_device_flush(i915); 147 return err; 148 } 149 150 static int igt_fence_wait(void *arg) 151 { 152 const long T = HZ / 4; 153 struct drm_i915_private *i915 = arg; 154 struct i915_request *request; 155 int err = -EINVAL; 156 157 /* Submit a request, treat it as a fence and wait upon it */ 158 159 request = mock_request(rcs0(i915)->kernel_context, T); 160 if (!request) 161 return -ENOMEM; 162 163 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) { 164 pr_err("fence wait success before submit (expected timeout)!\n"); 165 goto out; 166 } 167 168 i915_request_add(request); 169 170 if (dma_fence_is_signaled(&request->fence)) { 171 pr_err("fence signaled immediately!\n"); 172 goto out; 173 } 174 175 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) { 176 pr_err("fence wait success after submit (expected timeout)!\n"); 177 goto out; 178 } 179 180 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 181 pr_err("fence wait timed out (expected success)!\n"); 182 goto out; 183 } 184 185 if (!dma_fence_is_signaled(&request->fence)) { 186 pr_err("fence unsignaled after waiting!\n"); 187 goto out; 188 } 189 190 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) { 191 pr_err("fence wait timed out when complete (expected success)!\n"); 192 goto out; 193 } 194 195 err = 0; 196 out: 197 mock_device_flush(i915); 198 return err; 199 } 200 201 static int igt_request_rewind(void *arg) 202 { 203 struct drm_i915_private *i915 = arg; 204 struct i915_request *request, *vip; 205 struct i915_gem_context *ctx[2]; 206 struct intel_context *ce; 207 int err = -EINVAL; 208 209 ctx[0] = 
mock_context(i915, "A"); 210 211 ce = i915_gem_context_get_engine(ctx[0], RCS0); 212 GEM_BUG_ON(IS_ERR(ce)); 213 request = mock_request(ce, 2 * HZ); 214 intel_context_put(ce); 215 if (!request) { 216 err = -ENOMEM; 217 goto err_context_0; 218 } 219 220 i915_request_get(request); 221 i915_request_add(request); 222 223 ctx[1] = mock_context(i915, "B"); 224 225 ce = i915_gem_context_get_engine(ctx[1], RCS0); 226 GEM_BUG_ON(IS_ERR(ce)); 227 vip = mock_request(ce, 0); 228 intel_context_put(ce); 229 if (!vip) { 230 err = -ENOMEM; 231 goto err_context_1; 232 } 233 234 /* Simulate preemption by manual reordering */ 235 if (!mock_cancel_request(request)) { 236 pr_err("failed to cancel request (already executed)!\n"); 237 i915_request_add(vip); 238 goto err_context_1; 239 } 240 i915_request_get(vip); 241 i915_request_add(vip); 242 rcu_read_lock(); 243 request->engine->submit_request(request); 244 rcu_read_unlock(); 245 246 247 if (i915_request_wait(vip, 0, HZ) == -ETIME) { 248 pr_err("timed out waiting for high priority request\n"); 249 goto err; 250 } 251 252 if (i915_request_completed(request)) { 253 pr_err("low priority request already completed\n"); 254 goto err; 255 } 256 257 err = 0; 258 err: 259 i915_request_put(vip); 260 err_context_1: 261 mock_context_close(ctx[1]); 262 i915_request_put(request); 263 err_context_0: 264 mock_context_close(ctx[0]); 265 mock_device_flush(i915); 266 return err; 267 } 268 269 struct smoketest { 270 struct intel_engine_cs *engine; 271 struct i915_gem_context **contexts; 272 atomic_long_t num_waits, num_fences; 273 int ncontexts, max_batch; 274 struct i915_request *(*request_alloc)(struct intel_context *ce); 275 }; 276 277 static struct i915_request * 278 __mock_request_alloc(struct intel_context *ce) 279 { 280 return mock_request(ce, 0); 281 } 282 283 static struct i915_request * 284 __live_request_alloc(struct intel_context *ce) 285 { 286 return intel_context_create_request(ce); 287 } 288 289 static int __igt_breadcrumbs_smoketest(void *arg) 290 { 291 struct smoketest *t = arg; 292 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1; 293 const unsigned int total = 4 * t->ncontexts + 1; 294 unsigned int num_waits = 0, num_fences = 0; 295 struct i915_request **requests; 296 I915_RND_STATE(prng); 297 unsigned int *order; 298 int err = 0; 299 300 /* 301 * A very simple test to catch the most egregious of list handling bugs. 302 * 303 * At its heart, we simply create oodles of requests running across 304 * multiple kthreads and enable signaling on them, for the sole purpose 305 * of stressing our breadcrumb handling. The only inspection we do is 306 * that the fences were marked as signaled. 
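	 *
	 * Each pass below gangs a random number of requests behind a single
	 * 'submit' sw_fence and collects all of their dma-fences into a
	 * second 'wait' fence, so committing 'submit' releases the whole
	 * batch at once and 'wait' only completes when every breadcrumb has
	 * been signalled.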
307 */ 308 309 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL); 310 if (!requests) 311 return -ENOMEM; 312 313 order = i915_random_order(total, &prng); 314 if (!order) { 315 err = -ENOMEM; 316 goto out_requests; 317 } 318 319 while (!kthread_should_stop()) { 320 struct i915_sw_fence *submit, *wait; 321 unsigned int n, count; 322 323 submit = heap_fence_create(GFP_KERNEL); 324 if (!submit) { 325 err = -ENOMEM; 326 break; 327 } 328 329 wait = heap_fence_create(GFP_KERNEL); 330 if (!wait) { 331 i915_sw_fence_commit(submit); 332 heap_fence_put(submit); 333 err = ENOMEM; 334 break; 335 } 336 337 i915_random_reorder(order, total, &prng); 338 count = 1 + i915_prandom_u32_max_state(max_batch, &prng); 339 340 for (n = 0; n < count; n++) { 341 struct i915_gem_context *ctx = 342 t->contexts[order[n] % t->ncontexts]; 343 struct i915_request *rq; 344 struct intel_context *ce; 345 346 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx); 347 GEM_BUG_ON(IS_ERR(ce)); 348 rq = t->request_alloc(ce); 349 intel_context_put(ce); 350 if (IS_ERR(rq)) { 351 err = PTR_ERR(rq); 352 count = n; 353 break; 354 } 355 356 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 357 submit, 358 GFP_KERNEL); 359 360 requests[n] = i915_request_get(rq); 361 i915_request_add(rq); 362 363 if (err >= 0) 364 err = i915_sw_fence_await_dma_fence(wait, 365 &rq->fence, 366 0, 367 GFP_KERNEL); 368 369 if (err < 0) { 370 i915_request_put(rq); 371 count = n; 372 break; 373 } 374 } 375 376 i915_sw_fence_commit(submit); 377 i915_sw_fence_commit(wait); 378 379 if (!wait_event_timeout(wait->wait, 380 i915_sw_fence_done(wait), 381 5 * HZ)) { 382 struct i915_request *rq = requests[count - 1]; 383 384 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n", 385 atomic_read(&wait->pending), count, 386 rq->fence.context, rq->fence.seqno, 387 t->engine->name); 388 GEM_TRACE_DUMP(); 389 390 intel_gt_set_wedged(t->engine->gt); 391 GEM_BUG_ON(!i915_request_completed(rq)); 392 i915_sw_fence_wait(wait); 393 err = -EIO; 394 } 395 396 for (n = 0; n < count; n++) { 397 struct i915_request *rq = requests[n]; 398 399 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 400 &rq->fence.flags)) { 401 pr_err("%llu:%llu was not signaled!\n", 402 rq->fence.context, rq->fence.seqno); 403 err = -EINVAL; 404 } 405 406 i915_request_put(rq); 407 } 408 409 heap_fence_put(wait); 410 heap_fence_put(submit); 411 412 if (err < 0) 413 break; 414 415 num_fences += count; 416 num_waits++; 417 418 cond_resched(); 419 } 420 421 atomic_long_add(num_fences, &t->num_fences); 422 atomic_long_add(num_waits, &t->num_waits); 423 424 kfree(order); 425 out_requests: 426 kfree(requests); 427 return err; 428 } 429 430 static int mock_breadcrumbs_smoketest(void *arg) 431 { 432 struct drm_i915_private *i915 = arg; 433 struct smoketest t = { 434 .engine = rcs0(i915), 435 .ncontexts = 1024, 436 .max_batch = 1024, 437 .request_alloc = __mock_request_alloc 438 }; 439 unsigned int ncpus = num_online_cpus(); 440 struct task_struct **threads; 441 unsigned int n; 442 int ret = 0; 443 444 /* 445 * Smoketest our breadcrumb/signal handling for requests across multiple 446 * threads. A very simple test to only catch the most egregious of bugs. 
447 * See __igt_breadcrumbs_smoketest(); 448 */ 449 450 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL); 451 if (!threads) 452 return -ENOMEM; 453 454 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL); 455 if (!t.contexts) { 456 ret = -ENOMEM; 457 goto out_threads; 458 } 459 460 for (n = 0; n < t.ncontexts; n++) { 461 t.contexts[n] = mock_context(t.engine->i915, "mock"); 462 if (!t.contexts[n]) { 463 ret = -ENOMEM; 464 goto out_contexts; 465 } 466 } 467 468 for (n = 0; n < ncpus; n++) { 469 threads[n] = kthread_run(__igt_breadcrumbs_smoketest, 470 &t, "igt/%d", n); 471 if (IS_ERR(threads[n])) { 472 ret = PTR_ERR(threads[n]); 473 ncpus = n; 474 break; 475 } 476 477 get_task_struct(threads[n]); 478 } 479 480 yield(); /* start all threads before we begin */ 481 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 482 483 for (n = 0; n < ncpus; n++) { 484 int err; 485 486 err = kthread_stop(threads[n]); 487 if (err < 0 && !ret) 488 ret = err; 489 490 put_task_struct(threads[n]); 491 } 492 pr_info("Completed %lu waits for %lu fence across %d cpus\n", 493 atomic_long_read(&t.num_waits), 494 atomic_long_read(&t.num_fences), 495 ncpus); 496 497 out_contexts: 498 for (n = 0; n < t.ncontexts; n++) { 499 if (!t.contexts[n]) 500 break; 501 mock_context_close(t.contexts[n]); 502 } 503 kfree(t.contexts); 504 out_threads: 505 kfree(threads); 506 return ret; 507 } 508 509 int i915_request_mock_selftests(void) 510 { 511 static const struct i915_subtest tests[] = { 512 SUBTEST(igt_add_request), 513 SUBTEST(igt_wait_request), 514 SUBTEST(igt_fence_wait), 515 SUBTEST(igt_request_rewind), 516 SUBTEST(mock_breadcrumbs_smoketest), 517 }; 518 struct drm_i915_private *i915; 519 intel_wakeref_t wakeref; 520 int err = 0; 521 522 i915 = mock_gem_device(); 523 if (!i915) 524 return -ENOMEM; 525 526 with_intel_runtime_pm(&i915->runtime_pm, wakeref) 527 err = i915_subtests(tests, i915); 528 529 drm_dev_put(&i915->drm); 530 531 return err; 532 } 533 534 static int live_nop_request(void *arg) 535 { 536 struct drm_i915_private *i915 = arg; 537 struct intel_engine_cs *engine; 538 struct igt_live_test t; 539 int err = -ENODEV; 540 541 /* 542 * Submit various sized batches of empty requests, to each engine 543 * (individually), and wait for the batch to complete. We can check 544 * the overhead of submitting requests to the hardware. 545 */ 546 547 for_each_uabi_engine(engine, i915) { 548 unsigned long n, prime; 549 IGT_TIMEOUT(end_time); 550 ktime_t times[2] = {}; 551 552 err = igt_live_test_begin(&t, i915, __func__, engine->name); 553 if (err) 554 return err; 555 556 intel_engine_pm_get(engine); 557 for_each_prime_number_from(prime, 1, 8192) { 558 struct i915_request *request = NULL; 559 560 times[1] = ktime_get_raw(); 561 562 for (n = 0; n < prime; n++) { 563 i915_request_put(request); 564 request = i915_request_create(engine->kernel_context); 565 if (IS_ERR(request)) 566 return PTR_ERR(request); 567 568 /* 569 * This space is left intentionally blank. 570 * 571 * We do not actually want to perform any 572 * action with this request, we just want 573 * to measure the latency in allocation 574 * and submission of our breadcrumbs - 575 * ensuring that the bare request is sufficient 576 * for the system to work (i.e. proper HEAD 577 * tracking of the rings, interrupt handling, 578 * etc). It also gives us the lowest bounds 579 * for latency. 
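				 *
				 * The outer loop batches these empty requests
				 * in prime-numbered groups so the amortised
				 * cost per request can be compared against
				 * the single-request figure reported first.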
580 */ 581 582 i915_request_get(request); 583 i915_request_add(request); 584 } 585 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 586 i915_request_put(request); 587 588 times[1] = ktime_sub(ktime_get_raw(), times[1]); 589 if (prime == 1) 590 times[0] = times[1]; 591 592 if (__igt_timeout(end_time, NULL)) 593 break; 594 } 595 intel_engine_pm_put(engine); 596 597 err = igt_live_test_end(&t); 598 if (err) 599 return err; 600 601 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n", 602 engine->name, 603 ktime_to_ns(times[0]), 604 prime, div64_u64(ktime_to_ns(times[1]), prime)); 605 } 606 607 return err; 608 } 609 610 static struct i915_vma *empty_batch(struct drm_i915_private *i915) 611 { 612 struct drm_i915_gem_object *obj; 613 struct i915_vma *vma; 614 u32 *cmd; 615 int err; 616 617 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 618 if (IS_ERR(obj)) 619 return ERR_CAST(obj); 620 621 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB); 622 if (IS_ERR(cmd)) { 623 err = PTR_ERR(cmd); 624 goto err; 625 } 626 627 *cmd = MI_BATCH_BUFFER_END; 628 629 __i915_gem_object_flush_map(obj, 0, 64); 630 i915_gem_object_unpin_map(obj); 631 632 intel_gt_chipset_flush(&i915->gt); 633 634 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); 635 if (IS_ERR(vma)) { 636 err = PTR_ERR(vma); 637 goto err; 638 } 639 640 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); 641 if (err) 642 goto err; 643 644 /* Force the wait wait now to avoid including it in the benchmark */ 645 err = i915_vma_sync(vma); 646 if (err) 647 goto err_pin; 648 649 return vma; 650 651 err_pin: 652 i915_vma_unpin(vma); 653 err: 654 i915_gem_object_put(obj); 655 return ERR_PTR(err); 656 } 657 658 static struct i915_request * 659 empty_request(struct intel_engine_cs *engine, 660 struct i915_vma *batch) 661 { 662 struct i915_request *request; 663 int err; 664 665 request = i915_request_create(engine->kernel_context); 666 if (IS_ERR(request)) 667 return request; 668 669 err = engine->emit_bb_start(request, 670 batch->node.start, 671 batch->node.size, 672 I915_DISPATCH_SECURE); 673 if (err) 674 goto out_request; 675 676 i915_request_get(request); 677 out_request: 678 i915_request_add(request); 679 return err ? ERR_PTR(err) : request; 680 } 681 682 static int live_empty_request(void *arg) 683 { 684 struct drm_i915_private *i915 = arg; 685 struct intel_engine_cs *engine; 686 struct igt_live_test t; 687 struct i915_vma *batch; 688 int err = 0; 689 690 /* 691 * Submit various sized batches of empty requests, to each engine 692 * (individually), and wait for the batch to complete. We can check 693 * the overhead of submitting requests to the hardware. 
694 */ 695 696 batch = empty_batch(i915); 697 if (IS_ERR(batch)) 698 return PTR_ERR(batch); 699 700 for_each_uabi_engine(engine, i915) { 701 IGT_TIMEOUT(end_time); 702 struct i915_request *request; 703 unsigned long n, prime; 704 ktime_t times[2] = {}; 705 706 err = igt_live_test_begin(&t, i915, __func__, engine->name); 707 if (err) 708 goto out_batch; 709 710 intel_engine_pm_get(engine); 711 712 /* Warmup / preload */ 713 request = empty_request(engine, batch); 714 if (IS_ERR(request)) { 715 err = PTR_ERR(request); 716 intel_engine_pm_put(engine); 717 goto out_batch; 718 } 719 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 720 721 for_each_prime_number_from(prime, 1, 8192) { 722 times[1] = ktime_get_raw(); 723 724 for (n = 0; n < prime; n++) { 725 i915_request_put(request); 726 request = empty_request(engine, batch); 727 if (IS_ERR(request)) { 728 err = PTR_ERR(request); 729 intel_engine_pm_put(engine); 730 goto out_batch; 731 } 732 } 733 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT); 734 735 times[1] = ktime_sub(ktime_get_raw(), times[1]); 736 if (prime == 1) 737 times[0] = times[1]; 738 739 if (__igt_timeout(end_time, NULL)) 740 break; 741 } 742 i915_request_put(request); 743 intel_engine_pm_put(engine); 744 745 err = igt_live_test_end(&t); 746 if (err) 747 goto out_batch; 748 749 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n", 750 engine->name, 751 ktime_to_ns(times[0]), 752 prime, div64_u64(ktime_to_ns(times[1]), prime)); 753 } 754 755 out_batch: 756 i915_vma_unpin(batch); 757 i915_vma_put(batch); 758 return err; 759 } 760 761 static struct i915_vma *recursive_batch(struct drm_i915_private *i915) 762 { 763 struct drm_i915_gem_object *obj; 764 const int gen = INTEL_GEN(i915); 765 struct i915_vma *vma; 766 u32 *cmd; 767 int err; 768 769 obj = i915_gem_object_create_internal(i915, PAGE_SIZE); 770 if (IS_ERR(obj)) 771 return ERR_CAST(obj); 772 773 vma = i915_vma_instance(obj, i915->gt.vm, NULL); 774 if (IS_ERR(vma)) { 775 err = PTR_ERR(vma); 776 goto err; 777 } 778 779 err = i915_vma_pin(vma, 0, 0, PIN_USER); 780 if (err) 781 goto err; 782 783 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC); 784 if (IS_ERR(cmd)) { 785 err = PTR_ERR(cmd); 786 goto err; 787 } 788 789 if (gen >= 8) { 790 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; 791 *cmd++ = lower_32_bits(vma->node.start); 792 *cmd++ = upper_32_bits(vma->node.start); 793 } else if (gen >= 6) { 794 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8; 795 *cmd++ = lower_32_bits(vma->node.start); 796 } else { 797 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; 798 *cmd++ = lower_32_bits(vma->node.start); 799 } 800 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */ 801 802 __i915_gem_object_flush_map(obj, 0, 64); 803 i915_gem_object_unpin_map(obj); 804 805 intel_gt_chipset_flush(&i915->gt); 806 807 return vma; 808 809 err: 810 i915_gem_object_put(obj); 811 return ERR_PTR(err); 812 } 813 814 static int recursive_batch_resolve(struct i915_vma *batch) 815 { 816 u32 *cmd; 817 818 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); 819 if (IS_ERR(cmd)) 820 return PTR_ERR(cmd); 821 822 *cmd = MI_BATCH_BUFFER_END; 823 824 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd)); 825 i915_gem_object_unpin_map(batch->obj); 826 827 intel_gt_chipset_flush(batch->vm->gt); 828 829 return 0; 830 } 831 832 static int live_all_engines(void *arg) 833 { 834 struct drm_i915_private *i915 = arg; 835 const unsigned int nengines = num_uabi_engines(i915); 836 struct intel_engine_cs *engine; 837 struct i915_request **request; 838 
struct igt_live_test t; 839 struct i915_vma *batch; 840 unsigned int idx; 841 int err; 842 843 /* 844 * Check we can submit requests to all engines simultaneously. We 845 * send a recursive batch to each engine - checking that we don't 846 * block doing so, and that they don't complete too soon. 847 */ 848 849 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 850 if (!request) 851 return -ENOMEM; 852 853 err = igt_live_test_begin(&t, i915, __func__, ""); 854 if (err) 855 goto out_free; 856 857 batch = recursive_batch(i915); 858 if (IS_ERR(batch)) { 859 err = PTR_ERR(batch); 860 pr_err("%s: Unable to create batch, err=%d\n", __func__, err); 861 goto out_free; 862 } 863 864 idx = 0; 865 for_each_uabi_engine(engine, i915) { 866 request[idx] = intel_engine_create_kernel_request(engine); 867 if (IS_ERR(request[idx])) { 868 err = PTR_ERR(request[idx]); 869 pr_err("%s: Request allocation failed with err=%d\n", 870 __func__, err); 871 goto out_request; 872 } 873 874 i915_vma_lock(batch); 875 err = i915_request_await_object(request[idx], batch->obj, 0); 876 if (err == 0) 877 err = i915_vma_move_to_active(batch, request[idx], 0); 878 i915_vma_unlock(batch); 879 GEM_BUG_ON(err); 880 881 err = engine->emit_bb_start(request[idx], 882 batch->node.start, 883 batch->node.size, 884 0); 885 GEM_BUG_ON(err); 886 request[idx]->batch = batch; 887 888 i915_request_get(request[idx]); 889 i915_request_add(request[idx]); 890 idx++; 891 } 892 893 idx = 0; 894 for_each_uabi_engine(engine, i915) { 895 if (i915_request_completed(request[idx])) { 896 pr_err("%s(%s): request completed too early!\n", 897 __func__, engine->name); 898 err = -EINVAL; 899 goto out_request; 900 } 901 idx++; 902 } 903 904 err = recursive_batch_resolve(batch); 905 if (err) { 906 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err); 907 goto out_request; 908 } 909 910 idx = 0; 911 for_each_uabi_engine(engine, i915) { 912 long timeout; 913 914 timeout = i915_request_wait(request[idx], 0, 915 MAX_SCHEDULE_TIMEOUT); 916 if (timeout < 0) { 917 err = timeout; 918 pr_err("%s: error waiting for request on %s, err=%d\n", 919 __func__, engine->name, err); 920 goto out_request; 921 } 922 923 GEM_BUG_ON(!i915_request_completed(request[idx])); 924 i915_request_put(request[idx]); 925 request[idx] = NULL; 926 idx++; 927 } 928 929 err = igt_live_test_end(&t); 930 931 out_request: 932 idx = 0; 933 for_each_uabi_engine(engine, i915) { 934 if (request[idx]) 935 i915_request_put(request[idx]); 936 idx++; 937 } 938 i915_vma_unpin(batch); 939 i915_vma_put(batch); 940 out_free: 941 kfree(request); 942 return err; 943 } 944 945 static int live_sequential_engines(void *arg) 946 { 947 struct drm_i915_private *i915 = arg; 948 const unsigned int nengines = num_uabi_engines(i915); 949 struct i915_request **request; 950 struct i915_request *prev = NULL; 951 struct intel_engine_cs *engine; 952 struct igt_live_test t; 953 unsigned int idx; 954 int err; 955 956 /* 957 * Check we can submit requests to all engines sequentially, such 958 * that each successive request waits for the earlier ones. This 959 * tests that we don't execute requests out of order, even though 960 * they are running on independent engines. 
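	 *
	 * Each request carries a self-referencing batch that spins until it
	 * is rewritten to MI_BATCH_BUFFER_END by recursive_batch_resolve(),
	 * and the await on the previous request's dma-fence provides the
	 * cross-engine ordering under test.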
961 */ 962 963 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL); 964 if (!request) 965 return -ENOMEM; 966 967 err = igt_live_test_begin(&t, i915, __func__, ""); 968 if (err) 969 goto out_free; 970 971 idx = 0; 972 for_each_uabi_engine(engine, i915) { 973 struct i915_vma *batch; 974 975 batch = recursive_batch(i915); 976 if (IS_ERR(batch)) { 977 err = PTR_ERR(batch); 978 pr_err("%s: Unable to create batch for %s, err=%d\n", 979 __func__, engine->name, err); 980 goto out_free; 981 } 982 983 request[idx] = intel_engine_create_kernel_request(engine); 984 if (IS_ERR(request[idx])) { 985 err = PTR_ERR(request[idx]); 986 pr_err("%s: Request allocation failed for %s with err=%d\n", 987 __func__, engine->name, err); 988 goto out_request; 989 } 990 991 if (prev) { 992 err = i915_request_await_dma_fence(request[idx], 993 &prev->fence); 994 if (err) { 995 i915_request_add(request[idx]); 996 pr_err("%s: Request await failed for %s with err=%d\n", 997 __func__, engine->name, err); 998 goto out_request; 999 } 1000 } 1001 1002 i915_vma_lock(batch); 1003 err = i915_request_await_object(request[idx], 1004 batch->obj, false); 1005 if (err == 0) 1006 err = i915_vma_move_to_active(batch, request[idx], 0); 1007 i915_vma_unlock(batch); 1008 GEM_BUG_ON(err); 1009 1010 err = engine->emit_bb_start(request[idx], 1011 batch->node.start, 1012 batch->node.size, 1013 0); 1014 GEM_BUG_ON(err); 1015 request[idx]->batch = batch; 1016 1017 i915_request_get(request[idx]); 1018 i915_request_add(request[idx]); 1019 1020 prev = request[idx]; 1021 idx++; 1022 } 1023 1024 idx = 0; 1025 for_each_uabi_engine(engine, i915) { 1026 long timeout; 1027 1028 if (i915_request_completed(request[idx])) { 1029 pr_err("%s(%s): request completed too early!\n", 1030 __func__, engine->name); 1031 err = -EINVAL; 1032 goto out_request; 1033 } 1034 1035 err = recursive_batch_resolve(request[idx]->batch); 1036 if (err) { 1037 pr_err("%s: failed to resolve batch, err=%d\n", 1038 __func__, err); 1039 goto out_request; 1040 } 1041 1042 timeout = i915_request_wait(request[idx], 0, 1043 MAX_SCHEDULE_TIMEOUT); 1044 if (timeout < 0) { 1045 err = timeout; 1046 pr_err("%s: error waiting for request on %s, err=%d\n", 1047 __func__, engine->name, err); 1048 goto out_request; 1049 } 1050 1051 GEM_BUG_ON(!i915_request_completed(request[idx])); 1052 idx++; 1053 } 1054 1055 err = igt_live_test_end(&t); 1056 1057 out_request: 1058 idx = 0; 1059 for_each_uabi_engine(engine, i915) { 1060 u32 *cmd; 1061 1062 if (!request[idx]) 1063 break; 1064 1065 cmd = i915_gem_object_pin_map(request[idx]->batch->obj, 1066 I915_MAP_WC); 1067 if (!IS_ERR(cmd)) { 1068 *cmd = MI_BATCH_BUFFER_END; 1069 1070 __i915_gem_object_flush_map(request[idx]->batch->obj, 1071 0, sizeof(*cmd)); 1072 i915_gem_object_unpin_map(request[idx]->batch->obj); 1073 1074 intel_gt_chipset_flush(engine->gt); 1075 } 1076 1077 i915_vma_put(request[idx]->batch); 1078 i915_request_put(request[idx]); 1079 idx++; 1080 } 1081 out_free: 1082 kfree(request); 1083 return err; 1084 } 1085 1086 static int __live_parallel_engine1(void *arg) 1087 { 1088 struct intel_engine_cs *engine = arg; 1089 IGT_TIMEOUT(end_time); 1090 unsigned long count; 1091 int err = 0; 1092 1093 count = 0; 1094 intel_engine_pm_get(engine); 1095 do { 1096 struct i915_request *rq; 1097 1098 rq = i915_request_create(engine->kernel_context); 1099 if (IS_ERR(rq)) { 1100 err = PTR_ERR(rq); 1101 break; 1102 } 1103 1104 i915_request_get(rq); 1105 i915_request_add(rq); 1106 1107 err = 0; 1108 if (i915_request_wait(rq, 0, HZ / 5) < 0) 1109 err = 
-ETIME; 1110 i915_request_put(rq); 1111 if (err) 1112 break; 1113 1114 count++; 1115 } while (!__igt_timeout(end_time, NULL)); 1116 intel_engine_pm_put(engine); 1117 1118 pr_info("%s: %lu request + sync\n", engine->name, count); 1119 return err; 1120 } 1121 1122 static int __live_parallel_engineN(void *arg) 1123 { 1124 struct intel_engine_cs *engine = arg; 1125 IGT_TIMEOUT(end_time); 1126 unsigned long count; 1127 int err = 0; 1128 1129 count = 0; 1130 intel_engine_pm_get(engine); 1131 do { 1132 struct i915_request *rq; 1133 1134 rq = i915_request_create(engine->kernel_context); 1135 if (IS_ERR(rq)) { 1136 err = PTR_ERR(rq); 1137 break; 1138 } 1139 1140 i915_request_add(rq); 1141 count++; 1142 } while (!__igt_timeout(end_time, NULL)); 1143 intel_engine_pm_put(engine); 1144 1145 pr_info("%s: %lu requests\n", engine->name, count); 1146 return err; 1147 } 1148 1149 static bool wake_all(struct drm_i915_private *i915) 1150 { 1151 if (atomic_dec_and_test(&i915->selftest.counter)) { 1152 wake_up_var(&i915->selftest.counter); 1153 return true; 1154 } 1155 1156 return false; 1157 } 1158 1159 static int wait_for_all(struct drm_i915_private *i915) 1160 { 1161 if (wake_all(i915)) 1162 return 0; 1163 1164 if (wait_var_event_timeout(&i915->selftest.counter, 1165 !atomic_read(&i915->selftest.counter), 1166 i915_selftest.timeout_jiffies)) 1167 return 0; 1168 1169 return -ETIME; 1170 } 1171 1172 static int __live_parallel_spin(void *arg) 1173 { 1174 struct intel_engine_cs *engine = arg; 1175 struct igt_spinner spin; 1176 struct i915_request *rq; 1177 int err = 0; 1178 1179 /* 1180 * Create a spinner running for eternity on each engine. If a second 1181 * spinner is incorrectly placed on the same engine, it will not be 1182 * able to start in time. 1183 */ 1184 1185 if (igt_spinner_init(&spin, engine->gt)) { 1186 wake_all(engine->i915); 1187 return -ENOMEM; 1188 } 1189 1190 intel_engine_pm_get(engine); 1191 rq = igt_spinner_create_request(&spin, 1192 engine->kernel_context, 1193 MI_NOOP); /* no preemption */ 1194 intel_engine_pm_put(engine); 1195 if (IS_ERR(rq)) { 1196 err = PTR_ERR(rq); 1197 if (err == -ENODEV) 1198 err = 0; 1199 wake_all(engine->i915); 1200 goto out_spin; 1201 } 1202 1203 i915_request_get(rq); 1204 i915_request_add(rq); 1205 if (igt_wait_for_spinner(&spin, rq)) { 1206 /* Occupy this engine for the whole test */ 1207 err = wait_for_all(engine->i915); 1208 } else { 1209 pr_err("Failed to start spinner on %s\n", engine->name); 1210 err = -EINVAL; 1211 } 1212 igt_spinner_end(&spin); 1213 1214 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0) 1215 err = -EIO; 1216 i915_request_put(rq); 1217 1218 out_spin: 1219 igt_spinner_fini(&spin); 1220 return err; 1221 } 1222 1223 static int live_parallel_engines(void *arg) 1224 { 1225 struct drm_i915_private *i915 = arg; 1226 static int (* const func[])(void *arg) = { 1227 __live_parallel_engine1, 1228 __live_parallel_engineN, 1229 __live_parallel_spin, 1230 NULL, 1231 }; 1232 const unsigned int nengines = num_uabi_engines(i915); 1233 struct intel_engine_cs *engine; 1234 int (* const *fn)(void *arg); 1235 struct task_struct **tsk; 1236 int err = 0; 1237 1238 /* 1239 * Check we can submit requests to all engines concurrently. This 1240 * tests that we load up the system maximally. 
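	 *
	 * Three workers are run in turn, each spawned once per engine: a
	 * synchronous submit-and-wait loop, a fire-and-forget submission
	 * loop, and a spinner that must be able to occupy its engine while
	 * every other engine is similarly occupied.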
1241 */ 1242 1243 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL); 1244 if (!tsk) 1245 return -ENOMEM; 1246 1247 for (fn = func; !err && *fn; fn++) { 1248 char name[KSYM_NAME_LEN]; 1249 struct igt_live_test t; 1250 unsigned int idx; 1251 1252 snprintf(name, sizeof(name), "%ps", *fn); 1253 err = igt_live_test_begin(&t, i915, __func__, name); 1254 if (err) 1255 break; 1256 1257 atomic_set(&i915->selftest.counter, nengines); 1258 1259 idx = 0; 1260 for_each_uabi_engine(engine, i915) { 1261 tsk[idx] = kthread_run(*fn, engine, 1262 "igt/parallel:%s", 1263 engine->name); 1264 if (IS_ERR(tsk[idx])) { 1265 err = PTR_ERR(tsk[idx]); 1266 break; 1267 } 1268 get_task_struct(tsk[idx++]); 1269 } 1270 1271 yield(); /* start all threads before we kthread_stop() */ 1272 1273 idx = 0; 1274 for_each_uabi_engine(engine, i915) { 1275 int status; 1276 1277 if (IS_ERR(tsk[idx])) 1278 break; 1279 1280 status = kthread_stop(tsk[idx]); 1281 if (status && !err) 1282 err = status; 1283 1284 put_task_struct(tsk[idx++]); 1285 } 1286 1287 if (igt_live_test_end(&t)) 1288 err = -EIO; 1289 } 1290 1291 kfree(tsk); 1292 return err; 1293 } 1294 1295 static int 1296 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine) 1297 { 1298 struct i915_request *rq; 1299 int ret; 1300 1301 /* 1302 * Before execlists, all contexts share the same ringbuffer. With 1303 * execlists, each context/engine has a separate ringbuffer and 1304 * for the purposes of this test, inexhaustible. 1305 * 1306 * For the global ringbuffer though, we have to be very careful 1307 * that we do not wrap while preventing the execution of requests 1308 * with a unsignaled fence. 1309 */ 1310 if (HAS_EXECLISTS(ctx->i915)) 1311 return INT_MAX; 1312 1313 rq = igt_request_alloc(ctx, engine); 1314 if (IS_ERR(rq)) { 1315 ret = PTR_ERR(rq); 1316 } else { 1317 int sz; 1318 1319 ret = rq->ring->size - rq->reserved_space; 1320 i915_request_add(rq); 1321 1322 sz = rq->ring->emit - rq->head; 1323 if (sz < 0) 1324 sz += rq->ring->size; 1325 ret /= sz; 1326 ret /= 2; /* leave half spare, in case of emergency! */ 1327 } 1328 1329 return ret; 1330 } 1331 1332 static int live_breadcrumbs_smoketest(void *arg) 1333 { 1334 struct drm_i915_private *i915 = arg; 1335 const unsigned int nengines = num_uabi_engines(i915); 1336 const unsigned int ncpus = num_online_cpus(); 1337 unsigned long num_waits, num_fences; 1338 struct intel_engine_cs *engine; 1339 struct task_struct **threads; 1340 struct igt_live_test live; 1341 intel_wakeref_t wakeref; 1342 struct smoketest *smoke; 1343 unsigned int n, idx; 1344 struct file *file; 1345 int ret = 0; 1346 1347 /* 1348 * Smoketest our breadcrumb/signal handling for requests across multiple 1349 * threads. A very simple test to only catch the most egregious of bugs. 1350 * See __igt_breadcrumbs_smoketest(); 1351 * 1352 * On real hardware this time. 
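	 *
	 * One smoketest instance is set up per engine with ncpus worker
	 * threads each, and the number of in-flight batches is capped by
	 * max_batches() so that a legacy global ring cannot be wrapped by
	 * requests held back behind an unsignaled fence.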
1353 */ 1354 1355 wakeref = intel_runtime_pm_get(&i915->runtime_pm); 1356 1357 file = mock_file(i915); 1358 if (IS_ERR(file)) { 1359 ret = PTR_ERR(file); 1360 goto out_rpm; 1361 } 1362 1363 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL); 1364 if (!smoke) { 1365 ret = -ENOMEM; 1366 goto out_file; 1367 } 1368 1369 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL); 1370 if (!threads) { 1371 ret = -ENOMEM; 1372 goto out_smoke; 1373 } 1374 1375 smoke[0].request_alloc = __live_request_alloc; 1376 smoke[0].ncontexts = 64; 1377 smoke[0].contexts = kcalloc(smoke[0].ncontexts, 1378 sizeof(*smoke[0].contexts), 1379 GFP_KERNEL); 1380 if (!smoke[0].contexts) { 1381 ret = -ENOMEM; 1382 goto out_threads; 1383 } 1384 1385 for (n = 0; n < smoke[0].ncontexts; n++) { 1386 smoke[0].contexts[n] = live_context(i915, file); 1387 if (!smoke[0].contexts[n]) { 1388 ret = -ENOMEM; 1389 goto out_contexts; 1390 } 1391 } 1392 1393 ret = igt_live_test_begin(&live, i915, __func__, ""); 1394 if (ret) 1395 goto out_contexts; 1396 1397 idx = 0; 1398 for_each_uabi_engine(engine, i915) { 1399 smoke[idx] = smoke[0]; 1400 smoke[idx].engine = engine; 1401 smoke[idx].max_batch = 1402 max_batches(smoke[0].contexts[0], engine); 1403 if (smoke[idx].max_batch < 0) { 1404 ret = smoke[idx].max_batch; 1405 goto out_flush; 1406 } 1407 /* One ring interleaved between requests from all cpus */ 1408 smoke[idx].max_batch /= num_online_cpus() + 1; 1409 pr_debug("Limiting batches to %d requests on %s\n", 1410 smoke[idx].max_batch, engine->name); 1411 1412 for (n = 0; n < ncpus; n++) { 1413 struct task_struct *tsk; 1414 1415 tsk = kthread_run(__igt_breadcrumbs_smoketest, 1416 &smoke[idx], "igt/%d.%d", idx, n); 1417 if (IS_ERR(tsk)) { 1418 ret = PTR_ERR(tsk); 1419 goto out_flush; 1420 } 1421 1422 get_task_struct(tsk); 1423 threads[idx * ncpus + n] = tsk; 1424 } 1425 1426 idx++; 1427 } 1428 1429 yield(); /* start all threads before we begin */ 1430 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies)); 1431 1432 out_flush: 1433 idx = 0; 1434 num_waits = 0; 1435 num_fences = 0; 1436 for_each_uabi_engine(engine, i915) { 1437 for (n = 0; n < ncpus; n++) { 1438 struct task_struct *tsk = threads[idx * ncpus + n]; 1439 int err; 1440 1441 if (!tsk) 1442 continue; 1443 1444 err = kthread_stop(tsk); 1445 if (err < 0 && !ret) 1446 ret = err; 1447 1448 put_task_struct(tsk); 1449 } 1450 1451 num_waits += atomic_long_read(&smoke[idx].num_waits); 1452 num_fences += atomic_long_read(&smoke[idx].num_fences); 1453 idx++; 1454 } 1455 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n", 1456 num_waits, num_fences, RUNTIME_INFO(i915)->num_engines, ncpus); 1457 1458 ret = igt_live_test_end(&live) ?: ret; 1459 out_contexts: 1460 kfree(smoke[0].contexts); 1461 out_threads: 1462 kfree(threads); 1463 out_smoke: 1464 kfree(smoke); 1465 out_file: 1466 fput(file); 1467 out_rpm: 1468 intel_runtime_pm_put(&i915->runtime_pm, wakeref); 1469 1470 return ret; 1471 } 1472 1473 int i915_request_live_selftests(struct drm_i915_private *i915) 1474 { 1475 static const struct i915_subtest tests[] = { 1476 SUBTEST(live_nop_request), 1477 SUBTEST(live_all_engines), 1478 SUBTEST(live_sequential_engines), 1479 SUBTEST(live_parallel_engines), 1480 SUBTEST(live_empty_request), 1481 SUBTEST(live_breadcrumbs_smoketest), 1482 }; 1483 1484 if (intel_gt_is_wedged(&i915->gt)) 1485 return 0; 1486 1487 return i915_subtests(tests, i915); 1488 } 1489 1490 static int switch_to_kernel_sync(struct intel_context *ce, int err) 1491 { 1492 struct i915_request 
*rq; 1493 struct dma_fence *fence; 1494 1495 rq = intel_engine_create_kernel_request(ce->engine); 1496 if (IS_ERR(rq)) 1497 return PTR_ERR(rq); 1498 1499 fence = i915_active_fence_get(&ce->timeline->last_request); 1500 if (fence) { 1501 i915_request_await_dma_fence(rq, fence); 1502 dma_fence_put(fence); 1503 } 1504 1505 rq = i915_request_get(rq); 1506 i915_request_add(rq); 1507 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err) 1508 err = -ETIME; 1509 i915_request_put(rq); 1510 1511 while (!err && !intel_engine_is_idle(ce->engine)) 1512 intel_engine_flush_submission(ce->engine); 1513 1514 return err; 1515 } 1516 1517 struct perf_stats { 1518 struct intel_engine_cs *engine; 1519 unsigned long count; 1520 ktime_t time; 1521 ktime_t busy; 1522 u64 runtime; 1523 }; 1524 1525 struct perf_series { 1526 struct drm_i915_private *i915; 1527 unsigned int nengines; 1528 struct intel_context *ce[]; 1529 }; 1530 1531 static int cmp_u32(const void *A, const void *B) 1532 { 1533 const u32 *a = A, *b = B; 1534 1535 return *a - *b; 1536 } 1537 1538 static u32 trifilter(u32 *a) 1539 { 1540 u64 sum; 1541 1542 #define TF_COUNT 5 1543 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL); 1544 1545 sum = mul_u32_u32(a[2], 2); 1546 sum += a[1]; 1547 sum += a[3]; 1548 1549 GEM_BUG_ON(sum > U32_MAX); 1550 return sum; 1551 #define TF_BIAS 2 1552 } 1553 1554 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles) 1555 { 1556 u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles); 1557 1558 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS); 1559 } 1560 1561 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset) 1562 { 1563 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; 1564 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base))); 1565 *cs++ = offset; 1566 *cs++ = 0; 1567 1568 return cs; 1569 } 1570 1571 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value) 1572 { 1573 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1574 *cs++ = offset; 1575 *cs++ = 0; 1576 *cs++ = value; 1577 1578 return cs; 1579 } 1580 1581 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset) 1582 { 1583 *cs++ = MI_SEMAPHORE_WAIT | 1584 MI_SEMAPHORE_GLOBAL_GTT | 1585 MI_SEMAPHORE_POLL | 1586 mode; 1587 *cs++ = value; 1588 *cs++ = offset; 1589 *cs++ = 0; 1590 1591 return cs; 1592 } 1593 1594 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value) 1595 { 1596 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset); 1597 } 1598 1599 static void semaphore_set(u32 *sema, u32 value) 1600 { 1601 WRITE_ONCE(*sema, value); 1602 wmb(); /* flush the update to the cache, and beyond */ 1603 } 1604 1605 static u32 *hwsp_scratch(const struct intel_context *ce) 1606 { 1607 return memset32(ce->engine->status_page.addr + 1000, 0, 21); 1608 } 1609 1610 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw) 1611 { 1612 return (i915_ggtt_offset(ce->engine->status_page.vma) + 1613 offset_in_page(dw)); 1614 } 1615 1616 static int measure_semaphore_response(struct intel_context *ce) 1617 { 1618 u32 *sema = hwsp_scratch(ce); 1619 const u32 offset = hwsp_offset(ce, sema); 1620 u32 elapsed[TF_COUNT], cycles; 1621 struct i915_request *rq; 1622 u32 *cs; 1623 int err; 1624 int i; 1625 1626 /* 1627 * Measure how many cycles it takes for the HW to detect the change 1628 * in a semaphore value. 
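	 *
	 * (The scratch words for these measurements live in the engine's
	 * HWSP, and every result below is reported after trifilter(): the
	 * five samples are sorted and the middle three combined with the
	 * median weighted twice, i.e. roughly
	 *
	 *	result = (a[1] + 2 * a[2] + a[3]) >> TF_BIAS;
	 *
	 * before being converted from CS timestamp ticks to nanoseconds.)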
1629 * 1630 * A: read CS_TIMESTAMP from CPU 1631 * poke semaphore 1632 * B: read CS_TIMESTAMP on GPU 1633 * 1634 * Semaphore latency: B - A 1635 */ 1636 1637 semaphore_set(sema, -1); 1638 1639 rq = i915_request_create(ce); 1640 if (IS_ERR(rq)) 1641 return PTR_ERR(rq); 1642 1643 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed)); 1644 if (IS_ERR(cs)) { 1645 i915_request_add(rq); 1646 err = PTR_ERR(cs); 1647 goto err; 1648 } 1649 1650 cs = emit_store_dw(cs, offset, 0); 1651 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1652 cs = emit_semaphore_poll_until(cs, offset, i); 1653 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1654 cs = emit_store_dw(cs, offset, 0); 1655 } 1656 1657 intel_ring_advance(rq, cs); 1658 i915_request_add(rq); 1659 1660 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1661 err = -EIO; 1662 goto err; 1663 } 1664 1665 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1666 preempt_disable(); 1667 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1668 semaphore_set(sema, i); 1669 preempt_enable(); 1670 1671 if (wait_for(READ_ONCE(*sema) == 0, 50)) { 1672 err = -EIO; 1673 goto err; 1674 } 1675 1676 elapsed[i - 1] = sema[i] - cycles; 1677 } 1678 1679 cycles = trifilter(elapsed); 1680 pr_info("%s: semaphore response %d cycles, %lluns\n", 1681 ce->engine->name, cycles >> TF_BIAS, 1682 cycles_to_ns(ce->engine, cycles)); 1683 1684 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1685 1686 err: 1687 intel_gt_set_wedged(ce->engine->gt); 1688 return err; 1689 } 1690 1691 static int measure_idle_dispatch(struct intel_context *ce) 1692 { 1693 u32 *sema = hwsp_scratch(ce); 1694 const u32 offset = hwsp_offset(ce, sema); 1695 u32 elapsed[TF_COUNT], cycles; 1696 u32 *cs; 1697 int err; 1698 int i; 1699 1700 /* 1701 * Measure how long it takes for us to submit a request while the 1702 * engine is idle, but is resting in our context. 
1703 * 1704 * A: read CS_TIMESTAMP from CPU 1705 * submit request 1706 * B: read CS_TIMESTAMP on GPU 1707 * 1708 * Submission latency: B - A 1709 */ 1710 1711 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 1712 struct i915_request *rq; 1713 1714 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1715 if (err) 1716 return err; 1717 1718 rq = i915_request_create(ce); 1719 if (IS_ERR(rq)) { 1720 err = PTR_ERR(rq); 1721 goto err; 1722 } 1723 1724 cs = intel_ring_begin(rq, 4); 1725 if (IS_ERR(cs)) { 1726 i915_request_add(rq); 1727 err = PTR_ERR(cs); 1728 goto err; 1729 } 1730 1731 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1732 1733 intel_ring_advance(rq, cs); 1734 1735 preempt_disable(); 1736 local_bh_disable(); 1737 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1738 i915_request_add(rq); 1739 local_bh_enable(); 1740 preempt_enable(); 1741 } 1742 1743 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1744 if (err) 1745 goto err; 1746 1747 for (i = 0; i < ARRAY_SIZE(elapsed); i++) 1748 elapsed[i] = sema[i] - elapsed[i]; 1749 1750 cycles = trifilter(elapsed); 1751 pr_info("%s: idle dispatch latency %d cycles, %lluns\n", 1752 ce->engine->name, cycles >> TF_BIAS, 1753 cycles_to_ns(ce->engine, cycles)); 1754 1755 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1756 1757 err: 1758 intel_gt_set_wedged(ce->engine->gt); 1759 return err; 1760 } 1761 1762 static int measure_busy_dispatch(struct intel_context *ce) 1763 { 1764 u32 *sema = hwsp_scratch(ce); 1765 const u32 offset = hwsp_offset(ce, sema); 1766 u32 elapsed[TF_COUNT + 1], cycles; 1767 u32 *cs; 1768 int err; 1769 int i; 1770 1771 /* 1772 * Measure how long it takes for us to submit a request while the 1773 * engine is busy, polling on a semaphore in our context. With 1774 * direct submission, this will include the cost of a lite restore. 
1775 * 1776 * A: read CS_TIMESTAMP from CPU 1777 * submit request 1778 * B: read CS_TIMESTAMP on GPU 1779 * 1780 * Submission latency: B - A 1781 */ 1782 1783 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1784 struct i915_request *rq; 1785 1786 rq = i915_request_create(ce); 1787 if (IS_ERR(rq)) { 1788 err = PTR_ERR(rq); 1789 goto err; 1790 } 1791 1792 cs = intel_ring_begin(rq, 12); 1793 if (IS_ERR(cs)) { 1794 i915_request_add(rq); 1795 err = PTR_ERR(cs); 1796 goto err; 1797 } 1798 1799 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 1800 cs = emit_semaphore_poll_until(cs, offset, i); 1801 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1802 1803 intel_ring_advance(rq, cs); 1804 1805 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) { 1806 err = -EIO; 1807 goto err; 1808 } 1809 1810 preempt_disable(); 1811 local_bh_disable(); 1812 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 1813 i915_request_add(rq); 1814 local_bh_enable(); 1815 semaphore_set(sema, i - 1); 1816 preempt_enable(); 1817 } 1818 1819 wait_for(READ_ONCE(sema[i - 1]), 500); 1820 semaphore_set(sema, i - 1); 1821 1822 for (i = 1; i <= TF_COUNT; i++) { 1823 GEM_BUG_ON(sema[i] == -1); 1824 elapsed[i - 1] = sema[i] - elapsed[i]; 1825 } 1826 1827 cycles = trifilter(elapsed); 1828 pr_info("%s: busy dispatch latency %d cycles, %lluns\n", 1829 ce->engine->name, cycles >> TF_BIAS, 1830 cycles_to_ns(ce->engine, cycles)); 1831 1832 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1833 1834 err: 1835 intel_gt_set_wedged(ce->engine->gt); 1836 return err; 1837 } 1838 1839 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value) 1840 { 1841 const u32 offset = 1842 i915_ggtt_offset(engine->status_page.vma) + 1843 offset_in_page(sema); 1844 struct i915_request *rq; 1845 u32 *cs; 1846 1847 rq = i915_request_create(engine->kernel_context); 1848 if (IS_ERR(rq)) 1849 return PTR_ERR(rq); 1850 1851 cs = intel_ring_begin(rq, 4); 1852 if (IS_ERR(cs)) { 1853 i915_request_add(rq); 1854 return PTR_ERR(cs); 1855 } 1856 1857 cs = emit_semaphore_poll(cs, mode, value, offset); 1858 1859 intel_ring_advance(rq, cs); 1860 i915_request_add(rq); 1861 1862 return 0; 1863 } 1864 1865 static int measure_inter_request(struct intel_context *ce) 1866 { 1867 u32 *sema = hwsp_scratch(ce); 1868 const u32 offset = hwsp_offset(ce, sema); 1869 u32 elapsed[TF_COUNT + 1], cycles; 1870 struct i915_sw_fence *submit; 1871 int i, err; 1872 1873 /* 1874 * Measure how long it takes to advance from one request into the 1875 * next. Between each request we flush the GPU caches to memory, 1876 * update the breadcrumbs, and then invalidate those caches. 1877 * We queue up all the requests to be submitted in one batch so 1878 * it should be one set of contiguous measurements. 
1879 * 1880 * A: read CS_TIMESTAMP on GPU 1881 * advance request 1882 * B: read CS_TIMESTAMP on GPU 1883 * 1884 * Request latency: B - A 1885 */ 1886 1887 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 1888 if (err) 1889 return err; 1890 1891 submit = heap_fence_create(GFP_KERNEL); 1892 if (!submit) { 1893 semaphore_set(sema, 1); 1894 return -ENOMEM; 1895 } 1896 1897 intel_engine_flush_submission(ce->engine); 1898 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1899 struct i915_request *rq; 1900 u32 *cs; 1901 1902 rq = i915_request_create(ce); 1903 if (IS_ERR(rq)) { 1904 err = PTR_ERR(rq); 1905 goto err_submit; 1906 } 1907 1908 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit, 1909 submit, 1910 GFP_KERNEL); 1911 if (err < 0) { 1912 i915_request_add(rq); 1913 goto err_submit; 1914 } 1915 1916 cs = intel_ring_begin(rq, 4); 1917 if (IS_ERR(cs)) { 1918 i915_request_add(rq); 1919 err = PTR_ERR(cs); 1920 goto err_submit; 1921 } 1922 1923 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 1924 1925 intel_ring_advance(rq, cs); 1926 i915_request_add(rq); 1927 } 1928 local_bh_disable(); 1929 i915_sw_fence_commit(submit); 1930 local_bh_enable(); 1931 intel_engine_flush_submission(ce->engine); 1932 heap_fence_put(submit); 1933 1934 semaphore_set(sema, 1); 1935 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 1936 if (err) 1937 goto err; 1938 1939 for (i = 1; i <= TF_COUNT; i++) 1940 elapsed[i - 1] = sema[i + 1] - sema[i]; 1941 1942 cycles = trifilter(elapsed); 1943 pr_info("%s: inter-request latency %d cycles, %lluns\n", 1944 ce->engine->name, cycles >> TF_BIAS, 1945 cycles_to_ns(ce->engine, cycles)); 1946 1947 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 1948 1949 err_submit: 1950 i915_sw_fence_commit(submit); 1951 heap_fence_put(submit); 1952 semaphore_set(sema, 1); 1953 err: 1954 intel_gt_set_wedged(ce->engine->gt); 1955 return err; 1956 } 1957 1958 static int measure_context_switch(struct intel_context *ce) 1959 { 1960 u32 *sema = hwsp_scratch(ce); 1961 const u32 offset = hwsp_offset(ce, sema); 1962 struct i915_request *fence = NULL; 1963 u32 elapsed[TF_COUNT + 1], cycles; 1964 int i, j, err; 1965 u32 *cs; 1966 1967 /* 1968 * Measure how long it takes to advance from one request in one 1969 * context to a request in another context. This allows us to 1970 * measure how long the context save/restore take, along with all 1971 * the inter-context setup we require. 
1972 * 1973 * A: read CS_TIMESTAMP on GPU 1974 * switch context 1975 * B: read CS_TIMESTAMP on GPU 1976 * 1977 * Context switch latency: B - A 1978 */ 1979 1980 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0); 1981 if (err) 1982 return err; 1983 1984 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 1985 struct intel_context *arr[] = { 1986 ce, ce->engine->kernel_context 1987 }; 1988 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32); 1989 1990 for (j = 0; j < ARRAY_SIZE(arr); j++) { 1991 struct i915_request *rq; 1992 1993 rq = i915_request_create(arr[j]); 1994 if (IS_ERR(rq)) { 1995 err = PTR_ERR(rq); 1996 goto err_fence; 1997 } 1998 1999 if (fence) { 2000 err = i915_request_await_dma_fence(rq, 2001 &fence->fence); 2002 if (err) { 2003 i915_request_add(rq); 2004 goto err_fence; 2005 } 2006 } 2007 2008 cs = intel_ring_begin(rq, 4); 2009 if (IS_ERR(cs)) { 2010 i915_request_add(rq); 2011 err = PTR_ERR(cs); 2012 goto err_fence; 2013 } 2014 2015 cs = emit_timestamp_store(cs, ce, addr); 2016 addr += sizeof(u32); 2017 2018 intel_ring_advance(rq, cs); 2019 2020 i915_request_put(fence); 2021 fence = i915_request_get(rq); 2022 2023 i915_request_add(rq); 2024 } 2025 } 2026 i915_request_put(fence); 2027 intel_engine_flush_submission(ce->engine); 2028 2029 semaphore_set(sema, 1); 2030 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2031 if (err) 2032 goto err; 2033 2034 for (i = 1; i <= TF_COUNT; i++) 2035 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1]; 2036 2037 cycles = trifilter(elapsed); 2038 pr_info("%s: context switch latency %d cycles, %lluns\n", 2039 ce->engine->name, cycles >> TF_BIAS, 2040 cycles_to_ns(ce->engine, cycles)); 2041 2042 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2043 2044 err_fence: 2045 i915_request_put(fence); 2046 semaphore_set(sema, 1); 2047 err: 2048 intel_gt_set_wedged(ce->engine->gt); 2049 return err; 2050 } 2051 2052 static int measure_preemption(struct intel_context *ce) 2053 { 2054 u32 *sema = hwsp_scratch(ce); 2055 const u32 offset = hwsp_offset(ce, sema); 2056 u32 elapsed[TF_COUNT], cycles; 2057 u32 *cs; 2058 int err; 2059 int i; 2060 2061 /* 2062 * We measure two latencies while triggering preemption. The first 2063 * latency is how long it takes for us to submit a preempting request. 2064 * The second latency is how it takes for us to return from the 2065 * preemption back to the original context. 
2066 * 2067 * A: read CS_TIMESTAMP from CPU 2068 * submit preemption 2069 * B: read CS_TIMESTAMP on GPU (in preempting context) 2070 * context switch 2071 * C: read CS_TIMESTAMP on GPU (in original context) 2072 * 2073 * Preemption dispatch latency: B - A 2074 * Preemption switch latency: C - B 2075 */ 2076 2077 if (!intel_engine_has_preemption(ce->engine)) 2078 return 0; 2079 2080 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2081 u32 addr = offset + 2 * i * sizeof(u32); 2082 struct i915_request *rq; 2083 2084 rq = i915_request_create(ce); 2085 if (IS_ERR(rq)) { 2086 err = PTR_ERR(rq); 2087 goto err; 2088 } 2089 2090 cs = intel_ring_begin(rq, 12); 2091 if (IS_ERR(cs)) { 2092 i915_request_add(rq); 2093 err = PTR_ERR(cs); 2094 goto err; 2095 } 2096 2097 cs = emit_store_dw(cs, addr, -1); 2098 cs = emit_semaphore_poll_until(cs, offset, i); 2099 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32)); 2100 2101 intel_ring_advance(rq, cs); 2102 i915_request_add(rq); 2103 2104 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) { 2105 err = -EIO; 2106 goto err; 2107 } 2108 2109 rq = i915_request_create(ce->engine->kernel_context); 2110 if (IS_ERR(rq)) { 2111 err = PTR_ERR(rq); 2112 goto err; 2113 } 2114 2115 cs = intel_ring_begin(rq, 8); 2116 if (IS_ERR(cs)) { 2117 i915_request_add(rq); 2118 err = PTR_ERR(cs); 2119 goto err; 2120 } 2121 2122 cs = emit_timestamp_store(cs, ce, addr); 2123 cs = emit_store_dw(cs, offset, i); 2124 2125 intel_ring_advance(rq, cs); 2126 rq->sched.attr.priority = I915_PRIORITY_BARRIER; 2127 2128 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2129 i915_request_add(rq); 2130 } 2131 2132 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) { 2133 err = -EIO; 2134 goto err; 2135 } 2136 2137 for (i = 1; i <= TF_COUNT; i++) 2138 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1]; 2139 2140 cycles = trifilter(elapsed); 2141 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n", 2142 ce->engine->name, cycles >> TF_BIAS, 2143 cycles_to_ns(ce->engine, cycles)); 2144 2145 for (i = 1; i <= TF_COUNT; i++) 2146 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0]; 2147 2148 cycles = trifilter(elapsed); 2149 pr_info("%s: preemption switch latency %d cycles, %lluns\n", 2150 ce->engine->name, cycles >> TF_BIAS, 2151 cycles_to_ns(ce->engine, cycles)); 2152 2153 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2154 2155 err: 2156 intel_gt_set_wedged(ce->engine->gt); 2157 return err; 2158 } 2159 2160 struct signal_cb { 2161 struct dma_fence_cb base; 2162 bool seen; 2163 }; 2164 2165 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb) 2166 { 2167 struct signal_cb *s = container_of(cb, typeof(*s), base); 2168 2169 smp_store_mb(s->seen, true); /* be safe, be strong */ 2170 } 2171 2172 static int measure_completion(struct intel_context *ce) 2173 { 2174 u32 *sema = hwsp_scratch(ce); 2175 const u32 offset = hwsp_offset(ce, sema); 2176 u32 elapsed[TF_COUNT], cycles; 2177 u32 *cs; 2178 int err; 2179 int i; 2180 2181 /* 2182 * Measure how long it takes for the signal (interrupt) to be 2183 * sent from the GPU to be processed by the CPU. 
2184 * 2185 * A: read CS_TIMESTAMP on GPU 2186 * signal 2187 * B: read CS_TIMESTAMP from CPU 2188 * 2189 * Completion latency: B - A 2190 */ 2191 2192 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) { 2193 struct signal_cb cb = { .seen = false }; 2194 struct i915_request *rq; 2195 2196 rq = i915_request_create(ce); 2197 if (IS_ERR(rq)) { 2198 err = PTR_ERR(rq); 2199 goto err; 2200 } 2201 2202 cs = intel_ring_begin(rq, 12); 2203 if (IS_ERR(cs)) { 2204 i915_request_add(rq); 2205 err = PTR_ERR(cs); 2206 goto err; 2207 } 2208 2209 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1); 2210 cs = emit_semaphore_poll_until(cs, offset, i); 2211 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32)); 2212 2213 intel_ring_advance(rq, cs); 2214 2215 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb); 2216 2217 local_bh_disable(); 2218 i915_request_add(rq); 2219 local_bh_enable(); 2220 2221 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) { 2222 err = -EIO; 2223 goto err; 2224 } 2225 2226 preempt_disable(); 2227 semaphore_set(sema, i); 2228 while (!READ_ONCE(cb.seen)) 2229 cpu_relax(); 2230 2231 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP); 2232 preempt_enable(); 2233 } 2234 2235 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2); 2236 if (err) 2237 goto err; 2238 2239 for (i = 0; i < ARRAY_SIZE(elapsed); i++) { 2240 GEM_BUG_ON(sema[i + 1] == -1); 2241 elapsed[i] = elapsed[i] - sema[i + 1]; 2242 } 2243 2244 cycles = trifilter(elapsed); 2245 pr_info("%s: completion latency %d cycles, %lluns\n", 2246 ce->engine->name, cycles >> TF_BIAS, 2247 cycles_to_ns(ce->engine, cycles)); 2248 2249 return intel_gt_wait_for_idle(ce->engine->gt, HZ); 2250 2251 err: 2252 intel_gt_set_wedged(ce->engine->gt); 2253 return err; 2254 } 2255 2256 static void rps_pin(struct intel_gt *gt) 2257 { 2258 /* Pin the frequency to max */ 2259 atomic_inc(>->rps.num_waiters); 2260 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); 2261 2262 mutex_lock(>->rps.lock); 2263 intel_rps_set(>->rps, gt->rps.max_freq); 2264 mutex_unlock(>->rps.lock); 2265 } 2266 2267 static void rps_unpin(struct intel_gt *gt) 2268 { 2269 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); 2270 atomic_dec(>->rps.num_waiters); 2271 } 2272 2273 static void engine_heartbeat_disable(struct intel_engine_cs *engine) 2274 { 2275 engine->props.heartbeat_interval_ms = 0; 2276 2277 intel_engine_pm_get(engine); 2278 intel_engine_park_heartbeat(engine); 2279 } 2280 2281 static void engine_heartbeat_enable(struct intel_engine_cs *engine) 2282 { 2283 intel_engine_pm_put(engine); 2284 2285 engine->props.heartbeat_interval_ms = 2286 engine->defaults.heartbeat_interval_ms; 2287 } 2288 2289 static int perf_request_latency(void *arg) 2290 { 2291 struct drm_i915_private *i915 = arg; 2292 struct intel_engine_cs *engine; 2293 struct pm_qos_request qos; 2294 int err = 0; 2295 2296 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */ 2297 return 0; 2298 2299 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */ 2300 2301 for_each_uabi_engine(engine, i915) { 2302 struct intel_context *ce; 2303 2304 ce = intel_context_create(engine); 2305 if (IS_ERR(ce)) 2306 goto out; 2307 2308 err = intel_context_pin(ce); 2309 if (err) { 2310 intel_context_put(ce); 2311 goto out; 2312 } 2313 2314 engine_heartbeat_disable(engine); 2315 rps_pin(engine->gt); 2316 2317 if (err == 0) 2318 err = measure_semaphore_response(ce); 2319 if (err == 0) 2320 err = measure_idle_dispatch(ce); 2321 if (err == 0) 2322 err = measure_busy_dispatch(ce); 2323 if (err == 0) 2324 err 
static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}

/* Submit a request to each engine in turn, waiting for each to complete. */
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}

/* As s_sync0, but wait on the previous request so one is always in flight. */
static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);

	return err;
}

/* Submit requests round-robin across the engines without waiting. */
static int s_many(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq))
			return PTR_ERR(rq);

		i915_request_add(rq);

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return 0;
}
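
/*
 * perf_series_engines: run each submission pattern (s_sync0, s_sync1,
 * s_many) across the whole set of engines from a single thread, then
 * report per-engine busyness, runtime and walltime.
 */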
static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine) + 1;
			p->runtime = -intel_context_get_total_runtime_ns(ce);
			p->time = ktime_get();
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt;

			p->time = ktime_sub(ktime_get(), p->time);
			if (p->busy) {
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine),
						    p->busy - 1);
			}

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}

/* Per-engine worker: submit and synchronously wait on one request at a time. */
static int p_sync0(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	busy = false;
	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine);
		busy = true;
	}

	p->time = ktime_get();
	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	p->time = ktime_sub(ktime_get(), p->time);

	if (busy) {
		p->busy = ktime_sub(intel_engine_get_busy_time(engine),
				    p->busy);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}
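
/*
 * p_sync1: as p_sync0, but keep one request in flight - wait on the
 * previous request while the next one has already been submitted.
 */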
static int p_sync1(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	busy = false;
	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine);
		busy = true;
	}

	p->time = ktime_get();
	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);
	p->time = ktime_sub(ktime_get(), p->time);

	if (busy) {
		p->busy = ktime_sub(intel_engine_get_busy_time(engine),
				    p->busy);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}

/* Per-engine worker: submit requests as fast as possible without waiting. */
static int p_many(void *arg)
{
	struct perf_stats *p = arg;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		return err;
	}

	busy = false;
	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine);
		busy = true;
	}

	count = 0;
	p->time = ktime_get();
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));
	p->time = ktime_sub(ktime_get(), p->time);

	if (busy) {
		p->busy = ktime_sub(intel_engine_get_busy_time(engine),
				    p->busy);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	return err;
}
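
/*
 * perf_parallel_engines: run the p_* submission patterns concurrently,
 * one kthread per engine, and report per-engine throughput and busyness.
 */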
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}

int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return i915_subtests(tests, i915);
}