1 /*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <linux/pm_qos.h>
26 #include <linux/prime_numbers.h>
27 #include <linux/sort.h>
28
29 #include <drm/drm_print.h>
30
31 #include "gem/i915_gem_internal.h"
32 #include "gem/i915_gem_pm.h"
33 #include "gem/selftests/mock_context.h"
34 #include "gt/intel_engine_heartbeat.h"
35 #include "gt/intel_engine_pm.h"
36 #include "gt/intel_engine_user.h"
37 #include "gt/intel_gt.h"
38 #include "gt/intel_gt_clock_utils.h"
39 #include "gt/intel_gt_requests.h"
40 #include "gt/selftest_engine_heartbeat.h"
41
42 #include "i915_random.h"
43 #include "i915_selftest.h"
44 #include "i915_wait_util.h"
45 #include "igt_flush_test.h"
46 #include "igt_live_test.h"
47 #include "igt_spinner.h"
48 #include "lib_sw_fence.h"
49 #include "mock_drm.h"
50 #include "mock_gem_device.h"
51
num_uabi_engines(struct drm_i915_private * i915)52 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
53 {
54 struct intel_engine_cs *engine;
55 unsigned int count;
56
57 count = 0;
58 for_each_uabi_engine(engine, i915)
59 count++;
60
61 return count;
62 }
63
rcs0(struct drm_i915_private * i915)64 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
65 {
66 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
67 }
68
igt_add_request(void * arg)69 static int igt_add_request(void *arg)
70 {
71 struct drm_i915_private *i915 = arg;
72 struct i915_request *request;
73
74 /* Basic preliminary test to create a request and let it loose! */
75
76 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
77 if (IS_ERR(request))
78 return PTR_ERR(request);
79
80 i915_request_add(request);
81
82 return 0;
83 }
84
static int igt_wait_request(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, then wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (IS_ERR(request))
		return PTR_ERR(request);

	/* Keep our own reference so the request outlives i915_request_add(). */
	i915_request_get(request);

	/* Before submission, both a busy-query and a timed wait must time out. */
	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_wait(request, 0, T) != -ETIME) {
		pr_err("request wait succeeded (expected timeout before submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed before submit!!\n");
		goto out_request;
	}

	i915_request_add(request);

	/* After submission the mock request still takes T to complete... */
	if (i915_request_wait(request, 0, 0) != -ETIME) {
		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
		goto out_request;
	}

	if (i915_request_completed(request)) {
		pr_err("request completed immediately!\n");
		goto out_request;
	}

	/* ...so a wait shorter than T must still time out... */
	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
		pr_err("request wait succeeded (expected timeout!)\n");
		goto out_request;
	}

	/* ...while a wait of a full period must succeed. */
	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out!\n");
		goto out_request;
	}

	if (!i915_request_completed(request)) {
		pr_err("request not complete after waiting!\n");
		goto out_request;
	}

	/* Waiting on an already-completed request must return immediately. */
	if (i915_request_wait(request, 0, T) == -ETIME) {
		pr_err("request wait timed out when already complete!\n");
		goto out_request;
	}

	err = 0;
out_request:
	i915_request_put(request);
	mock_device_flush(i915);
	return err;
}
153
static int igt_fence_wait(void *arg)
{
	const long T = HZ / 4;
	struct drm_i915_private *i915 = arg;
	struct i915_request *request;
	int err = -EINVAL;

	/* Submit a request, treat it as a fence and wait upon it */

	request = mock_request(rcs0(i915)->kernel_context, T);
	if (IS_ERR(request))
		return PTR_ERR(request);

	/* An unsubmitted fence can never signal, so the wait must time out. */
	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
		pr_err("fence wait success before submit (expected timeout)!\n");
		goto out;
	}

	i915_request_add(request);

	if (dma_fence_is_signaled(&request->fence)) {
		pr_err("fence signaled immediately!\n");
		goto out;
	}

	/* The mock request takes T to complete; T/2 is not long enough. */
	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
		pr_err("fence wait success after submit (expected timeout)!\n");
		goto out;
	}

	/* A full period must see the fence signal (positive jiffies left). */
	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out (expected success)!\n");
		goto out;
	}

	if (!dma_fence_is_signaled(&request->fence)) {
		pr_err("fence unsignaled after waiting!\n");
		goto out;
	}

	/* Waiting on an already-signaled fence must succeed immediately. */
	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
		pr_err("fence wait timed out when complete (expected success)!\n");
		goto out;
	}

	err = 0;
out:
	mock_device_flush(i915);
	return err;
}
204
/*
 * Check that a later "vip" request can overtake an earlier slow request
 * by manually cancelling and resubmitting the slow one (simulating
 * preemption on the mock engine).
 */
static int igt_request_rewind(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *request, *vip;
	struct i915_gem_context *ctx[2];
	struct intel_context *ce;
	int err = -EINVAL;

	ctx[0] = mock_context(i915, "A");
	if (!ctx[0]) {
		err = -ENOMEM;
		goto err_ctx_0;
	}

	/* A slow request (2s delay on the mock engine) from context A. */
	ce = i915_gem_context_get_engine(ctx[0], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	request = mock_request(ce, 2 * HZ);
	intel_context_put(ce);
	if (IS_ERR(request)) {
		err = PTR_ERR(request);
		goto err_context_0;
	}

	i915_request_get(request);
	i915_request_add(request);

	ctx[1] = mock_context(i915, "B");
	if (!ctx[1]) {
		err = -ENOMEM;
		goto err_ctx_1;
	}

	/* An instantly-completing request from context B. */
	ce = i915_gem_context_get_engine(ctx[1], RCS0);
	GEM_BUG_ON(IS_ERR(ce));
	vip = mock_request(ce, 0);
	intel_context_put(ce);
	if (IS_ERR(vip)) {
		err = PTR_ERR(vip);
		goto err_context_1;
	}

	/* Simulate preemption by manual reordering */
	if (!mock_cancel_request(request)) {
		pr_err("failed to cancel request (already executed)!\n");
		i915_request_add(vip);
		goto err_context_1;
	}
	i915_request_get(vip);
	i915_request_add(vip);
	/* Resubmit the slow request behind the vip request. */
	rcu_read_lock();
	request->engine->submit_request(request);
	rcu_read_unlock();


	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
		pr_err("timed out waiting for high priority request\n");
		goto err;
	}

	/* The reordered slow request must still be pending. */
	if (i915_request_completed(request)) {
		pr_err("low priority request already completed\n");
		goto err;
	}

	err = 0;
err:
	i915_request_put(vip);
err_context_1:
	mock_context_close(ctx[1]);
err_ctx_1:
	i915_request_put(request);
err_context_0:
	mock_context_close(ctx[0]);
err_ctx_0:
	mock_device_flush(i915);
	return err;
}
282
/*
 * Shared parameters and result counters for the breadcrumbs smoketest
 * worker threads (see __igt_breadcrumbs_smoketest()).
 */
struct smoketest {
	struct intel_engine_cs *engine;		/* engine all requests target */
	struct i915_gem_context **contexts;	/* pool of ncontexts contexts */
	atomic_long_t num_waits, num_fences;	/* totals accumulated by workers */
	int ncontexts, max_batch;		/* pool size; max requests per batch */
	struct i915_request *(*request_alloc)(struct intel_context *ce);
};
290
/* request_alloc callback for the mock backend: a zero-delay mock request. */
static struct i915_request *
__mock_request_alloc(struct intel_context *ce)
{
	struct i915_request *rq = mock_request(ce, 0);

	return rq;
}
296
/* request_alloc callback for live hardware: a real request on @ce. */
static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	struct i915_request *rq = intel_context_create_request(ce);

	return rq;
}
302
/* Per-worker state for one breadcrumbs smoketest thread. */
struct smoke_thread {
	struct kthread_worker *worker;	/* kthread worker running this work */
	struct kthread_work work;	/* runs __igt_breadcrumbs_smoketest() */
	struct smoketest *t;		/* shared parameters/counters */
	bool stop;			/* set by the parent to stop the loop */
	int result;			/* 0 or -errno reported back */
};
310
static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
{
	struct smoke_thread *thread = container_of(work, typeof(*thread), work);
	struct smoketest *t = thread->t;
	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
	const unsigned int total = 4 * t->ncontexts + 1;
	unsigned int num_waits = 0, num_fences = 0;
	struct i915_request **requests;
	I915_RND_STATE(prng);
	unsigned int *order;
	int err = 0;

	/*
	 * A very simple test to catch the most egregious of list handling bugs.
	 *
	 * At its heart, we simply create oodles of requests running across
	 * multiple kthreads and enable signaling on them, for the sole purpose
	 * of stressing our breadcrumb handling. The only inspection we do is
	 * that the fences were marked as signaled.
	 */

	requests = kzalloc_objs(*requests, total);
	if (!requests) {
		thread->result = -ENOMEM;
		return;
	}

	/* Random permutation used to pick contexts in shuffled order. */
	order = i915_random_order(total, &prng);
	if (!order) {
		err = -ENOMEM;
		goto out_requests;
	}

	while (!READ_ONCE(thread->stop)) {
		struct i915_sw_fence *submit, *wait;
		unsigned int n, count;

		/*
		 * Two sw_fences bracket each batch: "submit" holds back
		 * execution of every request until the whole batch is built,
		 * "wait" fires once every request's fence has signaled.
		 */
		submit = heap_fence_create(GFP_KERNEL);
		if (!submit) {
			err = -ENOMEM;
			break;
		}

		wait = heap_fence_create(GFP_KERNEL);
		if (!wait) {
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			err = -ENOMEM;
			break;
		}

		i915_random_reorder(order, total, &prng);
		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);

		for (n = 0; n < count; n++) {
			struct i915_gem_context *ctx =
				t->contexts[order[n] % t->ncontexts];
			struct i915_request *rq;
			struct intel_context *ce;

			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
			GEM_BUG_ON(IS_ERR(ce));
			rq = t->request_alloc(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				count = n; /* only clean up what we queued */
				break;
			}

			/* Gate the request's submission on the batch fence. */
			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
							       submit,
							       GFP_KERNEL);

			requests[n] = i915_request_get(rq);
			i915_request_add(rq);

			if (err >= 0)
				err = i915_sw_fence_await_dma_fence(wait,
								    &rq->fence,
								    0,
								    GFP_KERNEL);

			if (err < 0) {
				i915_request_put(rq);
				count = n;
				break;
			}
		}

		/* Release the batch for execution, then arm the wait fence. */
		i915_sw_fence_commit(submit);
		i915_sw_fence_commit(wait);

		if (!wait_event_timeout(wait->wait,
					i915_sw_fence_done(wait),
					5 * HZ)) {
			struct i915_request *rq = requests[count - 1];

			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
			       atomic_read(&wait->pending), count,
			       rq->fence.context, rq->fence.seqno,
			       t->engine->name);
			GEM_TRACE_DUMP();

			/* Wedge the GT to force-complete the stuck requests. */
			intel_gt_set_wedged(t->engine->gt);
			GEM_BUG_ON(!i915_request_completed(rq));
			i915_sw_fence_wait(wait);
			err = -EIO;
		}

		/* The sole correctness check: every fence must have signaled. */
		for (n = 0; n < count; n++) {
			struct i915_request *rq = requests[n];

			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				      &rq->fence.flags)) {
				pr_err("%llu:%llu was not signaled!\n",
				       rq->fence.context, rq->fence.seqno);
				err = -EINVAL;
			}

			i915_request_put(rq);
		}

		heap_fence_put(wait);
		heap_fence_put(submit);

		if (err < 0)
			break;

		num_fences += count;
		num_waits++;

		cond_resched();
	}

	atomic_long_add(num_fences, &t->num_fences);
	atomic_long_add(num_waits, &t->num_waits);

	kfree(order);
out_requests:
	kfree(requests);
	thread->result = err;
}
454
static int mock_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct smoketest t = {
		.engine = rcs0(i915),
		.ncontexts = 1024,
		.max_batch = 1024,
		.request_alloc = __mock_request_alloc
	};
	unsigned int ncpus = num_online_cpus();
	struct smoke_thread *threads;
	unsigned int n;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 */

	threads = kzalloc_objs(*threads, ncpus);
	if (!threads)
		return -ENOMEM;

	t.contexts = kzalloc_objs(*t.contexts, t.ncontexts);
	if (!t.contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < t.ncontexts; n++) {
		t.contexts[n] = mock_context(t.engine->i915, "mock");
		if (!t.contexts[n]) {
			ret = -ENOMEM;
			goto out_contexts;
		}
	}

	/* One worker per online CPU, each hammering the same engine. */
	for (n = 0; n < ncpus; n++) {
		struct kthread_worker *worker;

		worker = kthread_run_worker(0, "igt/%d", n);
		if (IS_ERR(worker)) {
			ret = PTR_ERR(worker);
			ncpus = n; /* only join the workers we started */
			break;
		}

		threads[n].worker = worker;
		threads[n].t = &t;
		threads[n].stop = false;
		threads[n].result = 0;

		kthread_init_work(&threads[n].work,
				  __igt_breadcrumbs_smoketest);
		kthread_queue_work(worker, &threads[n].work);
	}

	/* Let the workers run for the selftest timeout period. */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

	for (n = 0; n < ncpus; n++) {
		int err;

		WRITE_ONCE(threads[n].stop, true);
		kthread_flush_work(&threads[n].work);
		err = READ_ONCE(threads[n].result);
		if (err < 0 && !ret)
			ret = err; /* report the first failure */

		kthread_destroy_worker(threads[n].worker);
	}
	pr_info("Completed %lu waits for %lu fence across %d cpus\n",
		atomic_long_read(&t.num_waits),
		atomic_long_read(&t.num_fences),
		ncpus);

out_contexts:
	for (n = 0; n < t.ncontexts; n++) {
		if (!t.contexts[n])
			break;
		mock_context_close(t.contexts[n]);
	}
	kfree(t.contexts);
out_threads:
	kfree(threads);
	return ret;
}
542
/* Entry point for the request selftests run against the mock GEM device. */
int i915_request_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_add_request),
		SUBTEST(igt_wait_request),
		SUBTEST(igt_fence_wait),
		SUBTEST(igt_request_rewind),
		SUBTEST(mock_breadcrumbs_smoketest),
	};
	struct drm_i915_private *i915;
	intel_wakeref_t wakeref;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	/* Hold runtime pm across the subtests to mimic an active device. */
	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
		err = i915_subtests(tests, i915);

	mock_destroy_device(i915);

	return err;
}
567
live_nop_request(void * arg)568 static int live_nop_request(void *arg)
569 {
570 struct drm_i915_private *i915 = arg;
571 struct intel_engine_cs *engine;
572 struct igt_live_test t;
573 int err = -ENODEV;
574
575 /*
576 * Submit various sized batches of empty requests, to each engine
577 * (individually), and wait for the batch to complete. We can check
578 * the overhead of submitting requests to the hardware.
579 */
580
581 for_each_uabi_engine(engine, i915) {
582 unsigned long n, prime;
583 IGT_TIMEOUT(end_time);
584 ktime_t times[2] = {};
585
586 err = igt_live_test_begin(&t, i915, __func__, engine->name);
587 if (err)
588 return err;
589
590 intel_engine_pm_get(engine);
591 for_each_prime_number_from(prime, 1, 8192) {
592 struct i915_request *request = NULL;
593
594 times[1] = ktime_get_raw();
595
596 for (n = 0; n < prime; n++) {
597 i915_request_put(request);
598 request = i915_request_create(engine->kernel_context);
599 if (IS_ERR(request))
600 return PTR_ERR(request);
601
602 /*
603 * This space is left intentionally blank.
604 *
605 * We do not actually want to perform any
606 * action with this request, we just want
607 * to measure the latency in allocation
608 * and submission of our breadcrumbs -
609 * ensuring that the bare request is sufficient
610 * for the system to work (i.e. proper HEAD
611 * tracking of the rings, interrupt handling,
612 * etc). It also gives us the lowest bounds
613 * for latency.
614 */
615
616 i915_request_get(request);
617 i915_request_add(request);
618 }
619 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
620 i915_request_put(request);
621
622 times[1] = ktime_sub(ktime_get_raw(), times[1]);
623 if (prime == 1)
624 times[0] = times[1];
625
626 if (__igt_timeout(end_time, NULL))
627 break;
628 }
629 intel_engine_pm_put(engine);
630
631 err = igt_live_test_end(&t);
632 if (err)
633 return err;
634
635 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
636 engine->name,
637 ktime_to_ns(times[0]),
638 prime, div64_u64(ktime_to_ns(times[1]), prime));
639 }
640
641 return err;
642 }
643
/*
 * Cancel a request before it is submitted: the fence must resolve quickly
 * with the injected -EINTR error.
 */
static int __cancel_inactive(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	/* Cancel while still inactive, i.e. before i915_request_add(). */
	pr_debug("%s: Cancelling inactive request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel inactive request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	/* The cancellation error must be propagated to the fence. */
	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}
696
/*
 * Cancel a request while it is actively spinning on the GPU: the spinner
 * must be stopped and the fence must carry the injected -EINTR error.
 */
static int __cancel_active(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}

	pr_debug("%s: Cancelling active request\n", engine->name);
	i915_request_get(rq);
	i915_request_add(rq);
	/* Ensure the request is truly executing before we cancel it. */
	if (!igt_wait_for_spinner(&spin, rq)) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("Failed to start spinner on %s\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}
	i915_request_cancel(rq, -EINTR);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);

		pr_err("%s: Failed to cancel active request\n", engine->name);
		intel_engine_dump(engine, &p, "%s\n", engine->name);
		err = -ETIME;
		goto out_rq;
	}

	/* The cancellation error must be propagated to the fence. */
	if (rq->fence.error != -EINTR) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}
757
/*
 * Cancel a request after it has already completed: cancellation must be
 * a no-op and leave the fence error untouched (zero).
 */
static int __cancel_completed(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	if (igt_spinner_init(&spin, engine->gt))
		return -ENOMEM;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		err = PTR_ERR(ce);
		goto out_spin;
	}

	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_ce;
	}
	/* End the spin before submission so the request completes at once. */
	igt_spinner_end(&spin);
	i915_request_get(rq);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
		err = -ETIME;
		goto out_rq;
	}

	pr_debug("%s: Cancelling completed request\n", engine->name);
	i915_request_cancel(rq, -EINTR);
	/* A completed request must not pick up the cancellation error. */
	if (rq->fence.error) {
		pr_err("%s: fence not cancelled (%u)\n",
		       engine->name, rq->fence.error);
		err = -EINVAL;
	}

out_rq:
	i915_request_put(rq);
out_ce:
	intel_context_put(ce);
out_spin:
	igt_spinner_fini(&spin);
	if (err)
		pr_err("%s: %s error %d\n", __func__, engine->name, err);
	return err;
}
806
807 /*
808 * Test to prove a non-preemptable request can be cancelled and a subsequent
809 * request on the same context can successfully complete after cancellation.
810 *
811 * Testing methodology is to create a non-preemptible request and submit it,
812 * wait for spinner to start, create a NOP request and submit it, cancel the
813 * spinner, wait for spinner to complete and verify it failed with an error,
814 * finally wait for NOP request to complete verify it succeeded without an
815 * error. Preemption timeout also reduced / restored so test runs in a timely
816 * maner.
817 */
__cancel_reset(struct drm_i915_private * i915,struct intel_engine_cs * engine)818 static int __cancel_reset(struct drm_i915_private *i915,
819 struct intel_engine_cs *engine)
820 {
821 struct intel_context *ce;
822 struct igt_spinner spin;
823 struct i915_request *rq, *nop;
824 unsigned long preempt_timeout_ms;
825 int err = 0;
826
827 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
828 !intel_has_reset_engine(engine->gt))
829 return 0;
830
831 preempt_timeout_ms = engine->props.preempt_timeout_ms;
832 engine->props.preempt_timeout_ms = 100;
833
834 if (igt_spinner_init(&spin, engine->gt))
835 goto out_restore;
836
837 ce = intel_context_create(engine);
838 if (IS_ERR(ce)) {
839 err = PTR_ERR(ce);
840 goto out_spin;
841 }
842
843 rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
844 if (IS_ERR(rq)) {
845 err = PTR_ERR(rq);
846 goto out_ce;
847 }
848
849 pr_debug("%s: Cancelling active non-preemptable request\n",
850 engine->name);
851 i915_request_get(rq);
852 i915_request_add(rq);
853 if (!igt_wait_for_spinner(&spin, rq)) {
854 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
855
856 pr_err("Failed to start spinner on %s\n", engine->name);
857 intel_engine_dump(engine, &p, "%s\n", engine->name);
858 err = -ETIME;
859 goto out_rq;
860 }
861
862 nop = intel_context_create_request(ce);
863 if (IS_ERR(nop))
864 goto out_rq;
865 i915_request_get(nop);
866 i915_request_add(nop);
867
868 i915_request_cancel(rq, -EINTR);
869
870 if (i915_request_wait(rq, 0, HZ) < 0) {
871 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
872
873 pr_err("%s: Failed to cancel hung request\n", engine->name);
874 intel_engine_dump(engine, &p, "%s\n", engine->name);
875 err = -ETIME;
876 goto out_nop;
877 }
878
879 if (rq->fence.error != -EINTR) {
880 pr_err("%s: fence not cancelled (%u)\n",
881 engine->name, rq->fence.error);
882 err = -EINVAL;
883 goto out_nop;
884 }
885
886 if (i915_request_wait(nop, 0, HZ) < 0) {
887 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
888
889 pr_err("%s: Failed to complete nop request\n", engine->name);
890 intel_engine_dump(engine, &p, "%s\n", engine->name);
891 err = -ETIME;
892 goto out_nop;
893 }
894
895 if (nop->fence.error != 0) {
896 pr_err("%s: Nop request errored (%u)\n",
897 engine->name, nop->fence.error);
898 err = -EINVAL;
899 }
900
901 out_nop:
902 i915_request_put(nop);
903 out_rq:
904 i915_request_put(rq);
905 out_ce:
906 intel_context_put(ce);
907 out_spin:
908 igt_spinner_fini(&spin);
909 out_restore:
910 engine->props.preempt_timeout_ms = preempt_timeout_ms;
911 if (err)
912 pr_err("%s: %s error %d\n", __func__, engine->name, err);
913 return err;
914 }
915
static int live_cancel_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;

	/*
	 * Check cancellation of requests. We expect to be able to immediately
	 * cancel active requests, even if they are currently on the GPU.
	 */

	for_each_uabi_engine(engine, i915) {
		struct igt_live_test t;
		int err, err2;

		/* Cancellation of an executing request requires preemption. */
		if (!intel_engine_has_preemption(engine))
			continue;

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			return err;

		err = __cancel_inactive(engine);
		if (err == 0)
			err = __cancel_active(engine);
		if (err == 0)
			err = __cancel_completed(engine);

		err2 = igt_live_test_end(&t);
		if (err)
			return err;
		if (err2)
			return err2;

		/* Expects reset so call outside of igt_live_test_* */
		err = __cancel_reset(i915, engine);
		if (err)
			return err;

		if (igt_flush_test(i915))
			return -EIO;
	}

	return 0;
}
960
/*
 * Build and pin a minimal batch buffer containing just MI_BATCH_BUFFER_END,
 * ready for submission on @gt. Returns the pinned vma or an ERR_PTR.
 */
static struct i915_vma *empty_batch(struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 *cmd;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto err;
	}

	/* The batch terminates immediately: a single end-of-batch command. */
	*cmd = MI_BATCH_BUFFER_END;

	__i915_gem_object_flush_map(obj, 0, 64);
	i915_gem_object_unpin_map(obj);

	intel_gt_chipset_flush(gt);

	vma = i915_vma_instance(obj, gt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		goto err;

	/* Force the wait now to avoid including it in the benchmark */
	err = i915_vma_sync(vma);
	if (err)
		goto err_pin;

	return vma;

err_pin:
	i915_vma_unpin(vma);
err:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}
1008
emit_bb_start(struct i915_request * rq,struct i915_vma * batch)1009 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch)
1010 {
1011 return rq->engine->emit_bb_start(rq,
1012 i915_vma_offset(batch),
1013 i915_vma_size(batch),
1014 0);
1015 }
1016
1017 static struct i915_request *
empty_request(struct intel_engine_cs * engine,struct i915_vma * batch)1018 empty_request(struct intel_engine_cs *engine,
1019 struct i915_vma *batch)
1020 {
1021 struct i915_request *request;
1022 int err;
1023
1024 request = i915_request_create(engine->kernel_context);
1025 if (IS_ERR(request))
1026 return request;
1027
1028 err = emit_bb_start(request, batch);
1029 if (err)
1030 goto out_request;
1031
1032 i915_request_get(request);
1033 out_request:
1034 i915_request_add(request);
1035 return err ? ERR_PTR(err) : request;
1036 }
1037
static int live_empty_request(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	int err;

	/*
	 * Submit various sized batches of empty requests, to each engine
	 * (individually), and wait for the batch to complete. We can check
	 * the overhead of submitting requests to the hardware.
	 */

	for_each_uabi_engine(engine, i915) {
		IGT_TIMEOUT(end_time);
		struct i915_request *request;
		struct i915_vma *batch;
		unsigned long n, prime;
		ktime_t times[2] = {};

		batch = empty_batch(engine->gt);
		if (IS_ERR(batch))
			return PTR_ERR(batch);

		err = igt_live_test_begin(&t, i915, __func__, engine->name);
		if (err)
			goto out_batch;

		intel_engine_pm_get(engine);

		/* Warmup / preload */
		request = empty_request(engine, batch);
		if (IS_ERR(request)) {
			err = PTR_ERR(request);
			intel_engine_pm_put(engine);
			goto out_batch;
		}
		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);

		for_each_prime_number_from(prime, 1, 8192) {
			times[1] = ktime_get_raw();

			/* Submit prime requests back-to-back, keep the last. */
			for (n = 0; n < prime; n++) {
				i915_request_put(request);
				request = empty_request(engine, batch);
				if (IS_ERR(request)) {
					err = PTR_ERR(request);
					intel_engine_pm_put(engine);
					goto out_batch;
				}
			}
			/* Requests retire in order: waiting on last is enough. */
			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);

			times[1] = ktime_sub(ktime_get_raw(), times[1]);
			if (prime == 1)
				times[0] = times[1];

			if (__igt_timeout(end_time, NULL))
				break;
		}
		i915_request_put(request);
		intel_engine_pm_put(engine);

		err = igt_live_test_end(&t);
		if (err)
			goto out_batch;

		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
			engine->name,
			ktime_to_ns(times[0]),
			prime, div64_u64(ktime_to_ns(times[1]), prime));
out_batch:
		i915_vma_unpin(batch);
		i915_vma_put(batch);
		if (err)
			break;
	}

	return err;
}
1118
/*
 * Build and pin a batch buffer that jumps back to its own start, i.e. an
 * infinite loop on the GPU, terminated later by recursive_batch_resolve()
 * rewriting the first dword to MI_BATCH_BUFFER_END.
 */
static struct i915_vma *recursive_batch(struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	const int ver = GRAPHICS_VER(gt->i915);
	struct i915_vma *vma;
	u32 *cmd;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, gt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		goto err;

	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto err;
	}

	/* MI_BATCH_BUFFER_START encoding differs per graphics version. */
	if (ver >= 8) {
		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*cmd++ = lower_32_bits(i915_vma_offset(vma));
		*cmd++ = upper_32_bits(i915_vma_offset(vma));
	} else if (ver >= 6) {
		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
		*cmd++ = lower_32_bits(i915_vma_offset(vma));
	} else {
		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
		*cmd++ = lower_32_bits(i915_vma_offset(vma));
	}
	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */

	__i915_gem_object_flush_map(obj, 0, 64);
	i915_gem_object_unpin_map(obj);

	intel_gt_chipset_flush(gt);

	return vma;

err:
	i915_gem_object_put(obj);
	return ERR_PTR(err);
}
1171
recursive_batch_resolve(struct i915_vma * batch)1172 static int recursive_batch_resolve(struct i915_vma *batch)
1173 {
1174 u32 *cmd;
1175
1176 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1177 if (IS_ERR(cmd))
1178 return PTR_ERR(cmd);
1179
1180 *cmd = MI_BATCH_BUFFER_END;
1181
1182 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1183 i915_gem_object_unpin_map(batch->obj);
1184
1185 intel_gt_chipset_flush(batch->vm->gt);
1186
1187 return 0;
1188 }
1189
static int live_all_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	struct i915_request **request;
	struct igt_live_test t;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines simultaneously. We
	 * send a recursive batch to each engine - checking that we don't
	 * block doing so, and that they don't complete too soon.
	 */

	request = kzalloc_objs(*request, nengines);
	if (!request)
		return -ENOMEM;

	err = igt_live_test_begin(&t, i915, __func__, "");
	if (err)
		goto out_free;

	/* Phase 1: submit one self-spinning batch per engine */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct i915_vma *batch;

		batch = recursive_batch(engine->gt);
		if (IS_ERR(batch)) {
			err = PTR_ERR(batch);
			pr_err("%s: Unable to create batch, err=%d\n",
			       __func__, err);
			goto out_free;
		}

		i915_vma_lock(batch);
		request[idx] = intel_engine_create_kernel_request(engine);
		if (IS_ERR(request[idx])) {
			err = PTR_ERR(request[idx]);
			pr_err("%s: Request allocation failed with err=%d\n",
			       __func__, err);
			goto out_unlock;
		}
		GEM_BUG_ON(request[idx]->context->vm != batch->vm);

		err = i915_vma_move_to_active(batch, request[idx], 0);
		GEM_BUG_ON(err);

		err = emit_bb_start(request[idx], batch);
		GEM_BUG_ON(err);
		request[idx]->batch = batch;

		/* Keep a reference for the checks/cleanup after i915_request_add() */
		i915_request_get(request[idx]);
		i915_request_add(request[idx]);
		idx++;
out_unlock:
		/* Shared exit: drop the vma lock before cleanup on error */
		i915_vma_unlock(batch);
		if (err)
			goto out_request;
	}

	/* Phase 2: each batch is still spinning, so nothing may have completed */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		if (i915_request_completed(request[idx])) {
			pr_err("%s(%s): request completed too early!\n",
			       __func__, engine->name);
			err = -EINVAL;
			goto out_request;
		}
		idx++;
	}

	/* Phase 3: rewrite each batch to MI_BATCH_BUFFER_END so it terminates */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		err = recursive_batch_resolve(request[idx]->batch);
		if (err) {
			pr_err("%s: failed to resolve batch, err=%d\n",
			       __func__, err);
			goto out_request;
		}
		idx++;
	}

	/* Phase 4: wait for completion and release per-request resources */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct i915_request *rq = request[idx];
		long timeout;

		timeout = i915_request_wait(rq, 0,
					    MAX_SCHEDULE_TIMEOUT);
		if (timeout < 0) {
			err = timeout;
			pr_err("%s: error waiting for request on %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_request;
		}

		GEM_BUG_ON(!i915_request_completed(rq));
		i915_vma_unpin(rq->batch);
		i915_vma_put(rq->batch);
		i915_request_put(rq);
		request[idx] = NULL; /* mark as released for the error path */
		idx++;
	}

	err = igt_live_test_end(&t);

out_request:
	/* Error path: release whatever requests/batches are still held */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct i915_request *rq = request[idx];

		if (!rq)
			continue;

		if (rq->batch) {
			i915_vma_unpin(rq->batch);
			i915_vma_put(rq->batch);
		}
		i915_request_put(rq);
		idx++;
	}
out_free:
	kfree(request);
	return err;
}
1317
static int live_sequential_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	struct i915_request **request;
	struct i915_request *prev = NULL;
	struct intel_engine_cs *engine;
	struct igt_live_test t;
	unsigned int idx;
	int err;

	/*
	 * Check we can submit requests to all engines sequentially, such
	 * that each successive request waits for the earlier ones. This
	 * tests that we don't execute requests out of order, even though
	 * they are running on independent engines.
	 */

	request = kzalloc_objs(*request, nengines);
	if (!request)
		return -ENOMEM;

	err = igt_live_test_begin(&t, i915, __func__, "");
	if (err)
		goto out_free;

	/* Submit one spinning batch per engine, chained via dma-fences */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct i915_vma *batch;

		batch = recursive_batch(engine->gt);
		if (IS_ERR(batch)) {
			err = PTR_ERR(batch);
			pr_err("%s: Unable to create batch for %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_free;
		}

		i915_vma_lock(batch);
		request[idx] = intel_engine_create_kernel_request(engine);
		if (IS_ERR(request[idx])) {
			err = PTR_ERR(request[idx]);
			pr_err("%s: Request allocation failed for %s with err=%d\n",
			       __func__, engine->name, err);
			goto out_unlock;
		}
		GEM_BUG_ON(request[idx]->context->vm != batch->vm);

		/* Order this request after the previous engine's request */
		if (prev) {
			err = i915_request_await_dma_fence(request[idx],
							   &prev->fence);
			if (err) {
				i915_request_add(request[idx]);
				pr_err("%s: Request await failed for %s with err=%d\n",
				       __func__, engine->name, err);
				goto out_unlock;
			}
		}

		err = i915_vma_move_to_active(batch, request[idx], 0);
		GEM_BUG_ON(err);

		err = emit_bb_start(request[idx], batch);
		GEM_BUG_ON(err);
		request[idx]->batch = batch;

		i915_request_get(request[idx]);
		i915_request_add(request[idx]);

		prev = request[idx];
		idx++;

out_unlock:
		/* Shared exit: drop the vma lock before cleanup on error */
		i915_vma_unlock(batch);
		if (err)
			goto out_request;
	}

	/*
	 * Resolve and wait for each request in submission order; none may
	 * have completed before its batch is rewritten to terminate.
	 */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		long timeout;

		if (i915_request_completed(request[idx])) {
			pr_err("%s(%s): request completed too early!\n",
			       __func__, engine->name);
			err = -EINVAL;
			goto out_request;
		}

		err = recursive_batch_resolve(request[idx]->batch);
		if (err) {
			pr_err("%s: failed to resolve batch, err=%d\n",
			       __func__, err);
			goto out_request;
		}

		timeout = i915_request_wait(request[idx], 0,
					    MAX_SCHEDULE_TIMEOUT);
		if (timeout < 0) {
			err = timeout;
			pr_err("%s: error waiting for request on %s, err=%d\n",
			       __func__, engine->name, err);
			goto out_request;
		}

		GEM_BUG_ON(!i915_request_completed(request[idx]));
		idx++;
	}

	err = igt_live_test_end(&t);

out_request:
	/*
	 * Cleanup: terminate any still-spinning batches (best effort) and
	 * drop the request/batch references taken above.
	 */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		u32 *cmd;

		if (!request[idx])
			break;

		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
						       I915_MAP_WC);
		if (!IS_ERR(cmd)) {
			*cmd = MI_BATCH_BUFFER_END;

			__i915_gem_object_flush_map(request[idx]->batch->obj,
						    0, sizeof(*cmd));
			i915_gem_object_unpin_map(request[idx]->batch->obj);

			intel_gt_chipset_flush(engine->gt);
		}

		i915_vma_put(request[idx]->batch);
		i915_request_put(request[idx]);
		idx++;
	}
out_free:
	kfree(request);
	return err;
}
1457
/* Per-engine worker state for the live_parallel_engines subtests. */
struct parallel_thread {
	struct kthread_worker *worker;	/* dedicated kthread for this engine */
	struct kthread_work work;	/* queued test body (__live_parallel_*) */
	struct intel_engine_cs *engine;	/* engine exercised by this worker */
	int result;			/* 0 on success, negative errno on failure */
};
1464
__live_parallel_engine1(struct kthread_work * work)1465 static void __live_parallel_engine1(struct kthread_work *work)
1466 {
1467 struct parallel_thread *thread =
1468 container_of(work, typeof(*thread), work);
1469 struct intel_engine_cs *engine = thread->engine;
1470 IGT_TIMEOUT(end_time);
1471 unsigned long count;
1472 int err = 0;
1473
1474 count = 0;
1475 intel_engine_pm_get(engine);
1476 do {
1477 struct i915_request *rq;
1478
1479 rq = i915_request_create(engine->kernel_context);
1480 if (IS_ERR(rq)) {
1481 err = PTR_ERR(rq);
1482 break;
1483 }
1484
1485 i915_request_get(rq);
1486 i915_request_add(rq);
1487
1488 err = 0;
1489 if (i915_request_wait(rq, 0, HZ) < 0)
1490 err = -ETIME;
1491 i915_request_put(rq);
1492 if (err)
1493 break;
1494
1495 count++;
1496 } while (!__igt_timeout(end_time, NULL));
1497 intel_engine_pm_put(engine);
1498
1499 pr_info("%s: %lu request + sync\n", engine->name, count);
1500 thread->result = err;
1501 }
1502
__live_parallel_engineN(struct kthread_work * work)1503 static void __live_parallel_engineN(struct kthread_work *work)
1504 {
1505 struct parallel_thread *thread =
1506 container_of(work, typeof(*thread), work);
1507 struct intel_engine_cs *engine = thread->engine;
1508 IGT_TIMEOUT(end_time);
1509 unsigned long count;
1510 int err = 0;
1511
1512 count = 0;
1513 intel_engine_pm_get(engine);
1514 do {
1515 struct i915_request *rq;
1516
1517 rq = i915_request_create(engine->kernel_context);
1518 if (IS_ERR(rq)) {
1519 err = PTR_ERR(rq);
1520 break;
1521 }
1522
1523 i915_request_add(rq);
1524 count++;
1525 } while (!__igt_timeout(end_time, NULL));
1526 intel_engine_pm_put(engine);
1527
1528 pr_info("%s: %lu requests\n", engine->name, count);
1529 thread->result = err;
1530 }
1531
wake_all(struct drm_i915_private * i915)1532 static bool wake_all(struct drm_i915_private *i915)
1533 {
1534 if (atomic_dec_and_test(&i915->selftest.counter)) {
1535 wake_up_var(&i915->selftest.counter);
1536 return true;
1537 }
1538
1539 return false;
1540 }
1541
wait_for_all(struct drm_i915_private * i915)1542 static int wait_for_all(struct drm_i915_private *i915)
1543 {
1544 if (wake_all(i915))
1545 return 0;
1546
1547 if (wait_var_event_timeout(&i915->selftest.counter,
1548 !atomic_read(&i915->selftest.counter),
1549 i915_selftest.timeout_jiffies))
1550 return 0;
1551
1552 return -ETIME;
1553 }
1554
static void __live_parallel_spin(struct kthread_work *work)
{
	struct parallel_thread *thread =
		container_of(work, typeof(*thread), work);
	struct intel_engine_cs *engine = thread->engine;
	struct igt_spinner spin;
	struct i915_request *rq;
	int err = 0;

	/*
	 * Create a spinner running for eternity on each engine. If a second
	 * spinner is incorrectly placed on the same engine, it will not be
	 * able to start in time.
	 */

	if (igt_spinner_init(&spin, engine->gt)) {
		/* Keep the wait_for_all() barrier balanced on early exit */
		wake_all(engine->i915);
		thread->result = -ENOMEM;
		return;
	}

	intel_engine_pm_get(engine);
	rq = igt_spinner_create_request(&spin,
					engine->kernel_context,
					MI_NOOP); /* no preemption */
	intel_engine_pm_put(engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		if (err == -ENODEV)
			err = 0; /* spinner unsupported here: treat as skip */
		wake_all(engine->i915);
		goto out_spin;
	}

	i915_request_get(rq);
	i915_request_add(rq);
	if (igt_wait_for_spinner(&spin, rq)) {
		/* Occupy this engine for the whole test */
		err = wait_for_all(engine->i915);
	} else {
		pr_err("Failed to start spinner on %s\n", engine->name);
		err = -EINVAL;
	}
	igt_spinner_end(&spin);

	/* The spinner must retire promptly once ended */
	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
		err = -EIO;
	i915_request_put(rq);

out_spin:
	igt_spinner_fini(&spin);
	thread->result = err;
}
1608
static int live_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static void (* const func[])(struct kthread_work *) = {
		__live_parallel_engine1,
		__live_parallel_engineN,
		__live_parallel_spin,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct parallel_thread *threads;
	struct intel_engine_cs *engine;
	void (* const *fn)(struct kthread_work *);
	int err = 0;

	/*
	 * Check we can submit requests to all engines concurrently. This
	 * tests that we load up the system maximally.
	 */

	threads = kzalloc_objs(*threads, nengines);
	if (!threads)
		return -ENOMEM;

	/* Run each workload variant across all engines in parallel */
	for (fn = func; !err && *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		/* Barrier count used by __live_parallel_spin/wait_for_all() */
		atomic_set(&i915->selftest.counter, nengines);

		/* Spawn one worker kthread per engine */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct kthread_worker *worker;

			worker = kthread_run_worker(0, "igt/parallel:%s",
						    engine->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				break;
			}

			threads[idx].worker = worker;
			threads[idx].result = 0;
			threads[idx].engine = engine;

			kthread_init_work(&threads[idx].work, *fn);
			kthread_queue_work(worker, &threads[idx].work);
			idx++;
		}

		/* Reap workers, keeping the first reported error */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (!threads[idx].worker)
				break;

			kthread_flush_work(&threads[idx].work);
			status = READ_ONCE(threads[idx].result);
			if (status && !err)
				err = status;

			kthread_destroy_worker(threads[idx++].worker);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
	}

	kfree(threads);
	return err;
}
1687
/*
 * Estimate how many requests can be queued on @engine for @ctx without
 * risking ringbuffer exhaustion while execution is blocked. Returns the
 * estimate (>= 0) or a negative errno.
 */
static int
max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int ret;

	/*
	 * Before execlists, all contexts share the same ringbuffer. With
	 * execlists, each context/engine has a separate ringbuffer and
	 * for the purposes of this test, inexhaustible.
	 *
	 * For the global ringbuffer though, we have to be very careful
	 * that we do not wrap while preventing the execution of requests
	 * with a unsignaled fence.
	 */
	if (HAS_EXECLISTS(ctx->i915))
		return INT_MAX;

	rq = igt_request_alloc(ctx, engine);
	if (IS_ERR(rq)) {
		ret = PTR_ERR(rq);
	} else {
		int sz;

		/* Usable ring space, excluding the reserved emergency area */
		ret = rq->ring->size - rq->reserved_space;
		i915_request_add(rq);

		/* Size of one minimal request as actually emitted */
		sz = rq->ring->emit - rq->head;
		if (sz < 0)
			sz += rq->ring->size; /* emit wrapped around the ring */
		ret /= sz;
		ret /= 2; /* leave half spare, in case of emergency! */
	}

	return ret;
}
1724
static int live_breadcrumbs_smoketest(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const unsigned int nengines = num_uabi_engines(i915);
	const unsigned int ncpus = /* saturate with nengines * ncpus */
		max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
	unsigned long num_waits, num_fences;
	struct intel_engine_cs *engine;
	struct smoke_thread *threads;
	struct igt_live_test live;
	intel_wakeref_t wakeref;
	struct smoketest *smoke;
	unsigned int n, idx;
	struct file *file;
	int ret = 0;

	/*
	 * Smoketest our breadcrumb/signal handling for requests across multiple
	 * threads. A very simple test to only catch the most egregious of bugs.
	 * See __igt_breadcrumbs_smoketest();
	 *
	 * On real hardware this time.
	 */

	wakeref = intel_runtime_pm_get(&i915->runtime_pm);

	file = mock_file(i915);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto out_rpm;
	}

	smoke = kzalloc_objs(*smoke, nengines);
	if (!smoke) {
		ret = -ENOMEM;
		goto out_file;
	}

	threads = kzalloc_objs(*threads, ncpus * nengines);
	if (!threads) {
		ret = -ENOMEM;
		goto out_smoke;
	}

	/* smoke[0] is the template; contexts are shared by all engines */
	smoke[0].request_alloc = __live_request_alloc;
	smoke[0].ncontexts = 64;
	smoke[0].contexts = kzalloc_objs(*smoke[0].contexts, smoke[0].ncontexts);
	if (!smoke[0].contexts) {
		ret = -ENOMEM;
		goto out_threads;
	}

	for (n = 0; n < smoke[0].ncontexts; n++) {
		smoke[0].contexts[n] = live_context(i915, file);
		if (IS_ERR(smoke[0].contexts[n])) {
			ret = PTR_ERR(smoke[0].contexts[n]);
			goto out_contexts;
		}
	}

	ret = igt_live_test_begin(&live, i915, __func__, "");
	if (ret)
		goto out_contexts;

	/* Spawn ncpus workers per engine, each hammering breadcrumbs */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		smoke[idx] = smoke[0]; /* copy the shared template */
		smoke[idx].engine = engine;
		smoke[idx].max_batch =
			max_batches(smoke[0].contexts[0], engine);
		if (smoke[idx].max_batch < 0) {
			ret = smoke[idx].max_batch;
			goto out_flush;
		}
		/* One ring interleaved between requests from all cpus */
		smoke[idx].max_batch /= ncpus + 1;
		pr_debug("Limiting batches to %d requests on %s\n",
			 smoke[idx].max_batch, engine->name);

		for (n = 0; n < ncpus; n++) {
			unsigned int i = idx * ncpus + n;
			struct kthread_worker *worker;

			worker = kthread_run_worker(0, "igt/%d.%d", idx, n);
			if (IS_ERR(worker)) {
				ret = PTR_ERR(worker);
				goto out_flush;
			}

			threads[i].worker = worker;
			threads[i].t = &smoke[idx];

			kthread_init_work(&threads[i].work,
					  __igt_breadcrumbs_smoketest);
			kthread_queue_work(worker, &threads[i].work);
		}

		idx++;
	}

	/* Let the smoketest run for the selftest timeout */
	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));

out_flush:
	/* Stop and reap all workers, accumulating statistics */
	idx = 0;
	num_waits = 0;
	num_fences = 0;
	for_each_uabi_engine(engine, i915) {
		for (n = 0; n < ncpus; n++) {
			unsigned int i = idx * ncpus + n;
			int err;

			if (!threads[i].worker)
				continue;

			WRITE_ONCE(threads[i].stop, true);
			kthread_flush_work(&threads[i].work);
			err = READ_ONCE(threads[i].result);
			if (err < 0 && !ret)
				ret = err;

			kthread_destroy_worker(threads[i].worker);
		}

		num_waits += atomic_long_read(&smoke[idx].num_waits);
		num_fences += atomic_long_read(&smoke[idx].num_fences);
		idx++;
	}
	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
		num_waits, num_fences, idx, ncpus);

	ret = igt_live_test_end(&live) ?: ret;
out_contexts:
	kfree(smoke[0].contexts);
out_threads:
	kfree(threads);
out_smoke:
	kfree(smoke);
out_file:
	fput(file);
out_rpm:
	intel_runtime_pm_put(&i915->runtime_pm, wakeref);

	return ret;
}
1869
/*
 * Entry point for the live (real hardware) i915_request selftests.
 * Skips the whole suite if the GT is already wedged, as no request
 * could execute.
 */
int i915_request_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_nop_request),
		SUBTEST(live_all_engines),
		SUBTEST(live_sequential_engines),
		SUBTEST(live_parallel_engines),
		SUBTEST(live_empty_request),
		SUBTEST(live_cancel_request),
		SUBTEST(live_breadcrumbs_smoketest),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_live_subtests(tests, i915);
}
1887
/*
 * Submit a kernel-context request ordered after the last request on
 * @ce's timeline and wait for it, then flush submission until the
 * engine idles. @err is chained through so an earlier failure from the
 * caller is preserved; -ETIME is only reported when @err was zero.
 */
static int switch_to_kernel_sync(struct intel_context *ce, int err)
{
	struct i915_request *rq;
	struct dma_fence *fence;

	rq = intel_engine_create_kernel_request(ce->engine);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* Serialise after whatever last ran on @ce's timeline */
	fence = i915_active_fence_get(&ce->timeline->last_request);
	if (fence) {
		i915_request_await_dma_fence(rq, fence);
		dma_fence_put(fence);
	}

	rq = i915_request_get(rq);
	i915_request_add(rq);
	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
		err = -ETIME;
	i915_request_put(rq);

	/* Busy-flush until the engine reports idle (or an error is pending) */
	while (!err && !intel_engine_is_idle(ce->engine))
		intel_engine_flush_submission(ce->engine);

	return err;
}
1914
/* Per-engine accumulator for the perf measurement selftests. */
struct perf_stats {
	struct intel_engine_cs *engine;	/* engine the samples belong to */
	unsigned long count;		/* number of samples/iterations taken */
	ktime_t time;			/* wall time span of the measurement */
	ktime_t busy;			/* busy time (presumably from engine busyness) */
	u64 runtime;			/* accumulated runtime counter value */
};
1922
/* A pinned context per engine, used to run a series of perf measurements. */
struct perf_series {
	struct drm_i915_private *i915;	/* owning device */
	unsigned int nengines;		/* number of entries in ce[] */
	struct intel_context *ce[] __counted_by(nengines); /* one context per engine */
};
1928
cmp_u32(const void * A,const void * B)1929 static int cmp_u32(const void *A, const void *B)
1930 {
1931 const u32 *a = A, *b = B;
1932
1933 return *a - *b;
1934 }
1935
/*
 * Filter 5 samples into one robust value: sort, drop the minimum and
 * maximum, and sum the middle three with the median weighted x2
 * (a[1] + 2*a[2] + a[3]). The result is therefore 4x (1 << TF_BIAS)
 * the filtered average; consumers shift by TF_BIAS to undo the scale.
 */
static u32 trifilter(u32 *a)
{
	u64 sum;

#define TF_COUNT 5
	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);

	sum = mul_u32_u32(a[2], 2);
	sum += a[1];
	sum += a[3];

	GEM_BUG_ON(sum > U32_MAX);
	return sum;
#define TF_BIAS 2
}
1951
cycles_to_ns(struct intel_engine_cs * engine,u32 cycles)1952 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1953 {
1954 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1955
1956 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1957 }
1958
/*
 * Emit 4 dwords that store the engine's RING_TIMESTAMP register into the
 * GGTT address @offset. Returns the advanced ringbuffer cursor.
 */
static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
{
	cs[0] = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	cs[1] = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
	cs[2] = offset;
	cs[3] = 0;

	return cs + 4;
}
1968
/*
 * Emit 4 dwords that store the immediate @value into the GGTT address
 * @offset. Returns the advanced ringbuffer cursor.
 */
static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
{
	cs[0] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	cs[1] = offset;
	cs[2] = 0;
	cs[3] = value;

	return cs + 4;
}
1978
/*
 * Emit 4 dwords that make the engine poll the semaphore at GGTT address
 * @offset against @value using comparison @mode. Returns the advanced
 * ringbuffer cursor.
 */
static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
{
	const u32 cmd = MI_SEMAPHORE_WAIT |
			MI_SEMAPHORE_GLOBAL_GTT |
			MI_SEMAPHORE_POLL |
			mode;

	cs[0] = cmd;
	cs[1] = value;
	cs[2] = offset;
	cs[3] = 0;

	return cs + 4;
}
1991
/* Emit a semaphore wait that completes once *offset == value. */
static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
{
	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
}
1996
/* Publish a new semaphore value for the GPU poller to observe. */
static void semaphore_set(u32 *sema, u32 value)
{
	WRITE_ONCE(*sema, value);
	wmb(); /* flush the update to the cache, and beyond */
}
2002
/*
 * Return a zeroed 21-dword scratch area inside the engine's hardware
 * status page (at offset +1000 from the mapped base — chosen to avoid
 * the dwords used by the driver itself; TODO confirm against HWSP layout).
 */
static u32 *hwsp_scratch(const struct intel_context *ce)
{
	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
}
2007
/* Translate a CPU pointer into the status page into its GGTT offset. */
static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
{
	u32 base = i915_ggtt_offset(ce->engine->status_page.vma);

	return base + offset_in_page(dw);
}
2013
static int measure_semaphore_response(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	struct i915_request *rq;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how many cycles it takes for the HW to detect the change
	 * in a semaphore value.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    poke semaphore
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Semaphore latency: B - A
	 */

	/* Park the GPU on a semaphore value we will never use as a trigger */
	semaphore_set(sema, -1);

	rq = i915_request_create(ce);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		err = PTR_ERR(cs);
		goto err;
	}

	/*
	 * One wait/timestamp/reset triple per sample: the GPU waits for
	 * sema[0] == i, stores its timestamp into sema[i], then clears
	 * sema[0] to signal the CPU it is ready for the next poke.
	 */
	cs = emit_store_dw(cs, offset, 0);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
		cs = emit_store_dw(cs, offset, 0);
	}

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	/* Wait for the batch to reach its first semaphore wait */
	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
		err = -EIO;
		goto err;
	}

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		/* Timestamp-read and poke must not be interrupted */
		preempt_disable();
		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		semaphore_set(sema, i);
		preempt_enable();

		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
			err = -EIO;
			goto err;
		}

		elapsed[i - 1] = sema[i] - cycles;
	}

	cycles = trifilter(elapsed);
	pr_info("%s: semaphore response %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	/* Hangs here leave an unsatisfiable wait on the ring: wedge */
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2088
static int measure_idle_dispatch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for us to submit a request while the
	 * engine is idle, but is resting in our context.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    submit request
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Submission latency: B - A
	 */

	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;

		/* Ensure the engine is idle before each sample */
		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
		if (err)
			return err;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/* The GPU writes its timestamp into sema[i] on execution */
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		/* Read A and submit atomically w.r.t. preemption/softirq */
		preempt_disable();
		local_bh_disable();
		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
		local_bh_enable();
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/* Convert (A, B) pairs into B - A deltas */
	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
		elapsed[i] = sema[i] - elapsed[i];

	cycles = trifilter(elapsed);
	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2159
static int measure_busy_dispatch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for us to submit a request while the
	 * engine is busy, polling on a semaphore in our context. With
	 * direct submission, this will include the cost of a lite restore.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    submit request
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Submission latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/*
		 * Each request: mark its slot busy (-1), spin until the CPU
		 * releases it (sema[0] == i), then record the GPU timestamp
		 * over the -1 marker.
		 */
		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		/* Wait until the previous request has its timestamp */
		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
			err = -EIO;
			goto err;
		}

		/* Read A, submit, and release the previous poller atomically */
		preempt_disable();
		local_bh_disable();
		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
		local_bh_enable();
		semaphore_set(sema, i - 1);
		preempt_enable();
	}

	/* Release the final poller */
	wait_for(READ_ONCE(sema[i - 1]), 500);
	semaphore_set(sema, i - 1);

	for (i = 1; i <= TF_COUNT; i++) {
		GEM_BUG_ON(sema[i] == -1);
		elapsed[i - 1] = sema[i] - elapsed[i];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2236
/*
 * Block @engine behind a kernel-context request that polls the status
 * page semaphore @sema until the (@mode, @value) condition is met —
 * used to hold back subsequent submissions until the CPU releases the
 * semaphore. Returns 0 on success or a negative errno.
 */
static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
{
	const u32 offset =
		i915_ggtt_offset(engine->status_page.vma) +
		offset_in_page(sema);
	struct i915_request *rq;
	u32 *cs;

	rq = i915_request_create(engine->kernel_context);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		return PTR_ERR(cs);
	}

	cs = emit_semaphore_poll(cs, mode, value, offset);

	intel_ring_advance(rq, cs);
	i915_request_add(rq);

	return 0;
}
2262
static int measure_inter_request(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT + 1], cycles;
	struct i915_sw_fence *submit;
	int i, err;

	/*
	 * Measure how long it takes to advance from one request into the
	 * next. Between each request we flush the GPU caches to memory,
	 * update the breadcrumbs, and then invalidate those caches.
	 * We queue up all the requests to be submitted in one batch so
	 * it should be one set of contiguous measurements.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    advance request
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Request latency: B - A
	 */

	/* Hold the engine behind a semaphore until everything is queued */
	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	/* All requests gate on this fence so they submit back-to-back */
	submit = heap_fence_create(GFP_KERNEL);
	if (!submit) {
		semaphore_set(sema, 1); /* unplug before bailing */
		return -ENOMEM;
	}

	intel_engine_flush_submission(ce->engine);
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct i915_request *rq;
		u32 *cs;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_submit;
		}

		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
						       submit,
						       GFP_KERNEL);
		if (err < 0) {
			i915_request_add(rq);
			goto err_submit;
		}

		cs = intel_ring_begin(rq, 4);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err_submit;
		}

		/* Each request records its start timestamp into sema[i] */
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);
	}
	/* Release all queued requests at once, then unplug the engine */
	i915_sw_fence_commit(submit);
	intel_engine_flush_submission(ce->engine);
	heap_fence_put(submit);

	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/* Delta between consecutive request timestamps */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[i + 1] - sema[i];

	cycles = trifilter(elapsed);
	pr_info("%s: inter-request latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_submit:
	i915_sw_fence_commit(submit);
	heap_fence_put(submit);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2353
static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    switch context
	 * B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	/* Hold the engine behind a semaphore until everything is queued */
	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	/*
	 * Queue alternating (ce, kernel_context) request pairs, each
	 * chained to the previous request and storing a timestamp, so
	 * every sample spans exactly one context switch.
	 */
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			/* Keep only the newest request as the chain tail */
			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	/* Unplug the engine and let the whole chain run */
	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/* Delta across each ce -> kernel_context switch */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2447
/*
 * Measure preemption latencies using a spinning request in the target
 * context and a maximum-priority (I915_PRIORITY_BARRIER) preempting
 * request from the kernel context. Returns 0 on success (or if the
 * engine cannot preempt); on failure the GT is wedged and a negative
 * errno is returned.
 */
static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how it takes for us to return from the
	 * preemption back to the original context.
	 *
	 * A: read CS_TIMESTAMP from CPU
	 *    submit preemption
	 * B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    context switch
	 * C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/* Mark ourselves running (-1) and spin until released ... */
		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		/* ... then C: timestamp taken after resuming from preemption. */
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		/* Wait until the spinner is executing on the GPU. */
		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/* B: timestamp in the preempting context, then release the spinner. */
		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		/* A: CPU-side engine timestamp just before submitting the preemption. */
		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	/* Wait for the last pass: B overwrites the -1 running marker. */
	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	/* Dispatch latency: B (sema[2i]) - A (CPU sample). */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	/* Switch latency: C (sema[2i+1]) - B (sema[2i]). */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2555
/* Fence-callback token used by measure_completion() to spot the signal. */
struct signal_cb {
	struct dma_fence_cb base;
	bool seen;	/* set by signal_cb() once the fence has signalled */
};
2560
signal_cb(struct dma_fence * fence,struct dma_fence_cb * cb)2561 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2562 {
2563 struct signal_cb *s = container_of(cb, typeof(*s), base);
2564
2565 smp_store_mb(s->seen, true); /* be safe, be strong */
2566 }
2567
/*
 * Measure the latency from the GPU completing a request to the CPU
 * observing the fence signal. On failure the GT is wedged and a
 * negative errno is returned.
 */
static int measure_completion(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the signal (interrupt) to be
	 * sent from the GPU to be processed by the CPU.
	 *
	 * A: read CS_TIMESTAMP on GPU
	 *    signal
	 * B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/* Mark running (-1), spin until released, then store A in sema[i]. */
		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		/* Ensure the spinner is actually executing on the GPU. */
		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		/*
		 * With preemption disabled, release the spinner, busy-wait
		 * for the fence callback, then sample the engine clock from
		 * the CPU (B) as close to the signal as possible.
		 */
		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/* B - A per pass; sema[i+1] holds the GPU timestamp A. */
	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2649
rps_pin(struct intel_gt * gt)2650 static void rps_pin(struct intel_gt *gt)
2651 {
2652 /* Pin the frequency to max */
2653 atomic_inc(>->rps.num_waiters);
2654 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2655
2656 mutex_lock(>->rps.lock);
2657 intel_rps_set(>->rps, gt->rps.max_freq);
2658 mutex_unlock(>->rps.lock);
2659 }
2660
rps_unpin(struct intel_gt * gt)2661 static void rps_unpin(struct intel_gt *gt)
2662 {
2663 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2664 atomic_dec(>->rps.num_waiters);
2665 }
2666
/*
 * Top-level perf subtest: run the various latency measurements on a
 * pinned context for every uabi engine, with CPU cstates disabled and
 * the GPU frequency pinned to keep the numbers stable.
 */
static int perf_request_latency(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	int err = 0;

	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
		return 0;

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		/* Quiesce the engine and pin max frequency for the run. */
		st_engine_heartbeat_disable(engine);
		rps_pin(engine->gt);

		/* Each measurement runs only if the previous ones passed. */
		if (err == 0)
			err = measure_semaphore_response(ce);
		if (err == 0)
			err = measure_idle_dispatch(ce);
		if (err == 0)
			err = measure_busy_dispatch(ce);
		if (err == 0)
			err = measure_inter_request(ce);
		if (err == 0)
			err = measure_context_switch(ce);
		if (err == 0)
			err = measure_preemption(ce);
		if (err == 0)
			err = measure_completion(ce);

		rps_unpin(engine->gt);
		st_engine_heartbeat_enable(engine);

		intel_context_unpin(ce);
		intel_context_put(ce);
		if (err)
			goto out;
	}

out:
	if (igt_flush_test(i915))
		err = -EIO;

	cpu_latency_qos_remove_request(&qos);
	return err;
}
2728
/*
 * Series workload: fully synchronous. Round-robin over the prepared
 * contexts, waiting for each request to complete before creating the
 * next, until the selftest timeout expires.
 */
static int s_sync0(void *arg)
{
	struct perf_series *ps = arg;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		/* Wait for this request before issuing the next. */
		if (i915_request_wait(rq, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));

	return err;
}
2761
/*
 * Series workload: pipelined by one. Always keep one request in
 * flight — wait on the previous request while the current one runs,
 * round-robin across the prepared contexts.
 */
static int s_sync1(void *arg)
{
	struct perf_series *ps = arg;
	struct i915_request *prev = NULL;
	IGT_TIMEOUT(end_time);
	unsigned int idx = 0;
	int err = 0;

	GEM_BUG_ON(!ps->nengines);
	do {
		struct i915_request *rq;

		rq = i915_request_create(ps->ce[idx]);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		/* Wait on the previous request, keeping this one in flight. */
		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		if (++idx == ps->nengines)
			idx = 0;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);	/* drop the reference to the final request */

	return err;
}
2797
s_many(void * arg)2798 static int s_many(void *arg)
2799 {
2800 struct perf_series *ps = arg;
2801 IGT_TIMEOUT(end_time);
2802 unsigned int idx = 0;
2803
2804 GEM_BUG_ON(!ps->nengines);
2805 do {
2806 struct i915_request *rq;
2807
2808 rq = i915_request_create(ps->ce[idx]);
2809 if (IS_ERR(rq))
2810 return PTR_ERR(rq);
2811
2812 i915_request_add(rq);
2813
2814 if (++idx == ps->nengines)
2815 idx = 0;
2816 } while (!__igt_timeout(end_time, NULL));
2817
2818 return 0;
2819 }
2820
/*
 * Run each series workload (s_sync0, s_sync1, s_many) over pinned
 * contexts on all uabi engines, sampling busy-time/walltime/runtime
 * around each run and reporting per-engine utilisation.
 */
static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kzalloc_objs(*stats, nengines);
	if (!stats)
		return -ENOMEM;

	ps = kzalloc_flex(*ps, ce, nengines);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	/* Create and pin a fresh context on every uabi engine. */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		/* Snapshot per-engine busy/wall/runtime before the run. */
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			/* +1 so a zero busy-time still reads as "sampled". */
			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			/* Negate now; adding the end sample yields the delta. */
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		/* Convert the snapshots into deltas and report them. */
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			/* busy:time as a percentage with two decimal places */
			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	/* Unwind only the contexts that were successfully created. */
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}
2954
/* Bookkeeping for one per-engine parallel measurement worker. */
struct p_thread {
	struct perf_stats p;		/* results filled in by the work fn */
	struct kthread_worker *worker;
	struct kthread_work work;
	struct intel_engine_cs *engine;
	int result;			/* 0 or -errno from the work fn */
};
2962
/*
 * Parallel workload: fully synchronous. On its own context, create,
 * submit and wait for each request in turn, counting completions until
 * the timeout. Results are reported via the containing struct p_thread.
 */
static void p_sync0(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	/* Sample busy-time if the backend reports it, else walltime only. */
	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		if (i915_request_wait(rq, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(rq);
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));

	/* Convert the start samples into elapsed busy/wall times. */
	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}
3036
/*
 * Parallel workload: pipelined by one. Keep one request in flight —
 * wait on the previous request while the current one runs — counting
 * completions until the timeout. Results via struct p_thread.
 */
static void p_sync1(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct i915_request *prev = NULL;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	bool busy;
	int err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	/* Sample busy-time if the backend reports it, else walltime only. */
	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_get(rq);
		i915_request_add(rq);

		err = 0;
		/* Wait on the previous request, keeping this one in flight. */
		if (prev && i915_request_wait(prev, 0, HZ) < 0)
			err = -ETIME;
		i915_request_put(prev);
		prev = rq;
		if (err)
			break;

		count++;
	} while (!__igt_timeout(end_time, NULL));
	i915_request_put(prev);	/* drop the reference to the final request */

	/* Convert the start samples into elapsed busy/wall times. */
	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}
3113
/*
 * Parallel workload: unthrottled. Queue requests on this engine's
 * context as fast as possible without waiting, counting submissions
 * until the timeout. Results via struct p_thread.
 */
static void p_many(struct kthread_work *work)
{
	struct p_thread *thread = container_of(work, typeof(*thread), work);
	struct perf_stats *p = &thread->p;
	struct intel_engine_cs *engine = p->engine;
	struct intel_context *ce;
	IGT_TIMEOUT(end_time);
	unsigned long count;
	int err = 0;
	bool busy;

	ce = intel_context_create(engine);
	if (IS_ERR(ce)) {
		thread->result = PTR_ERR(ce);
		return;
	}

	err = intel_context_pin(ce);
	if (err) {
		intel_context_put(ce);
		thread->result = err;
		return;
	}

	/* Sample busy-time if the backend reports it, else walltime only. */
	if (intel_engine_supports_stats(engine)) {
		p->busy = intel_engine_get_busy_time(engine, &p->time);
		busy = true;
	} else {
		p->time = ktime_get();
		busy = false;
	}

	count = 0;
	do {
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);
		count++;
	} while (!__igt_timeout(end_time, NULL));

	/* Convert the start samples into elapsed busy/wall times. */
	if (busy) {
		ktime_t now;

		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
				    p->busy);
		p->time = ktime_sub(now, p->time);
	} else {
		p->time = ktime_sub(ktime_get(), p->time);
	}

	err = switch_to_kernel_sync(ce, err);
	p->runtime = intel_context_get_total_runtime_ns(ce);
	p->count = count;

	intel_context_unpin(ce);
	intel_context_put(ce);
	thread->result = err;
}
3178
/*
 * Run each parallel workload (p_sync0, p_sync1, p_many) concurrently
 * on all uabi engines, one kthread worker per engine, then report the
 * per-engine throughput and utilisation.
 */
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static void (* const func[])(struct kthread_work *) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	void (* const *fn)(struct kthread_work *);
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	struct p_thread *engines;
	int err = 0;

	engines = kzalloc_objs(*engines, nengines);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		atomic_set(&i915->selftest.counter, nengines);

		/* Spawn one worker per engine, all running *fn concurrently. */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct kthread_worker *worker;

			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));

			worker = kthread_run_worker(0, "igt:%s",
						    engine->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				/* Balance the pm_get for this failed engine. */
				intel_engine_pm_put(engine);
				break;
			}
			engines[idx].worker = worker;
			engines[idx].result = 0;
			engines[idx].p.engine = engine;
			engines[idx].engine = engine;

			kthread_init_work(&engines[idx].work, *fn);
			kthread_queue_work(worker, &engines[idx].work);
			idx++;
		}

		/* Reap the workers; stop at the first engine with no worker. */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (!engines[idx].worker)
				break;

			kthread_flush_work(&engines[idx].work);
			status = READ_ONCE(engines[idx].result);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);

			kthread_destroy_worker(engines[idx].worker);
			idx++;
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		/* Report per-engine count and busy percentage. */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}
3290
/*
 * Entry point for the request perf selftests: skip when the GT is
 * already wedged, otherwise run each perf subtest in sequence.
 */
int i915_request_perf_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(perf_request_latency),
		SUBTEST(perf_series_engines),
		SUBTEST(perf_parallel_engines),
	};

	if (intel_gt_is_wedged(to_gt(i915)))
		return 0;

	return i915_subtests(tests, i915);
}
3304