xref: /linux/drivers/gpu/drm/i915/selftests/i915_request.c (revision 16018c0d27eda6a7f69dafa750d23770fb46b00f)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28 
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
31 
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_clock_utils.h"
37 #include "gt/intel_gt_requests.h"
38 #include "gt/selftest_engine_heartbeat.h"
39 
40 #include "i915_random.h"
41 #include "i915_selftest.h"
42 #include "igt_flush_test.h"
43 #include "igt_live_test.h"
44 #include "igt_spinner.h"
45 #include "lib_sw_fence.h"
46 
47 #include "mock_drm.h"
48 #include "mock_gem_device.h"
49 
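/*
 * Count the engines exposed to userspace; the live tests below use this to
 * size their per-engine request and thread arrays.
 */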
50 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 {
52 	struct intel_engine_cs *engine;
53 	unsigned int count;
54 
55 	count = 0;
56 	for_each_uabi_engine(engine, i915)
57 		count++;
58 
59 	return count;
60 }
61 
62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 {
64 	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
65 }
66 
67 static int igt_add_request(void *arg)
68 {
69 	struct drm_i915_private *i915 = arg;
70 	struct i915_request *request;
71 
72 	/* Basic preliminary test to create a request and let it loose! */
73 
74 	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
75 	if (!request)
76 		return -ENOMEM;
77 
78 	i915_request_add(request);
79 
80 	return 0;
81 }
82 
83 static int igt_wait_request(void *arg)
84 {
85 	const long T = HZ / 4;
86 	struct drm_i915_private *i915 = arg;
87 	struct i915_request *request;
88 	int err = -EINVAL;
89 
90 	/* Submit a request, then wait upon it */
91 
92 	request = mock_request(rcs0(i915)->kernel_context, T);
93 	if (!request)
94 		return -ENOMEM;
95 
96 	i915_request_get(request);
97 
98 	if (i915_request_wait(request, 0, 0) != -ETIME) {
99 		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
100 		goto out_request;
101 	}
102 
103 	if (i915_request_wait(request, 0, T) != -ETIME) {
104 		pr_err("request wait succeeded (expected timeout before submit!)\n");
105 		goto out_request;
106 	}
107 
108 	if (i915_request_completed(request)) {
109 		pr_err("request completed before submit!!\n");
110 		goto out_request;
111 	}
112 
113 	i915_request_add(request);
114 
115 	if (i915_request_wait(request, 0, 0) != -ETIME) {
116 		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
117 		goto out_request;
118 	}
119 
120 	if (i915_request_completed(request)) {
121 		pr_err("request completed immediately!\n");
122 		goto out_request;
123 	}
124 
125 	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126 		pr_err("request wait succeeded (expected timeout!)\n");
127 		goto out_request;
128 	}
129 
130 	if (i915_request_wait(request, 0, T) == -ETIME) {
131 		pr_err("request wait timed out!\n");
132 		goto out_request;
133 	}
134 
135 	if (!i915_request_completed(request)) {
136 		pr_err("request not complete after waiting!\n");
137 		goto out_request;
138 	}
139 
140 	if (i915_request_wait(request, 0, T) == -ETIME) {
141 		pr_err("request wait timed out when already complete!\n");
142 		goto out_request;
143 	}
144 
145 	err = 0;
146 out_request:
147 	i915_request_put(request);
148 	mock_device_flush(i915);
149 	return err;
150 }
151 
152 static int igt_fence_wait(void *arg)
153 {
154 	const long T = HZ / 4;
155 	struct drm_i915_private *i915 = arg;
156 	struct i915_request *request;
157 	int err = -EINVAL;
158 
159 	/* Submit a request, treat it as a fence and wait upon it */
160 
161 	request = mock_request(rcs0(i915)->kernel_context, T);
162 	if (!request)
163 		return -ENOMEM;
164 
165 	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166 		pr_err("fence wait success before submit (expected timeout)!\n");
167 		goto out;
168 	}
169 
170 	i915_request_add(request);
171 
172 	if (dma_fence_is_signaled(&request->fence)) {
173 		pr_err("fence signaled immediately!\n");
174 		goto out;
175 	}
176 
177 	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178 		pr_err("fence wait success after submit (expected timeout)!\n");
179 		goto out;
180 	}
181 
182 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183 		pr_err("fence wait timed out (expected success)!\n");
184 		goto out;
185 	}
186 
187 	if (!dma_fence_is_signaled(&request->fence)) {
188 		pr_err("fence unsignaled after waiting!\n");
189 		goto out;
190 	}
191 
192 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193 		pr_err("fence wait timed out when complete (expected success)!\n");
194 		goto out;
195 	}
196 
197 	err = 0;
198 out:
199 	mock_device_flush(i915);
200 	return err;
201 }
202 
203 static int igt_request_rewind(void *arg)
204 {
205 	struct drm_i915_private *i915 = arg;
206 	struct i915_request *request, *vip;
207 	struct i915_gem_context *ctx[2];
208 	struct intel_context *ce;
209 	int err = -EINVAL;
210 
211 	ctx[0] = mock_context(i915, "A");
212 	if (!ctx[0]) {
213 		err = -ENOMEM;
214 		goto err_ctx_0;
215 	}
216 
217 	ce = i915_gem_context_get_engine(ctx[0], RCS0);
218 	GEM_BUG_ON(IS_ERR(ce));
219 	request = mock_request(ce, 2 * HZ);
220 	intel_context_put(ce);
221 	if (!request) {
222 		err = -ENOMEM;
223 		goto err_context_0;
224 	}
225 
226 	i915_request_get(request);
227 	i915_request_add(request);
228 
229 	ctx[1] = mock_context(i915, "B");
230 	if (!ctx[1]) {
231 		err = -ENOMEM;
232 		goto err_ctx_1;
233 	}
234 
235 	ce = i915_gem_context_get_engine(ctx[1], RCS0);
236 	GEM_BUG_ON(IS_ERR(ce));
237 	vip = mock_request(ce, 0);
238 	intel_context_put(ce);
239 	if (!vip) {
240 		err = -ENOMEM;
241 		goto err_context_1;
242 	}
243 
244 	/* Simulate preemption by manual reordering */
245 	if (!mock_cancel_request(request)) {
246 		pr_err("failed to cancel request (already executed)!\n");
247 		i915_request_add(vip);
248 		goto err_context_1;
249 	}
250 	i915_request_get(vip);
251 	i915_request_add(vip);
252 	rcu_read_lock();
253 	request->engine->submit_request(request);
254 	rcu_read_unlock();
255 
257 	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
258 		pr_err("timed out waiting for high priority request\n");
259 		goto err;
260 	}
261 
262 	if (i915_request_completed(request)) {
263 		pr_err("low priority request already completed\n");
264 		goto err;
265 	}
266 
267 	err = 0;
268 err:
269 	i915_request_put(vip);
270 err_context_1:
271 	mock_context_close(ctx[1]);
272 err_ctx_1:
273 	i915_request_put(request);
274 err_context_0:
275 	mock_context_close(ctx[0]);
276 err_ctx_0:
277 	mock_device_flush(i915);
278 	return err;
279 }
280 
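/*
 * Shared state for the breadcrumb smoketests: each worker thread repeatedly
 * picks random contexts, allocates a randomly sized batch of requests (up to
 * max_batch) through request_alloc(), waits for them all to signal, and
 * accumulates its totals into num_waits/num_fences.
 */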
281 struct smoketest {
282 	struct intel_engine_cs *engine;
283 	struct i915_gem_context **contexts;
284 	atomic_long_t num_waits, num_fences;
285 	int ncontexts, max_batch;
286 	struct i915_request *(*request_alloc)(struct intel_context *ce);
287 };
288 
289 static struct i915_request *
290 __mock_request_alloc(struct intel_context *ce)
291 {
292 	return mock_request(ce, 0);
293 }
294 
295 static struct i915_request *
296 __live_request_alloc(struct intel_context *ce)
297 {
298 	return intel_context_create_request(ce);
299 }
300 
301 static int __igt_breadcrumbs_smoketest(void *arg)
302 {
303 	struct smoketest *t = arg;
304 	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
305 	const unsigned int total = 4 * t->ncontexts + 1;
306 	unsigned int num_waits = 0, num_fences = 0;
307 	struct i915_request **requests;
308 	I915_RND_STATE(prng);
309 	unsigned int *order;
310 	int err = 0;
311 
312 	/*
313 	 * A very simple test to catch the most egregious of list handling bugs.
314 	 *
315 	 * At its heart, we simply create oodles of requests running across
316 	 * multiple kthreads and enable signaling on them, for the sole purpose
317 	 * of stressing our breadcrumb handling. The only inspection we do is
318 	 * that the fences were marked as signaled.
319 	 */
320 
321 	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
322 	if (!requests)
323 		return -ENOMEM;
324 
325 	order = i915_random_order(total, &prng);
326 	if (!order) {
327 		err = -ENOMEM;
328 		goto out_requests;
329 	}
330 
331 	while (!kthread_should_stop()) {
332 		struct i915_sw_fence *submit, *wait;
333 		unsigned int n, count;
334 
335 		submit = heap_fence_create(GFP_KERNEL);
336 		if (!submit) {
337 			err = -ENOMEM;
338 			break;
339 		}
340 
341 		wait = heap_fence_create(GFP_KERNEL);
342 		if (!wait) {
343 			i915_sw_fence_commit(submit);
344 			heap_fence_put(submit);
345 			err = -ENOMEM;
346 			break;
347 		}
348 
349 		i915_random_reorder(order, total, &prng);
350 		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
351 
352 		for (n = 0; n < count; n++) {
353 			struct i915_gem_context *ctx =
354 				t->contexts[order[n] % t->ncontexts];
355 			struct i915_request *rq;
356 			struct intel_context *ce;
357 
358 			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
359 			GEM_BUG_ON(IS_ERR(ce));
360 			rq = t->request_alloc(ce);
361 			intel_context_put(ce);
362 			if (IS_ERR(rq)) {
363 				err = PTR_ERR(rq);
364 				count = n;
365 				break;
366 			}
367 
368 			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
369 							       submit,
370 							       GFP_KERNEL);
371 
372 			requests[n] = i915_request_get(rq);
373 			i915_request_add(rq);
374 
375 			if (err >= 0)
376 				err = i915_sw_fence_await_dma_fence(wait,
377 								    &rq->fence,
378 								    0,
379 								    GFP_KERNEL);
380 
381 			if (err < 0) {
382 				i915_request_put(rq);
383 				count = n;
384 				break;
385 			}
386 		}
387 
388 		i915_sw_fence_commit(submit);
389 		i915_sw_fence_commit(wait);
390 
391 		if (!wait_event_timeout(wait->wait,
392 					i915_sw_fence_done(wait),
393 					5 * HZ)) {
394 			struct i915_request *rq = requests[count - 1];
395 
396 			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
397 			       atomic_read(&wait->pending), count,
398 			       rq->fence.context, rq->fence.seqno,
399 			       t->engine->name);
400 			GEM_TRACE_DUMP();
401 
402 			intel_gt_set_wedged(t->engine->gt);
403 			GEM_BUG_ON(!i915_request_completed(rq));
404 			i915_sw_fence_wait(wait);
405 			err = -EIO;
406 		}
407 
408 		for (n = 0; n < count; n++) {
409 			struct i915_request *rq = requests[n];
410 
411 			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
412 				      &rq->fence.flags)) {
413 				pr_err("%llu:%llu was not signaled!\n",
414 				       rq->fence.context, rq->fence.seqno);
415 				err = -EINVAL;
416 			}
417 
418 			i915_request_put(rq);
419 		}
420 
421 		heap_fence_put(wait);
422 		heap_fence_put(submit);
423 
424 		if (err < 0)
425 			break;
426 
427 		num_fences += count;
428 		num_waits++;
429 
430 		cond_resched();
431 	}
432 
433 	atomic_long_add(num_fences, &t->num_fences);
434 	atomic_long_add(num_waits, &t->num_waits);
435 
436 	kfree(order);
437 out_requests:
438 	kfree(requests);
439 	return err;
440 }
441 
442 static int mock_breadcrumbs_smoketest(void *arg)
443 {
444 	struct drm_i915_private *i915 = arg;
445 	struct smoketest t = {
446 		.engine = rcs0(i915),
447 		.ncontexts = 1024,
448 		.max_batch = 1024,
449 		.request_alloc = __mock_request_alloc
450 	};
451 	unsigned int ncpus = num_online_cpus();
452 	struct task_struct **threads;
453 	unsigned int n;
454 	int ret = 0;
455 
456 	/*
457 	 * Smoketest our breadcrumb/signal handling for requests across multiple
458 	 * threads. A very simple test to only catch the most egregious of bugs.
459 	 * See __igt_breadcrumbs_smoketest();
460 	 */
461 
462 	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
463 	if (!threads)
464 		return -ENOMEM;
465 
466 	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
467 	if (!t.contexts) {
468 		ret = -ENOMEM;
469 		goto out_threads;
470 	}
471 
472 	for (n = 0; n < t.ncontexts; n++) {
473 		t.contexts[n] = mock_context(t.engine->i915, "mock");
474 		if (!t.contexts[n]) {
475 			ret = -ENOMEM;
476 			goto out_contexts;
477 		}
478 	}
479 
480 	for (n = 0; n < ncpus; n++) {
481 		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
482 					 &t, "igt/%d", n);
483 		if (IS_ERR(threads[n])) {
484 			ret = PTR_ERR(threads[n]);
485 			ncpus = n;
486 			break;
487 		}
488 
489 		get_task_struct(threads[n]);
490 	}
491 
492 	yield(); /* start all threads before we begin */
493 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
494 
495 	for (n = 0; n < ncpus; n++) {
496 		int err;
497 
498 		err = kthread_stop(threads[n]);
499 		if (err < 0 && !ret)
500 			ret = err;
501 
502 		put_task_struct(threads[n]);
503 	}
504 	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
505 		atomic_long_read(&t.num_waits),
506 		atomic_long_read(&t.num_fences),
507 		ncpus);
508 
509 out_contexts:
510 	for (n = 0; n < t.ncontexts; n++) {
511 		if (!t.contexts[n])
512 			break;
513 		mock_context_close(t.contexts[n]);
514 	}
515 	kfree(t.contexts);
516 out_threads:
517 	kfree(threads);
518 	return ret;
519 }
520 
521 int i915_request_mock_selftests(void)
522 {
523 	static const struct i915_subtest tests[] = {
524 		SUBTEST(igt_add_request),
525 		SUBTEST(igt_wait_request),
526 		SUBTEST(igt_fence_wait),
527 		SUBTEST(igt_request_rewind),
528 		SUBTEST(mock_breadcrumbs_smoketest),
529 	};
530 	struct drm_i915_private *i915;
531 	intel_wakeref_t wakeref;
532 	int err = 0;
533 
534 	i915 = mock_gem_device();
535 	if (!i915)
536 		return -ENOMEM;
537 
538 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
539 		err = i915_subtests(tests, i915);
540 
541 	mock_destroy_device(i915);
542 
543 	return err;
544 }
545 
546 static int live_nop_request(void *arg)
547 {
548 	struct drm_i915_private *i915 = arg;
549 	struct intel_engine_cs *engine;
550 	struct igt_live_test t;
551 	int err = -ENODEV;
552 
553 	/*
554 	 * Submit various-sized batches of empty requests to each engine
555 	 * (individually), and wait for each batch to complete. We can check
556 	 * the overhead of submitting requests to the hardware.
557 	 */
558 
559 	for_each_uabi_engine(engine, i915) {
560 		unsigned long n, prime;
561 		IGT_TIMEOUT(end_time);
562 		ktime_t times[2] = {};
563 
564 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
565 		if (err)
566 			return err;
567 
568 		intel_engine_pm_get(engine);
569 		for_each_prime_number_from(prime, 1, 8192) {
570 			struct i915_request *request = NULL;
571 
572 			times[1] = ktime_get_raw();
573 
574 			for (n = 0; n < prime; n++) {
575 				i915_request_put(request);
576 				request = i915_request_create(engine->kernel_context);
577 				if (IS_ERR(request))
578 					return PTR_ERR(request);
579 
580 				/*
581 				 * This space is left intentionally blank.
582 				 *
583 				 * We do not actually want to perform any
584 				 * action with this request, we just want
585 				 * to measure the latency in allocation
586 				 * and submission of our breadcrumbs -
587 				 * ensuring that the bare request is sufficient
588 				 * for the system to work (i.e. proper HEAD
589 				 * tracking of the rings, interrupt handling,
590 				 * etc). It also gives us the lowest bounds
591 				 * for latency.
592 				 */
593 
594 				i915_request_get(request);
595 				i915_request_add(request);
596 			}
597 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
598 			i915_request_put(request);
599 
600 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
601 			if (prime == 1)
602 				times[0] = times[1];
603 
604 			if (__igt_timeout(end_time, NULL))
605 				break;
606 		}
607 		intel_engine_pm_put(engine);
608 
609 		err = igt_live_test_end(&t);
610 		if (err)
611 			return err;
612 
613 		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
614 			engine->name,
615 			ktime_to_ns(times[0]),
616 			prime, div64_u64(ktime_to_ns(times[1]), prime));
617 	}
618 
619 	return err;
620 }
621 
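/*
 * The __cancel_*() helpers exercise i915_request_cancel() at three points in
 * a request's life: before it has been submitted (inactive), while it is
 * spinning on the GPU (active), and after it has already completed, where
 * cancellation must not mark the fence with an error.
 */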
622 static int __cancel_inactive(struct intel_engine_cs *engine)
623 {
624 	struct intel_context *ce;
625 	struct igt_spinner spin;
626 	struct i915_request *rq;
627 	int err = 0;
628 
629 	if (igt_spinner_init(&spin, engine->gt))
630 		return -ENOMEM;
631 
632 	ce = intel_context_create(engine);
633 	if (IS_ERR(ce)) {
634 		err = PTR_ERR(ce);
635 		goto out_spin;
636 	}
637 
638 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
639 	if (IS_ERR(rq)) {
640 		err = PTR_ERR(rq);
641 		goto out_ce;
642 	}
643 
644 	pr_debug("%s: Cancelling inactive request\n", engine->name);
645 	i915_request_cancel(rq, -EINTR);
646 	i915_request_get(rq);
647 	i915_request_add(rq);
648 
649 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
650 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
651 
652 		pr_err("%s: Failed to cancel inactive request\n", engine->name);
653 		intel_engine_dump(engine, &p, "%s\n", engine->name);
654 		err = -ETIME;
655 		goto out_rq;
656 	}
657 
658 	if (rq->fence.error != -EINTR) {
659 		pr_err("%s: fence not cancelled (%u)\n",
660 		       engine->name, rq->fence.error);
661 		err = -EINVAL;
662 	}
663 
664 out_rq:
665 	i915_request_put(rq);
666 out_ce:
667 	intel_context_put(ce);
668 out_spin:
669 	igt_spinner_fini(&spin);
670 	if (err)
671 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
672 	return err;
673 }
674 
675 static int __cancel_active(struct intel_engine_cs *engine)
676 {
677 	struct intel_context *ce;
678 	struct igt_spinner spin;
679 	struct i915_request *rq;
680 	int err = 0;
681 
682 	if (igt_spinner_init(&spin, engine->gt))
683 		return -ENOMEM;
684 
685 	ce = intel_context_create(engine);
686 	if (IS_ERR(ce)) {
687 		err = PTR_ERR(ce);
688 		goto out_spin;
689 	}
690 
691 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
692 	if (IS_ERR(rq)) {
693 		err = PTR_ERR(rq);
694 		goto out_ce;
695 	}
696 
697 	pr_debug("%s: Cancelling active request\n", engine->name);
698 	i915_request_get(rq);
699 	i915_request_add(rq);
700 	if (!igt_wait_for_spinner(&spin, rq)) {
701 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
702 
703 		pr_err("Failed to start spinner on %s\n", engine->name);
704 		intel_engine_dump(engine, &p, "%s\n", engine->name);
705 		err = -ETIME;
706 		goto out_rq;
707 	}
708 	i915_request_cancel(rq, -EINTR);
709 
710 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
711 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
712 
713 		pr_err("%s: Failed to cancel active request\n", engine->name);
714 		intel_engine_dump(engine, &p, "%s\n", engine->name);
715 		err = -ETIME;
716 		goto out_rq;
717 	}
718 
719 	if (rq->fence.error != -EINTR) {
720 		pr_err("%s: fence not cancelled (%u)\n",
721 		       engine->name, rq->fence.error);
722 		err = -EINVAL;
723 	}
724 
725 out_rq:
726 	i915_request_put(rq);
727 out_ce:
728 	intel_context_put(ce);
729 out_spin:
730 	igt_spinner_fini(&spin);
731 	if (err)
732 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
733 	return err;
734 }
735 
736 static int __cancel_completed(struct intel_engine_cs *engine)
737 {
738 	struct intel_context *ce;
739 	struct igt_spinner spin;
740 	struct i915_request *rq;
741 	int err = 0;
742 
743 	if (igt_spinner_init(&spin, engine->gt))
744 		return -ENOMEM;
745 
746 	ce = intel_context_create(engine);
747 	if (IS_ERR(ce)) {
748 		err = PTR_ERR(ce);
749 		goto out_spin;
750 	}
751 
752 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
753 	if (IS_ERR(rq)) {
754 		err = PTR_ERR(rq);
755 		goto out_ce;
756 	}
757 	igt_spinner_end(&spin);
758 	i915_request_get(rq);
759 	i915_request_add(rq);
760 
761 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
762 		err = -ETIME;
763 		goto out_rq;
764 	}
765 
766 	pr_debug("%s: Cancelling completed request\n", engine->name);
767 	i915_request_cancel(rq, -EINTR);
768 	if (rq->fence.error) {
769 		pr_err("%s: fence error set on completed request (%u)\n",
770 		       engine->name, rq->fence.error);
771 		err = -EINVAL;
772 	}
773 
774 out_rq:
775 	i915_request_put(rq);
776 out_ce:
777 	intel_context_put(ce);
778 out_spin:
779 	igt_spinner_fini(&spin);
780 	if (err)
781 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
782 	return err;
783 }
784 
785 static int live_cancel_request(void *arg)
786 {
787 	struct drm_i915_private *i915 = arg;
788 	struct intel_engine_cs *engine;
789 
790 	/*
791 	 * Check cancellation of requests. We expect to be able to immediately
792 	 * cancel active requests, even if they are currently on the GPU.
793 	 */
794 
795 	for_each_uabi_engine(engine, i915) {
796 		struct igt_live_test t;
797 		int err, err2;
798 
799 		if (!intel_engine_has_preemption(engine))
800 			continue;
801 
802 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
803 		if (err)
804 			return err;
805 
806 		err = __cancel_inactive(engine);
807 		if (err == 0)
808 			err = __cancel_active(engine);
809 		if (err == 0)
810 			err = __cancel_completed(engine);
811 
812 		err2 = igt_live_test_end(&t);
813 		if (err)
814 			return err;
815 		if (err2)
816 			return err2;
817 	}
818 
819 	return 0;
820 }
821 
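/*
 * Build a batch buffer containing just MI_BATCH_BUFFER_END, pinned into the
 * global GTT and dispatched with I915_DISPATCH_SECURE from each engine's
 * kernel context. live_empty_request() uses it to measure per-request
 * overhead with a minimal payload.
 */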
822 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
823 {
824 	struct drm_i915_gem_object *obj;
825 	struct i915_vma *vma;
826 	u32 *cmd;
827 	int err;
828 
829 	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
830 	if (IS_ERR(obj))
831 		return ERR_CAST(obj);
832 
833 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
834 	if (IS_ERR(cmd)) {
835 		err = PTR_ERR(cmd);
836 		goto err;
837 	}
838 
839 	*cmd = MI_BATCH_BUFFER_END;
840 
841 	__i915_gem_object_flush_map(obj, 0, 64);
842 	i915_gem_object_unpin_map(obj);
843 
844 	intel_gt_chipset_flush(to_gt(i915));
845 
846 	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
847 	if (IS_ERR(vma)) {
848 		err = PTR_ERR(vma);
849 		goto err;
850 	}
851 
852 	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
853 	if (err)
854 		goto err;
855 
856 	/* Force the wait now to avoid including it in the benchmark */
857 	err = i915_vma_sync(vma);
858 	if (err)
859 		goto err_pin;
860 
861 	return vma;
862 
863 err_pin:
864 	i915_vma_unpin(vma);
865 err:
866 	i915_gem_object_put(obj);
867 	return ERR_PTR(err);
868 }
869 
870 static struct i915_request *
871 empty_request(struct intel_engine_cs *engine,
872 	      struct i915_vma *batch)
873 {
874 	struct i915_request *request;
875 	int err;
876 
877 	request = i915_request_create(engine->kernel_context);
878 	if (IS_ERR(request))
879 		return request;
880 
881 	err = engine->emit_bb_start(request,
882 				    batch->node.start,
883 				    batch->node.size,
884 				    I915_DISPATCH_SECURE);
885 	if (err)
886 		goto out_request;
887 
888 	i915_request_get(request);
889 out_request:
890 	i915_request_add(request);
891 	return err ? ERR_PTR(err) : request;
892 }
893 
894 static int live_empty_request(void *arg)
895 {
896 	struct drm_i915_private *i915 = arg;
897 	struct intel_engine_cs *engine;
898 	struct igt_live_test t;
899 	struct i915_vma *batch;
900 	int err = 0;
901 
902 	/*
903 	 * Submit various-sized batches of requests that each execute an empty
904 	 * batch buffer, to each engine (individually), and wait for the batch
905 	 * to complete. We can check the overhead of submitting and executing requests.
906 	 */
907 
908 	batch = empty_batch(i915);
909 	if (IS_ERR(batch))
910 		return PTR_ERR(batch);
911 
912 	for_each_uabi_engine(engine, i915) {
913 		IGT_TIMEOUT(end_time);
914 		struct i915_request *request;
915 		unsigned long n, prime;
916 		ktime_t times[2] = {};
917 
918 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
919 		if (err)
920 			goto out_batch;
921 
922 		intel_engine_pm_get(engine);
923 
924 		/* Warmup / preload */
925 		request = empty_request(engine, batch);
926 		if (IS_ERR(request)) {
927 			err = PTR_ERR(request);
928 			intel_engine_pm_put(engine);
929 			goto out_batch;
930 		}
931 		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
932 
933 		for_each_prime_number_from(prime, 1, 8192) {
934 			times[1] = ktime_get_raw();
935 
936 			for (n = 0; n < prime; n++) {
937 				i915_request_put(request);
938 				request = empty_request(engine, batch);
939 				if (IS_ERR(request)) {
940 					err = PTR_ERR(request);
941 					intel_engine_pm_put(engine);
942 					goto out_batch;
943 				}
944 			}
945 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
946 
947 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
948 			if (prime == 1)
949 				times[0] = times[1];
950 
951 			if (__igt_timeout(end_time, NULL))
952 				break;
953 		}
954 		i915_request_put(request);
955 		intel_engine_pm_put(engine);
956 
957 		err = igt_live_test_end(&t);
958 		if (err)
959 			goto out_batch;
960 
961 		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
962 			engine->name,
963 			ktime_to_ns(times[0]),
964 			prime, div64_u64(ktime_to_ns(times[1]), prime));
965 	}
966 
967 out_batch:
968 	i915_vma_unpin(batch);
969 	i915_vma_put(batch);
970 	return err;
971 }
972 
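/*
 * Create a batch buffer whose first instruction is an MI_BATCH_BUFFER_START
 * pointing back at itself, so any request executing it spins on the GPU
 * indefinitely. recursive_batch_resolve() later overwrites that first dword
 * with MI_BATCH_BUFFER_END to let the request(s) complete; the trailing
 * MI_BATCH_BUFFER_END emitted here is only a safety terminator.
 */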
973 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
974 {
975 	struct drm_i915_gem_object *obj;
976 	const int ver = GRAPHICS_VER(i915);
977 	struct i915_vma *vma;
978 	u32 *cmd;
979 	int err;
980 
981 	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
982 	if (IS_ERR(obj))
983 		return ERR_CAST(obj);
984 
985 	vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL);
986 	if (IS_ERR(vma)) {
987 		err = PTR_ERR(vma);
988 		goto err;
989 	}
990 
991 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
992 	if (err)
993 		goto err;
994 
995 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
996 	if (IS_ERR(cmd)) {
997 		err = PTR_ERR(cmd);
998 		goto err;
999 	}
1000 
1001 	if (ver >= 8) {
1002 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1003 		*cmd++ = lower_32_bits(vma->node.start);
1004 		*cmd++ = upper_32_bits(vma->node.start);
1005 	} else if (ver >= 6) {
1006 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1007 		*cmd++ = lower_32_bits(vma->node.start);
1008 	} else {
1009 		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1010 		*cmd++ = lower_32_bits(vma->node.start);
1011 	}
1012 	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1013 
1014 	__i915_gem_object_flush_map(obj, 0, 64);
1015 	i915_gem_object_unpin_map(obj);
1016 
1017 	intel_gt_chipset_flush(to_gt(i915));
1018 
1019 	return vma;
1020 
1021 err:
1022 	i915_gem_object_put(obj);
1023 	return ERR_PTR(err);
1024 }
1025 
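/* Terminate the self-referencing batch above, releasing any request spinning on it. */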
1026 static int recursive_batch_resolve(struct i915_vma *batch)
1027 {
1028 	u32 *cmd;
1029 
1030 	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1031 	if (IS_ERR(cmd))
1032 		return PTR_ERR(cmd);
1033 
1034 	*cmd = MI_BATCH_BUFFER_END;
1035 
1036 	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1037 	i915_gem_object_unpin_map(batch->obj);
1038 
1039 	intel_gt_chipset_flush(batch->vm->gt);
1040 
1041 	return 0;
1042 }
1043 
1044 static int live_all_engines(void *arg)
1045 {
1046 	struct drm_i915_private *i915 = arg;
1047 	const unsigned int nengines = num_uabi_engines(i915);
1048 	struct intel_engine_cs *engine;
1049 	struct i915_request **request;
1050 	struct igt_live_test t;
1051 	struct i915_vma *batch;
1052 	unsigned int idx;
1053 	int err;
1054 
1055 	/*
1056 	 * Check we can submit requests to all engines simultaneously. We
1057 	 * send a recursive batch to each engine - checking that we don't
1058 	 * block doing so, and that they don't complete too soon.
1059 	 */
1060 
1061 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1062 	if (!request)
1063 		return -ENOMEM;
1064 
1065 	err = igt_live_test_begin(&t, i915, __func__, "");
1066 	if (err)
1067 		goto out_free;
1068 
1069 	batch = recursive_batch(i915);
1070 	if (IS_ERR(batch)) {
1071 		err = PTR_ERR(batch);
1072 		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1073 		goto out_free;
1074 	}
1075 
1076 	i915_vma_lock(batch);
1077 
1078 	idx = 0;
1079 	for_each_uabi_engine(engine, i915) {
1080 		request[idx] = intel_engine_create_kernel_request(engine);
1081 		if (IS_ERR(request[idx])) {
1082 			err = PTR_ERR(request[idx]);
1083 			pr_err("%s: Request allocation failed with err=%d\n",
1084 			       __func__, err);
1085 			goto out_request;
1086 		}
1087 
1088 		err = i915_request_await_object(request[idx], batch->obj, 0);
1089 		if (err == 0)
1090 			err = i915_vma_move_to_active(batch, request[idx], 0);
1091 		GEM_BUG_ON(err);
1092 
1093 		err = engine->emit_bb_start(request[idx],
1094 					    batch->node.start,
1095 					    batch->node.size,
1096 					    0);
1097 		GEM_BUG_ON(err);
1098 		request[idx]->batch = batch;
1099 
1100 		i915_request_get(request[idx]);
1101 		i915_request_add(request[idx]);
1102 		idx++;
1103 	}
1104 
1105 	i915_vma_unlock(batch);
1106 
1107 	idx = 0;
1108 	for_each_uabi_engine(engine, i915) {
1109 		if (i915_request_completed(request[idx])) {
1110 			pr_err("%s(%s): request completed too early!\n",
1111 			       __func__, engine->name);
1112 			err = -EINVAL;
1113 			goto out_request;
1114 		}
1115 		idx++;
1116 	}
1117 
1118 	err = recursive_batch_resolve(batch);
1119 	if (err) {
1120 		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1121 		goto out_request;
1122 	}
1123 
1124 	idx = 0;
1125 	for_each_uabi_engine(engine, i915) {
1126 		long timeout;
1127 
1128 		timeout = i915_request_wait(request[idx], 0,
1129 					    MAX_SCHEDULE_TIMEOUT);
1130 		if (timeout < 0) {
1131 			err = timeout;
1132 			pr_err("%s: error waiting for request on %s, err=%d\n",
1133 			       __func__, engine->name, err);
1134 			goto out_request;
1135 		}
1136 
1137 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1138 		i915_request_put(request[idx]);
1139 		request[idx] = NULL;
1140 		idx++;
1141 	}
1142 
1143 	err = igt_live_test_end(&t);
1144 
1145 out_request:
1146 	idx = 0;
1147 	for_each_uabi_engine(engine, i915) {
1148 		if (request[idx])
1149 			i915_request_put(request[idx]);
1150 		idx++;
1151 	}
1152 	i915_vma_unpin(batch);
1153 	i915_vma_put(batch);
1154 out_free:
1155 	kfree(request);
1156 	return err;
1157 }
1158 
1159 static int live_sequential_engines(void *arg)
1160 {
1161 	struct drm_i915_private *i915 = arg;
1162 	const unsigned int nengines = num_uabi_engines(i915);
1163 	struct i915_request **request;
1164 	struct i915_request *prev = NULL;
1165 	struct intel_engine_cs *engine;
1166 	struct igt_live_test t;
1167 	unsigned int idx;
1168 	int err;
1169 
1170 	/*
1171 	 * Check we can submit requests to all engines sequentially, such
1172 	 * that each successive request waits for the earlier ones. This
1173 	 * tests that we don't execute requests out of order, even though
1174 	 * they are running on independent engines.
1175 	 */
1176 
1177 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1178 	if (!request)
1179 		return -ENOMEM;
1180 
1181 	err = igt_live_test_begin(&t, i915, __func__, "");
1182 	if (err)
1183 		goto out_free;
1184 
1185 	idx = 0;
1186 	for_each_uabi_engine(engine, i915) {
1187 		struct i915_vma *batch;
1188 
1189 		batch = recursive_batch(i915);
1190 		if (IS_ERR(batch)) {
1191 			err = PTR_ERR(batch);
1192 			pr_err("%s: Unable to create batch for %s, err=%d\n",
1193 			       __func__, engine->name, err);
1194 			goto out_free;
1195 		}
1196 
1197 		i915_vma_lock(batch);
1198 		request[idx] = intel_engine_create_kernel_request(engine);
1199 		if (IS_ERR(request[idx])) {
1200 			err = PTR_ERR(request[idx]);
1201 			pr_err("%s: Request allocation failed for %s with err=%d\n",
1202 			       __func__, engine->name, err);
1203 			goto out_unlock;
1204 		}
1205 
1206 		if (prev) {
1207 			err = i915_request_await_dma_fence(request[idx],
1208 							   &prev->fence);
1209 			if (err) {
1210 				i915_request_add(request[idx]);
1211 				pr_err("%s: Request await failed for %s with err=%d\n",
1212 				       __func__, engine->name, err);
1213 				goto out_unlock;
1214 			}
1215 		}
1216 
1217 		err = i915_request_await_object(request[idx],
1218 						batch->obj, false);
1219 		if (err == 0)
1220 			err = i915_vma_move_to_active(batch, request[idx], 0);
1221 		GEM_BUG_ON(err);
1222 
1223 		err = engine->emit_bb_start(request[idx],
1224 					    batch->node.start,
1225 					    batch->node.size,
1226 					    0);
1227 		GEM_BUG_ON(err);
1228 		request[idx]->batch = batch;
1229 
1230 		i915_request_get(request[idx]);
1231 		i915_request_add(request[idx]);
1232 
1233 		prev = request[idx];
1234 		idx++;
1235 
1236 out_unlock:
1237 		i915_vma_unlock(batch);
1238 		if (err)
1239 			goto out_request;
1240 	}
1241 
1242 	idx = 0;
1243 	for_each_uabi_engine(engine, i915) {
1244 		long timeout;
1245 
1246 		if (i915_request_completed(request[idx])) {
1247 			pr_err("%s(%s): request completed too early!\n",
1248 			       __func__, engine->name);
1249 			err = -EINVAL;
1250 			goto out_request;
1251 		}
1252 
1253 		err = recursive_batch_resolve(request[idx]->batch);
1254 		if (err) {
1255 			pr_err("%s: failed to resolve batch, err=%d\n",
1256 			       __func__, err);
1257 			goto out_request;
1258 		}
1259 
1260 		timeout = i915_request_wait(request[idx], 0,
1261 					    MAX_SCHEDULE_TIMEOUT);
1262 		if (timeout < 0) {
1263 			err = timeout;
1264 			pr_err("%s: error waiting for request on %s, err=%d\n",
1265 			       __func__, engine->name, err);
1266 			goto out_request;
1267 		}
1268 
1269 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1270 		idx++;
1271 	}
1272 
1273 	err = igt_live_test_end(&t);
1274 
1275 out_request:
1276 	idx = 0;
1277 	for_each_uabi_engine(engine, i915) {
1278 		u32 *cmd;
1279 
1280 		if (!request[idx])
1281 			break;
1282 
1283 		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1284 						       I915_MAP_WC);
1285 		if (!IS_ERR(cmd)) {
1286 			*cmd = MI_BATCH_BUFFER_END;
1287 
1288 			__i915_gem_object_flush_map(request[idx]->batch->obj,
1289 						    0, sizeof(*cmd));
1290 			i915_gem_object_unpin_map(request[idx]->batch->obj);
1291 
1292 			intel_gt_chipset_flush(engine->gt);
1293 		}
1294 
1295 		i915_vma_put(request[idx]->batch);
1296 		i915_request_put(request[idx]);
1297 		idx++;
1298 	}
1299 out_free:
1300 	kfree(request);
1301 	return err;
1302 }
1303 
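/*
 * Per-engine workers for live_parallel_engines: engine1 submits one request
 * at a time and waits for each to complete, engineN fires requests
 * back-to-back without waiting, and the spin variant parks a non-preemptible
 * spinner on its engine until every engine's thread has checked in.
 */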
1304 static int __live_parallel_engine1(void *arg)
1305 {
1306 	struct intel_engine_cs *engine = arg;
1307 	IGT_TIMEOUT(end_time);
1308 	unsigned long count;
1309 	int err = 0;
1310 
1311 	count = 0;
1312 	intel_engine_pm_get(engine);
1313 	do {
1314 		struct i915_request *rq;
1315 
1316 		rq = i915_request_create(engine->kernel_context);
1317 		if (IS_ERR(rq)) {
1318 			err = PTR_ERR(rq);
1319 			break;
1320 		}
1321 
1322 		i915_request_get(rq);
1323 		i915_request_add(rq);
1324 
1325 		err = 0;
1326 		if (i915_request_wait(rq, 0, HZ) < 0)
1327 			err = -ETIME;
1328 		i915_request_put(rq);
1329 		if (err)
1330 			break;
1331 
1332 		count++;
1333 	} while (!__igt_timeout(end_time, NULL));
1334 	intel_engine_pm_put(engine);
1335 
1336 	pr_info("%s: %lu requests + sync\n", engine->name, count);
1337 	return err;
1338 }
1339 
1340 static int __live_parallel_engineN(void *arg)
1341 {
1342 	struct intel_engine_cs *engine = arg;
1343 	IGT_TIMEOUT(end_time);
1344 	unsigned long count;
1345 	int err = 0;
1346 
1347 	count = 0;
1348 	intel_engine_pm_get(engine);
1349 	do {
1350 		struct i915_request *rq;
1351 
1352 		rq = i915_request_create(engine->kernel_context);
1353 		if (IS_ERR(rq)) {
1354 			err = PTR_ERR(rq);
1355 			break;
1356 		}
1357 
1358 		i915_request_add(rq);
1359 		count++;
1360 	} while (!__igt_timeout(end_time, NULL));
1361 	intel_engine_pm_put(engine);
1362 
1363 	pr_info("%s: %lu requests\n", engine->name, count);
1364 	return err;
1365 }
1366 
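/*
 * wake_all()/wait_for_all() form a simple barrier across the parallel engine
 * threads using i915->selftest.counter: each thread decrements the counter on
 * arrival, and the last to arrive wakes everyone waiting on it.
 */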
1367 static bool wake_all(struct drm_i915_private *i915)
1368 {
1369 	if (atomic_dec_and_test(&i915->selftest.counter)) {
1370 		wake_up_var(&i915->selftest.counter);
1371 		return true;
1372 	}
1373 
1374 	return false;
1375 }
1376 
1377 static int wait_for_all(struct drm_i915_private *i915)
1378 {
1379 	if (wake_all(i915))
1380 		return 0;
1381 
1382 	if (wait_var_event_timeout(&i915->selftest.counter,
1383 				   !atomic_read(&i915->selftest.counter),
1384 				   i915_selftest.timeout_jiffies))
1385 		return 0;
1386 
1387 	return -ETIME;
1388 }
1389 
1390 static int __live_parallel_spin(void *arg)
1391 {
1392 	struct intel_engine_cs *engine = arg;
1393 	struct igt_spinner spin;
1394 	struct i915_request *rq;
1395 	int err = 0;
1396 
1397 	/*
1398 	 * Create a spinner running for eternity on each engine. If a second
1399 	 * spinner is incorrectly placed on the same engine, it will not be
1400 	 * able to start in time.
1401 	 */
1402 
1403 	if (igt_spinner_init(&spin, engine->gt)) {
1404 		wake_all(engine->i915);
1405 		return -ENOMEM;
1406 	}
1407 
1408 	intel_engine_pm_get(engine);
1409 	rq = igt_spinner_create_request(&spin,
1410 					engine->kernel_context,
1411 					MI_NOOP); /* no preemption */
1412 	intel_engine_pm_put(engine);
1413 	if (IS_ERR(rq)) {
1414 		err = PTR_ERR(rq);
1415 		if (err == -ENODEV)
1416 			err = 0;
1417 		wake_all(engine->i915);
1418 		goto out_spin;
1419 	}
1420 
1421 	i915_request_get(rq);
1422 	i915_request_add(rq);
1423 	if (igt_wait_for_spinner(&spin, rq)) {
1424 		/* Occupy this engine for the whole test */
1425 		err = wait_for_all(engine->i915);
1426 	} else {
1427 		pr_err("Failed to start spinner on %s\n", engine->name);
1428 		err = -EINVAL;
1429 	}
1430 	igt_spinner_end(&spin);
1431 
1432 	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1433 		err = -EIO;
1434 	i915_request_put(rq);
1435 
1436 out_spin:
1437 	igt_spinner_fini(&spin);
1438 	return err;
1439 }
1440 
1441 static int live_parallel_engines(void *arg)
1442 {
1443 	struct drm_i915_private *i915 = arg;
1444 	static int (* const func[])(void *arg) = {
1445 		__live_parallel_engine1,
1446 		__live_parallel_engineN,
1447 		__live_parallel_spin,
1448 		NULL,
1449 	};
1450 	const unsigned int nengines = num_uabi_engines(i915);
1451 	struct intel_engine_cs *engine;
1452 	int (* const *fn)(void *arg);
1453 	struct task_struct **tsk;
1454 	int err = 0;
1455 
1456 	/*
1457 	 * Check we can submit requests to all engines concurrently. This
1458 	 * tests that we load up the system maximally.
1459 	 */
1460 
1461 	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1462 	if (!tsk)
1463 		return -ENOMEM;
1464 
1465 	for (fn = func; !err && *fn; fn++) {
1466 		char name[KSYM_NAME_LEN];
1467 		struct igt_live_test t;
1468 		unsigned int idx;
1469 
1470 		snprintf(name, sizeof(name), "%ps", *fn);
1471 		err = igt_live_test_begin(&t, i915, __func__, name);
1472 		if (err)
1473 			break;
1474 
1475 		atomic_set(&i915->selftest.counter, nengines);
1476 
1477 		idx = 0;
1478 		for_each_uabi_engine(engine, i915) {
1479 			tsk[idx] = kthread_run(*fn, engine,
1480 					       "igt/parallel:%s",
1481 					       engine->name);
1482 			if (IS_ERR(tsk[idx])) {
1483 				err = PTR_ERR(tsk[idx]);
1484 				break;
1485 			}
1486 			get_task_struct(tsk[idx++]);
1487 		}
1488 
1489 		yield(); /* start all threads before we kthread_stop() */
1490 
1491 		idx = 0;
1492 		for_each_uabi_engine(engine, i915) {
1493 			int status;
1494 
1495 			if (IS_ERR(tsk[idx]))
1496 				break;
1497 
1498 			status = kthread_stop(tsk[idx]);
1499 			if (status && !err)
1500 				err = status;
1501 
1502 			put_task_struct(tsk[idx++]);
1503 		}
1504 
1505 		if (igt_live_test_end(&t))
1506 			err = -EIO;
1507 	}
1508 
1509 	kfree(tsk);
1510 	return err;
1511 }
1512 
1513 static int
1514 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1515 {
1516 	struct i915_request *rq;
1517 	int ret;
1518 
1519 	/*
1520 	 * Before execlists, all contexts share the same ringbuffer. With
1521 	 * execlists, each context/engine has a separate ringbuffer and
1522 	 * for the purposes of this test, inexhaustible.
1523 	 *
1524 	 * For the global ringbuffer though, we have to be very careful
1525 	 * that we do not wrap while preventing the execution of requests
1526 	 * with an unsignaled fence.
1527 	 */
1528 	if (HAS_EXECLISTS(ctx->i915))
1529 		return INT_MAX;
1530 
1531 	rq = igt_request_alloc(ctx, engine);
1532 	if (IS_ERR(rq)) {
1533 		ret = PTR_ERR(rq);
1534 	} else {
1535 		int sz;
1536 
1537 		ret = rq->ring->size - rq->reserved_space;
1538 		i915_request_add(rq);
1539 
1540 		sz = rq->ring->emit - rq->head;
1541 		if (sz < 0)
1542 			sz += rq->ring->size;
1543 		ret /= sz;
1544 		ret /= 2; /* leave half spare, in case of emergency! */
1545 	}
1546 
1547 	return ret;
1548 }
1549 
1550 static int live_breadcrumbs_smoketest(void *arg)
1551 {
1552 	struct drm_i915_private *i915 = arg;
1553 	const unsigned int nengines = num_uabi_engines(i915);
1554 	const unsigned int ncpus = num_online_cpus();
1555 	unsigned long num_waits, num_fences;
1556 	struct intel_engine_cs *engine;
1557 	struct task_struct **threads;
1558 	struct igt_live_test live;
1559 	intel_wakeref_t wakeref;
1560 	struct smoketest *smoke;
1561 	unsigned int n, idx;
1562 	struct file *file;
1563 	int ret = 0;
1564 
1565 	/*
1566 	 * Smoketest our breadcrumb/signal handling for requests across multiple
1567 	 * threads. A very simple test to only catch the most egregious of bugs.
1568 	 * See __igt_breadcrumbs_smoketest();
1569 	 *
1570 	 * On real hardware this time.
1571 	 */
1572 
1573 	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1574 
1575 	file = mock_file(i915);
1576 	if (IS_ERR(file)) {
1577 		ret = PTR_ERR(file);
1578 		goto out_rpm;
1579 	}
1580 
1581 	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1582 	if (!smoke) {
1583 		ret = -ENOMEM;
1584 		goto out_file;
1585 	}
1586 
1587 	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1588 	if (!threads) {
1589 		ret = -ENOMEM;
1590 		goto out_smoke;
1591 	}
1592 
1593 	smoke[0].request_alloc = __live_request_alloc;
1594 	smoke[0].ncontexts = 64;
1595 	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1596 				    sizeof(*smoke[0].contexts),
1597 				    GFP_KERNEL);
1598 	if (!smoke[0].contexts) {
1599 		ret = -ENOMEM;
1600 		goto out_threads;
1601 	}
1602 
1603 	for (n = 0; n < smoke[0].ncontexts; n++) {
1604 		smoke[0].contexts[n] = live_context(i915, file);
1605 		if (IS_ERR(smoke[0].contexts[n])) {
1606 			ret = PTR_ERR(smoke[0].contexts[n]);
1607 			goto out_contexts;
1608 		}
1609 	}
1610 
1611 	ret = igt_live_test_begin(&live, i915, __func__, "");
1612 	if (ret)
1613 		goto out_contexts;
1614 
1615 	idx = 0;
1616 	for_each_uabi_engine(engine, i915) {
1617 		smoke[idx] = smoke[0];
1618 		smoke[idx].engine = engine;
1619 		smoke[idx].max_batch =
1620 			max_batches(smoke[0].contexts[0], engine);
1621 		if (smoke[idx].max_batch < 0) {
1622 			ret = smoke[idx].max_batch;
1623 			goto out_flush;
1624 		}
1625 		/* One ring interleaved between requests from all cpus */
1626 		smoke[idx].max_batch /= num_online_cpus() + 1;
1627 		pr_debug("Limiting batches to %d requests on %s\n",
1628 			 smoke[idx].max_batch, engine->name);
1629 
1630 		for (n = 0; n < ncpus; n++) {
1631 			struct task_struct *tsk;
1632 
1633 			tsk = kthread_run(__igt_breadcrumbs_smoketest,
1634 					  &smoke[idx], "igt/%d.%d", idx, n);
1635 			if (IS_ERR(tsk)) {
1636 				ret = PTR_ERR(tsk);
1637 				goto out_flush;
1638 			}
1639 
1640 			get_task_struct(tsk);
1641 			threads[idx * ncpus + n] = tsk;
1642 		}
1643 
1644 		idx++;
1645 	}
1646 
1647 	yield(); /* start all threads before we begin */
1648 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1649 
1650 out_flush:
1651 	idx = 0;
1652 	num_waits = 0;
1653 	num_fences = 0;
1654 	for_each_uabi_engine(engine, i915) {
1655 		for (n = 0; n < ncpus; n++) {
1656 			struct task_struct *tsk = threads[idx * ncpus + n];
1657 			int err;
1658 
1659 			if (!tsk)
1660 				continue;
1661 
1662 			err = kthread_stop(tsk);
1663 			if (err < 0 && !ret)
1664 				ret = err;
1665 
1666 			put_task_struct(tsk);
1667 		}
1668 
1669 		num_waits += atomic_long_read(&smoke[idx].num_waits);
1670 		num_fences += atomic_long_read(&smoke[idx].num_fences);
1671 		idx++;
1672 	}
1673 	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1674 		num_waits, num_fences, idx, ncpus);
1675 
1676 	ret = igt_live_test_end(&live) ?: ret;
1677 out_contexts:
1678 	kfree(smoke[0].contexts);
1679 out_threads:
1680 	kfree(threads);
1681 out_smoke:
1682 	kfree(smoke);
1683 out_file:
1684 	fput(file);
1685 out_rpm:
1686 	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1687 
1688 	return ret;
1689 }
1690 
1691 int i915_request_live_selftests(struct drm_i915_private *i915)
1692 {
1693 	static const struct i915_subtest tests[] = {
1694 		SUBTEST(live_nop_request),
1695 		SUBTEST(live_all_engines),
1696 		SUBTEST(live_sequential_engines),
1697 		SUBTEST(live_parallel_engines),
1698 		SUBTEST(live_empty_request),
1699 		SUBTEST(live_cancel_request),
1700 		SUBTEST(live_breadcrumbs_smoketest),
1701 	};
1702 
1703 	if (intel_gt_is_wedged(to_gt(i915)))
1704 		return 0;
1705 
1706 	return i915_subtests(tests, i915);
1707 }
1708 
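/*
 * Drain a context by queuing a kernel-context request behind its last
 * submission and waiting for that to complete, then, if all went well, flush
 * submission until the engine reports idle. Any error already in @err is
 * preserved and returned.
 */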
1709 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1710 {
1711 	struct i915_request *rq;
1712 	struct dma_fence *fence;
1713 
1714 	rq = intel_engine_create_kernel_request(ce->engine);
1715 	if (IS_ERR(rq))
1716 		return PTR_ERR(rq);
1717 
1718 	fence = i915_active_fence_get(&ce->timeline->last_request);
1719 	if (fence) {
1720 		i915_request_await_dma_fence(rq, fence);
1721 		dma_fence_put(fence);
1722 	}
1723 
1724 	rq = i915_request_get(rq);
1725 	i915_request_add(rq);
1726 	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1727 		err = -ETIME;
1728 	i915_request_put(rq);
1729 
1730 	while (!err && !intel_engine_is_idle(ce->engine))
1731 		intel_engine_flush_submission(ce->engine);
1732 
1733 	return err;
1734 }
1735 
1736 struct perf_stats {
1737 	struct intel_engine_cs *engine;
1738 	unsigned long count;
1739 	ktime_t time;
1740 	ktime_t busy;
1741 	u64 runtime;
1742 };
1743 
1744 struct perf_series {
1745 	struct drm_i915_private *i915;
1746 	unsigned int nengines;
1747 	struct intel_context *ce[];
1748 };
1749 
1750 static int cmp_u32(const void *A, const void *B)
1751 {
1752 	const u32 *a = A, *b = B;
1753 
1754 	return *a - *b;
1755 }
1756 
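/*
 * Reduce TF_COUNT (5) samples to one value by sorting them and summing the
 * median with double weight plus its two neighbours, i.e. a weighted total of
 * four samples. The result is therefore scaled by four; cycles_to_ns() and
 * the pr_info() callers undo that scaling with a shift/divide by 1 << TF_BIAS.
 * For example, samples {3, 5, 7, 9, 100} give 5 + 2 * 7 + 9 = 28, reported as
 * 28 >> 2 = 7.
 */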
1757 static u32 trifilter(u32 *a)
1758 {
1759 	u64 sum;
1760 
1761 #define TF_COUNT 5
1762 	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1763 
1764 	sum = mul_u32_u32(a[2], 2);
1765 	sum += a[1];
1766 	sum += a[3];
1767 
1768 	GEM_BUG_ON(sum > U32_MAX);
1769 	return sum;
1770 #define TF_BIAS 2
1771 }
1772 
1773 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1774 {
1775 	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1776 
1777 	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1778 }
1779 
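/*
 * Helpers to emit raw commands into a request's ring: store the engine's
 * RING_TIMESTAMP register to a GGTT offset, store an immediate dword, and
 * poll a GGTT semaphore until the requested comparison against a value holds.
 */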
1780 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1781 {
1782 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1783 	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1784 	*cs++ = offset;
1785 	*cs++ = 0;
1786 
1787 	return cs;
1788 }
1789 
1790 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1791 {
1792 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1793 	*cs++ = offset;
1794 	*cs++ = 0;
1795 	*cs++ = value;
1796 
1797 	return cs;
1798 }
1799 
1800 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1801 {
1802 	*cs++ = MI_SEMAPHORE_WAIT |
1803 		MI_SEMAPHORE_GLOBAL_GTT |
1804 		MI_SEMAPHORE_POLL |
1805 		mode;
1806 	*cs++ = value;
1807 	*cs++ = offset;
1808 	*cs++ = 0;
1809 
1810 	return cs;
1811 }
1812 
1813 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1814 {
1815 	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1816 }
1817 
1818 static void semaphore_set(u32 *sema, u32 value)
1819 {
1820 	WRITE_ONCE(*sema, value);
1821 	wmb(); /* flush the update to the cache, and beyond */
1822 }
1823 
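/*
 * Borrow 21 dwords of scratch space in the engine's hardware status page (at
 * an offset assumed to be unused by the driver itself) for the semaphores and
 * timestamps written by the measurements below; hwsp_offset() converts a CPU
 * pointer into that area back into its GGTT address for the command streamer.
 */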
1824 static u32 *hwsp_scratch(const struct intel_context *ce)
1825 {
1826 	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1827 }
1828 
1829 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1830 {
1831 	return (i915_ggtt_offset(ce->engine->status_page.vma) +
1832 		offset_in_page(dw));
1833 }
1834 
1835 static int measure_semaphore_response(struct intel_context *ce)
1836 {
1837 	u32 *sema = hwsp_scratch(ce);
1838 	const u32 offset = hwsp_offset(ce, sema);
1839 	u32 elapsed[TF_COUNT], cycles;
1840 	struct i915_request *rq;
1841 	u32 *cs;
1842 	int err;
1843 	int i;
1844 
1845 	/*
1846 	 * Measure how many cycles it takes for the HW to detect the change
1847 	 * in a semaphore value.
1848 	 *
1849 	 *    A: read CS_TIMESTAMP from CPU
1850 	 *    poke semaphore
1851 	 *    B: read CS_TIMESTAMP on GPU
1852 	 *
1853 	 * Semaphore latency: B - A
1854 	 */
1855 
1856 	semaphore_set(sema, -1);
1857 
1858 	rq = i915_request_create(ce);
1859 	if (IS_ERR(rq))
1860 		return PTR_ERR(rq);
1861 
1862 	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1863 	if (IS_ERR(cs)) {
1864 		i915_request_add(rq);
1865 		err = PTR_ERR(cs);
1866 		goto err;
1867 	}
1868 
1869 	cs = emit_store_dw(cs, offset, 0);
1870 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1871 		cs = emit_semaphore_poll_until(cs, offset, i);
1872 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1873 		cs = emit_store_dw(cs, offset, 0);
1874 	}
1875 
1876 	intel_ring_advance(rq, cs);
1877 	i915_request_add(rq);
1878 
1879 	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1880 		err = -EIO;
1881 		goto err;
1882 	}
1883 
1884 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1885 		preempt_disable();
1886 		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1887 		semaphore_set(sema, i);
1888 		preempt_enable();
1889 
1890 		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1891 			err = -EIO;
1892 			goto err;
1893 		}
1894 
1895 		elapsed[i - 1] = sema[i] - cycles;
1896 	}
1897 
1898 	cycles = trifilter(elapsed);
1899 	pr_info("%s: semaphore response %d cycles, %lluns\n",
1900 		ce->engine->name, cycles >> TF_BIAS,
1901 		cycles_to_ns(ce->engine, cycles));
1902 
1903 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1904 
1905 err:
1906 	intel_gt_set_wedged(ce->engine->gt);
1907 	return err;
1908 }
1909 
1910 static int measure_idle_dispatch(struct intel_context *ce)
1911 {
1912 	u32 *sema = hwsp_scratch(ce);
1913 	const u32 offset = hwsp_offset(ce, sema);
1914 	u32 elapsed[TF_COUNT], cycles;
1915 	u32 *cs;
1916 	int err;
1917 	int i;
1918 
1919 	/*
1920 	 * Measure how long it takes for us to submit a request while the
1921 	 * engine is idle, but is resting in our context.
1922 	 *
1923 	 *    A: read CS_TIMESTAMP from CPU
1924 	 *    submit request
1925 	 *    B: read CS_TIMESTAMP on GPU
1926 	 *
1927 	 * Submission latency: B - A
1928 	 */
1929 
1930 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1931 		struct i915_request *rq;
1932 
1933 		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1934 		if (err)
1935 			return err;
1936 
1937 		rq = i915_request_create(ce);
1938 		if (IS_ERR(rq)) {
1939 			err = PTR_ERR(rq);
1940 			goto err;
1941 		}
1942 
1943 		cs = intel_ring_begin(rq, 4);
1944 		if (IS_ERR(cs)) {
1945 			i915_request_add(rq);
1946 			err = PTR_ERR(cs);
1947 			goto err;
1948 		}
1949 
1950 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1951 
1952 		intel_ring_advance(rq, cs);
1953 
1954 		preempt_disable();
1955 		local_bh_disable();
1956 		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1957 		i915_request_add(rq);
1958 		local_bh_enable();
1959 		preempt_enable();
1960 	}
1961 
1962 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1963 	if (err)
1964 		goto err;
1965 
1966 	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1967 		elapsed[i] = sema[i] - elapsed[i];
1968 
1969 	cycles = trifilter(elapsed);
1970 	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1971 		ce->engine->name, cycles >> TF_BIAS,
1972 		cycles_to_ns(ce->engine, cycles));
1973 
1974 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1975 
1976 err:
1977 	intel_gt_set_wedged(ce->engine->gt);
1978 	return err;
1979 }
1980 
1981 static int measure_busy_dispatch(struct intel_context *ce)
1982 {
1983 	u32 *sema = hwsp_scratch(ce);
1984 	const u32 offset = hwsp_offset(ce, sema);
1985 	u32 elapsed[TF_COUNT + 1], cycles;
1986 	u32 *cs;
1987 	int err;
1988 	int i;
1989 
1990 	/*
1991 	 * Measure how long it takes for us to submit a request while the
1992 	 * engine is busy, polling on a semaphore in our context. With
1993 	 * direct submission, this will include the cost of a lite restore.
1994 	 *
1995 	 *    A: read CS_TIMESTAMP from CPU
1996 	 *    submit request
1997 	 *    B: read CS_TIMESTAMP on GPU
1998 	 *
1999 	 * Submission latency: B - A
2000 	 */
2001 
2002 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2003 		struct i915_request *rq;
2004 
2005 		rq = i915_request_create(ce);
2006 		if (IS_ERR(rq)) {
2007 			err = PTR_ERR(rq);
2008 			goto err;
2009 		}
2010 
2011 		cs = intel_ring_begin(rq, 12);
2012 		if (IS_ERR(cs)) {
2013 			i915_request_add(rq);
2014 			err = PTR_ERR(cs);
2015 			goto err;
2016 		}
2017 
2018 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2019 		cs = emit_semaphore_poll_until(cs, offset, i);
2020 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2021 
2022 		intel_ring_advance(rq, cs);
2023 
2024 		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2025 			err = -EIO;
2026 			goto err;
2027 		}
2028 
2029 		preempt_disable();
2030 		local_bh_disable();
2031 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2032 		i915_request_add(rq);
2033 		local_bh_enable();
2034 		semaphore_set(sema, i - 1);
2035 		preempt_enable();
2036 	}
2037 
2038 	wait_for(READ_ONCE(sema[i - 1]), 500);
2039 	semaphore_set(sema, i - 1);
2040 
2041 	for (i = 1; i <= TF_COUNT; i++) {
2042 		GEM_BUG_ON(sema[i] == -1);
2043 		elapsed[i - 1] = sema[i] - elapsed[i];
2044 	}
2045 
2046 	cycles = trifilter(elapsed);
2047 	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2048 		ce->engine->name, cycles >> TF_BIAS,
2049 		cycles_to_ns(ce->engine, cycles));
2050 
2051 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2052 
2053 err:
2054 	intel_gt_set_wedged(ce->engine->gt);
2055 	return err;
2056 }
2057 
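/*
 * Keep an engine busy by submitting a kernel-context request that polls a
 * status-page semaphore; the engine is released once the CPU updates the
 * semaphore (see semaphore_set()) so that the wait condition is satisfied.
 */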
2058 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2059 {
2060 	const u32 offset =
2061 		i915_ggtt_offset(engine->status_page.vma) +
2062 		offset_in_page(sema);
2063 	struct i915_request *rq;
2064 	u32 *cs;
2065 
2066 	rq = i915_request_create(engine->kernel_context);
2067 	if (IS_ERR(rq))
2068 		return PTR_ERR(rq);
2069 
2070 	cs = intel_ring_begin(rq, 4);
2071 	if (IS_ERR(cs)) {
2072 		i915_request_add(rq);
2073 		return PTR_ERR(cs);
2074 	}
2075 
2076 	cs = emit_semaphore_poll(cs, mode, value, offset);
2077 
2078 	intel_ring_advance(rq, cs);
2079 	i915_request_add(rq);
2080 
2081 	return 0;
2082 }
2083 
2084 static int measure_inter_request(struct intel_context *ce)
2085 {
2086 	u32 *sema = hwsp_scratch(ce);
2087 	const u32 offset = hwsp_offset(ce, sema);
2088 	u32 elapsed[TF_COUNT + 1], cycles;
2089 	struct i915_sw_fence *submit;
2090 	int i, err;
2091 
2092 	/*
2093 	 * Measure how long it takes to advance from one request into the
2094 	 * next. Between each request we flush the GPU caches to memory,
2095 	 * update the breadcrumbs, and then invalidate those caches.
2096 	 * We queue up all the requests to be submitted in one batch so
2097 	 * it should be one set of contiguous measurements.
2098 	 *
2099 	 *    A: read CS_TIMESTAMP on GPU
2100 	 *    advance request
2101 	 *    B: read CS_TIMESTAMP on GPU
2102 	 *
2103 	 * Request latency: B - A
2104 	 */
2105 
2106 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2107 	if (err)
2108 		return err;
2109 
2110 	submit = heap_fence_create(GFP_KERNEL);
2111 	if (!submit) {
2112 		semaphore_set(sema, 1);
2113 		return -ENOMEM;
2114 	}
2115 
2116 	intel_engine_flush_submission(ce->engine);
2117 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2118 		struct i915_request *rq;
2119 		u32 *cs;
2120 
2121 		rq = i915_request_create(ce);
2122 		if (IS_ERR(rq)) {
2123 			err = PTR_ERR(rq);
2124 			goto err_submit;
2125 		}
2126 
2127 		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2128 						       submit,
2129 						       GFP_KERNEL);
2130 		if (err < 0) {
2131 			i915_request_add(rq);
2132 			goto err_submit;
2133 		}
2134 
2135 		cs = intel_ring_begin(rq, 4);
2136 		if (IS_ERR(cs)) {
2137 			i915_request_add(rq);
2138 			err = PTR_ERR(cs);
2139 			goto err_submit;
2140 		}
2141 
2142 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2143 
2144 		intel_ring_advance(rq, cs);
2145 		i915_request_add(rq);
2146 	}
2147 	i915_sw_fence_commit(submit);
2148 	intel_engine_flush_submission(ce->engine);
2149 	heap_fence_put(submit);
2150 
2151 	semaphore_set(sema, 1);
2152 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2153 	if (err)
2154 		goto err;
2155 
2156 	for (i = 1; i <= TF_COUNT; i++)
2157 		elapsed[i - 1] = sema[i + 1] - sema[i];
2158 
2159 	cycles = trifilter(elapsed);
2160 	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2161 		ce->engine->name, cycles >> TF_BIAS,
2162 		cycles_to_ns(ce->engine, cycles));
2163 
2164 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2165 
2166 err_submit:
2167 	i915_sw_fence_commit(submit);
2168 	heap_fence_put(submit);
2169 	semaphore_set(sema, 1);
2170 err:
2171 	intel_gt_set_wedged(ce->engine->gt);
2172 	return err;
2173 }
2174 
2175 static int measure_context_switch(struct intel_context *ce)
2176 {
2177 	u32 *sema = hwsp_scratch(ce);
2178 	const u32 offset = hwsp_offset(ce, sema);
2179 	struct i915_request *fence = NULL;
2180 	u32 elapsed[TF_COUNT + 1], cycles;
2181 	int i, j, err;
2182 	u32 *cs;
2183 
2184 	/*
2185 	 * Measure how long it takes to advance from one request in one
2186 	 * context to a request in another context. This allows us to
2187 	 * measure how long the context save/restore take, along with all
2188 	 * measure how long the context save/restore takes, along with all
2189 	 *
2190 	 *    A: read CS_TIMESTAMP on GPU
2191 	 *    switch context
2192 	 *    B: read CS_TIMESTAMP on GPU
2193 	 *
2194 	 * Context switch latency: B - A
2195 	 */
2196 
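	/*
	 * Requests are emitted in pairs, alternating between ce and the
	 * engine's kernel context, and chained with dma-fence waits so
	 * they execute strictly in order behind the plug. Each request
	 * stores a GPU timestamp; every sample is taken across a
	 * kernel-context -> ce boundary, i.e. the delta between the
	 * timestamp of a kernel-context request and that of the ce
	 * request which follows it.
	 */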
2197 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2198 	if (err)
2199 		return err;
2200 
2201 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2202 		struct intel_context *arr[] = {
2203 			ce, ce->engine->kernel_context
2204 		};
2205 		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2206 
2207 		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2208 			struct i915_request *rq;
2209 
2210 			rq = i915_request_create(arr[j]);
2211 			if (IS_ERR(rq)) {
2212 				err = PTR_ERR(rq);
2213 				goto err_fence;
2214 			}
2215 
2216 			if (fence) {
2217 				err = i915_request_await_dma_fence(rq,
2218 								   &fence->fence);
2219 				if (err) {
2220 					i915_request_add(rq);
2221 					goto err_fence;
2222 				}
2223 			}
2224 
2225 			cs = intel_ring_begin(rq, 4);
2226 			if (IS_ERR(cs)) {
2227 				i915_request_add(rq);
2228 				err = PTR_ERR(cs);
2229 				goto err_fence;
2230 			}
2231 
2232 			cs = emit_timestamp_store(cs, ce, addr);
2233 			addr += sizeof(u32);
2234 
2235 			intel_ring_advance(rq, cs);
2236 
2237 			i915_request_put(fence);
2238 			fence = i915_request_get(rq);
2239 
2240 			i915_request_add(rq);
2241 		}
2242 	}
2243 	i915_request_put(fence);
2244 	intel_engine_flush_submission(ce->engine);
2245 
2246 	semaphore_set(sema, 1);
2247 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2248 	if (err)
2249 		goto err;
2250 
2251 	for (i = 1; i <= TF_COUNT; i++)
2252 		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2253 
2254 	cycles = trifilter(elapsed);
2255 	pr_info("%s: context switch latency %d cycles, %lluns\n",
2256 		ce->engine->name, cycles >> TF_BIAS,
2257 		cycles_to_ns(ce->engine, cycles));
2258 
2259 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2260 
2261 err_fence:
2262 	i915_request_put(fence);
2263 	semaphore_set(sema, 1);
2264 err:
2265 	intel_gt_set_wedged(ce->engine->gt);
2266 	return err;
2267 }
2268 
2269 static int measure_preemption(struct intel_context *ce)
2270 {
2271 	u32 *sema = hwsp_scratch(ce);
2272 	const u32 offset = hwsp_offset(ce, sema);
2273 	u32 elapsed[TF_COUNT], cycles;
2274 	u32 *cs;
2275 	int err;
2276 	int i;
2277 
2278 	/*
2279 	 * We measure two latencies while triggering preemption. The first
2280 	 * latency is how long it takes for us to submit a preempting request.
2281 	 * The second latency is how long it takes for us to return from the
2282 	 * preemption back to the original context.
2283 	 *
2284 	 *    A: read CS_TIMESTAMP from CPU
2285 	 *    submit preemption
2286 	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2287 	 *    context switch
2288 	 *    C: read CS_TIMESTAMP on GPU (in original context)
2289 	 *
2290 	 * Preemption dispatch latency: B - A
2291 	 * Preemption switch latency: C - B
2292 	 */
2293 
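	/*
	 * Each iteration first queues a spinner on ce: it marks its slot,
	 * polls the semaphore and, once released, stamps timestamp C.
	 * After waiting for the spinner to start, a kernel-context
	 * request is submitted at I915_PRIORITY_BARRIER so that it
	 * preempts the spinner; it stamps timestamp B and releases the
	 * semaphore, letting the original context resume. A is sampled
	 * from RING_TIMESTAMP on the CPU just before that submission.
	 */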
2294 	if (!intel_engine_has_preemption(ce->engine))
2295 		return 0;
2296 
2297 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2298 		u32 addr = offset + 2 * i * sizeof(u32);
2299 		struct i915_request *rq;
2300 
2301 		rq = i915_request_create(ce);
2302 		if (IS_ERR(rq)) {
2303 			err = PTR_ERR(rq);
2304 			goto err;
2305 		}
2306 
2307 		cs = intel_ring_begin(rq, 12);
2308 		if (IS_ERR(cs)) {
2309 			i915_request_add(rq);
2310 			err = PTR_ERR(cs);
2311 			goto err;
2312 		}
2313 
2314 		cs = emit_store_dw(cs, addr, -1);
2315 		cs = emit_semaphore_poll_until(cs, offset, i);
2316 		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2317 
2318 		intel_ring_advance(rq, cs);
2319 		i915_request_add(rq);
2320 
2321 		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2322 			err = -EIO;
2323 			goto err;
2324 		}
2325 
2326 		rq = i915_request_create(ce->engine->kernel_context);
2327 		if (IS_ERR(rq)) {
2328 			err = PTR_ERR(rq);
2329 			goto err;
2330 		}
2331 
2332 		cs = intel_ring_begin(rq, 8);
2333 		if (IS_ERR(cs)) {
2334 			i915_request_add(rq);
2335 			err = PTR_ERR(cs);
2336 			goto err;
2337 		}
2338 
2339 		cs = emit_timestamp_store(cs, ce, addr);
2340 		cs = emit_store_dw(cs, offset, i);
2341 
2342 		intel_ring_advance(rq, cs);
2343 		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2344 
2345 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2346 		i915_request_add(rq);
2347 	}
2348 
2349 	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2350 		err = -EIO;
2351 		goto err;
2352 	}
2353 
2354 	for (i = 1; i <= TF_COUNT; i++)
2355 		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2356 
2357 	cycles = trifilter(elapsed);
2358 	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2359 		ce->engine->name, cycles >> TF_BIAS,
2360 		cycles_to_ns(ce->engine, cycles));
2361 
2362 	for (i = 1; i <= TF_COUNT; i++)
2363 		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2364 
2365 	cycles = trifilter(elapsed);
2366 	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2367 		ce->engine->name, cycles >> TF_BIAS,
2368 		cycles_to_ns(ce->engine, cycles));
2369 
2370 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2371 
2372 err:
2373 	intel_gt_set_wedged(ce->engine->gt);
2374 	return err;
2375 }
2376 
2377 struct signal_cb {
2378 	struct dma_fence_cb base;
2379 	bool seen;
2380 };
2381 
2382 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2383 {
2384 	struct signal_cb *s = container_of(cb, typeof(*s), base);
2385 
2386 	smp_store_mb(s->seen, true); /* be safe, be strong */
2387 }
2388 
2389 static int measure_completion(struct intel_context *ce)
2390 {
2391 	u32 *sema = hwsp_scratch(ce);
2392 	const u32 offset = hwsp_offset(ce, sema);
2393 	u32 elapsed[TF_COUNT], cycles;
2394 	u32 *cs;
2395 	int err;
2396 	int i;
2397 
2398 	/*
2399 	 * Measure how long it takes for the signal (interrupt) to be
2400 	 * sent from the GPU and processed by the CPU.
2401 	 *
2402 	 *    A: read CS_TIMESTAMP on GPU
2403 	 *    signal
2404 	 *    B: read CS_TIMESTAMP from CPU
2405 	 *
2406 	 * Completion latency: B - A
2407 	 */
2408 
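	/*
	 * Each request marks its slot, spins on the semaphore and then
	 * stores GPU timestamp A as its final instruction. The CPU waits
	 * for the spinner to start, releases it with preemption disabled,
	 * busy-waits for the dma-fence callback to fire and only then
	 * samples RING_TIMESTAMP for B, so B - A covers the interrupt
	 * delivery and fence-signaling path.
	 */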
2409 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2410 		struct signal_cb cb = { .seen = false };
2411 		struct i915_request *rq;
2412 
2413 		rq = i915_request_create(ce);
2414 		if (IS_ERR(rq)) {
2415 			err = PTR_ERR(rq);
2416 			goto err;
2417 		}
2418 
2419 		cs = intel_ring_begin(rq, 12);
2420 		if (IS_ERR(cs)) {
2421 			i915_request_add(rq);
2422 			err = PTR_ERR(cs);
2423 			goto err;
2424 		}
2425 
2426 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2427 		cs = emit_semaphore_poll_until(cs, offset, i);
2428 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2429 
2430 		intel_ring_advance(rq, cs);
2431 
2432 		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2433 		i915_request_add(rq);
2434 
2435 		intel_engine_flush_submission(ce->engine);
2436 		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2437 			err = -EIO;
2438 			goto err;
2439 		}
2440 
2441 		preempt_disable();
2442 		semaphore_set(sema, i);
2443 		while (!READ_ONCE(cb.seen))
2444 			cpu_relax();
2445 
2446 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2447 		preempt_enable();
2448 	}
2449 
2450 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2451 	if (err)
2452 		goto err;
2453 
2454 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2455 		GEM_BUG_ON(sema[i + 1] == -1);
2456 		elapsed[i] = elapsed[i] - sema[i + 1];
2457 	}
2458 
2459 	cycles = trifilter(elapsed);
2460 	pr_info("%s: completion latency %d cycles, %lluns\n",
2461 		ce->engine->name, cycles >> TF_BIAS,
2462 		cycles_to_ns(ce->engine, cycles));
2463 
2464 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2465 
2466 err:
2467 	intel_gt_set_wedged(ce->engine->gt);
2468 	return err;
2469 }
2470 
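/*
 * Hold forcewake and pin the GPU at its maximum frequency while the
 * measurements run, presumably so that the latencies are not inflated by
 * waiting for the frequency to ramp up.
 */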
2471 static void rps_pin(struct intel_gt *gt)
2472 {
2473 	/* Pin the frequency to max */
2474 	atomic_inc(&gt->rps.num_waiters);
2475 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2476 
2477 	mutex_lock(&gt->rps.lock);
2478 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2479 	mutex_unlock(&gt->rps.lock);
2480 }
2481 
2482 static void rps_unpin(struct intel_gt *gt)
2483 {
2484 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2485 	atomic_dec(&gt->rps.num_waiters);
2486 }
2487 
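/*
 * Run each of the measure_*() probes above on every uabi engine. C-states
 * are disabled, the engine heartbeat is paused and the GPU frequency is
 * pinned for the duration, which should keep the measurements stable.
 */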
2488 static int perf_request_latency(void *arg)
2489 {
2490 	struct drm_i915_private *i915 = arg;
2491 	struct intel_engine_cs *engine;
2492 	struct pm_qos_request qos;
2493 	int err = 0;
2494 
2495 	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2496 		return 0;
2497 
2498 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2499 
2500 	for_each_uabi_engine(engine, i915) {
2501 		struct intel_context *ce;
2502 
2503 		ce = intel_context_create(engine);
2504 		if (IS_ERR(ce)) {
2505 			err = PTR_ERR(ce);
2506 			goto out;
2507 		}
2508 
2509 		err = intel_context_pin(ce);
2510 		if (err) {
2511 			intel_context_put(ce);
2512 			goto out;
2513 		}
2514 
2515 		st_engine_heartbeat_disable(engine);
2516 		rps_pin(engine->gt);
2517 
2518 		if (err == 0)
2519 			err = measure_semaphore_response(ce);
2520 		if (err == 0)
2521 			err = measure_idle_dispatch(ce);
2522 		if (err == 0)
2523 			err = measure_busy_dispatch(ce);
2524 		if (err == 0)
2525 			err = measure_inter_request(ce);
2526 		if (err == 0)
2527 			err = measure_context_switch(ce);
2528 		if (err == 0)
2529 			err = measure_preemption(ce);
2530 		if (err == 0)
2531 			err = measure_completion(ce);
2532 
2533 		rps_unpin(engine->gt);
2534 		st_engine_heartbeat_enable(engine);
2535 
2536 		intel_context_unpin(ce);
2537 		intel_context_put(ce);
2538 		if (err)
2539 			goto out;
2540 	}
2541 
2542 out:
2543 	if (igt_flush_test(i915))
2544 		err = -EIO;
2545 
2546 	cpu_latency_qos_remove_request(&qos);
2547 	return err;
2548 }
2549 
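/*
 * The s_*() workloads below drive all the pinned contexts in turn from a
 * single thread: s_sync0() waits for each request before emitting the next,
 * s_sync1() keeps one request in flight by waiting on the previous request,
 * and s_many() simply queues requests as fast as it can.
 */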
2550 static int s_sync0(void *arg)
2551 {
2552 	struct perf_series *ps = arg;
2553 	IGT_TIMEOUT(end_time);
2554 	unsigned int idx = 0;
2555 	int err = 0;
2556 
2557 	GEM_BUG_ON(!ps->nengines);
2558 	do {
2559 		struct i915_request *rq;
2560 
2561 		rq = i915_request_create(ps->ce[idx]);
2562 		if (IS_ERR(rq)) {
2563 			err = PTR_ERR(rq);
2564 			break;
2565 		}
2566 
2567 		i915_request_get(rq);
2568 		i915_request_add(rq);
2569 
2570 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2571 			err = -ETIME;
2572 		i915_request_put(rq);
2573 		if (err)
2574 			break;
2575 
2576 		if (++idx == ps->nengines)
2577 			idx = 0;
2578 	} while (!__igt_timeout(end_time, NULL));
2579 
2580 	return err;
2581 }
2582 
2583 static int s_sync1(void *arg)
2584 {
2585 	struct perf_series *ps = arg;
2586 	struct i915_request *prev = NULL;
2587 	IGT_TIMEOUT(end_time);
2588 	unsigned int idx = 0;
2589 	int err = 0;
2590 
2591 	GEM_BUG_ON(!ps->nengines);
2592 	do {
2593 		struct i915_request *rq;
2594 
2595 		rq = i915_request_create(ps->ce[idx]);
2596 		if (IS_ERR(rq)) {
2597 			err = PTR_ERR(rq);
2598 			break;
2599 		}
2600 
2601 		i915_request_get(rq);
2602 		i915_request_add(rq);
2603 
2604 		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2605 			err = -ETIME;
2606 		i915_request_put(prev);
2607 		prev = rq;
2608 		if (err)
2609 			break;
2610 
2611 		if (++idx == ps->nengines)
2612 			idx = 0;
2613 	} while (!__igt_timeout(end_time, NULL));
2614 	i915_request_put(prev);
2615 
2616 	return err;
2617 }
2618 
2619 static int s_many(void *arg)
2620 {
2621 	struct perf_series *ps = arg;
2622 	IGT_TIMEOUT(end_time);
2623 	unsigned int idx = 0;
2624 
2625 	GEM_BUG_ON(!ps->nengines);
2626 	do {
2627 		struct i915_request *rq;
2628 
2629 		rq = i915_request_create(ps->ce[idx]);
2630 		if (IS_ERR(rq))
2631 			return PTR_ERR(rq);
2632 
2633 		i915_request_add(rq);
2634 
2635 		if (++idx == ps->nengines)
2636 			idx = 0;
2637 	} while (!__igt_timeout(end_time, NULL));
2638 
2639 	return 0;
2640 }
2641 
2642 static int perf_series_engines(void *arg)
2643 {
2644 	struct drm_i915_private *i915 = arg;
2645 	static int (* const func[])(void *arg) = {
2646 		s_sync0,
2647 		s_sync1,
2648 		s_many,
2649 		NULL,
2650 	};
2651 	const unsigned int nengines = num_uabi_engines(i915);
2652 	struct intel_engine_cs *engine;
2653 	int (* const *fn)(void *arg);
2654 	struct pm_qos_request qos;
2655 	struct perf_stats *stats;
2656 	struct perf_series *ps;
2657 	unsigned int idx;
2658 	int err = 0;
2659 
2660 	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2661 	if (!stats)
2662 		return -ENOMEM;
2663 
2664 	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2665 	if (!ps) {
2666 		kfree(stats);
2667 		return -ENOMEM;
2668 	}
2669 
2670 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2671 
2672 	ps->i915 = i915;
2673 	ps->nengines = nengines;
2674 
2675 	idx = 0;
2676 	for_each_uabi_engine(engine, i915) {
2677 		struct intel_context *ce;
2678 
2679 		ce = intel_context_create(engine);
2680 		if (IS_ERR(ce)) {
2681 			err = PTR_ERR(ce);
2682 			goto out;
2683 		}
2684 
2685 		err = intel_context_pin(ce);
2686 		if (err) {
2687 			intel_context_put(ce);
2688 			goto out;
2689 		}
2690 
2691 		ps->ce[idx++] = ce;
2692 	}
2693 	GEM_BUG_ON(idx != ps->nengines);
2694 
2695 	for (fn = func; *fn && !err; fn++) {
2696 		char name[KSYM_NAME_LEN];
2697 		struct igt_live_test t;
2698 
2699 		snprintf(name, sizeof(name), "%ps", *fn);
2700 		err = igt_live_test_begin(&t, i915, __func__, name);
2701 		if (err)
2702 			break;
2703 
2704 		for (idx = 0; idx < nengines; idx++) {
2705 			struct perf_stats *p =
2706 				memset(&stats[idx], 0, sizeof(stats[idx]));
2707 			struct intel_context *ce = ps->ce[idx];
2708 
2709 			p->engine = ps->ce[idx]->engine;
2710 			intel_engine_pm_get(p->engine);
2711 
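			/*
			 * Bias the sampled busy time by +1 so that a valid
			 * zero reading can be distinguished from "stats not
			 * supported"; the bias is removed again when the
			 * delta is computed below.
			 */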
2712 			if (intel_engine_supports_stats(p->engine))
2713 				p->busy = intel_engine_get_busy_time(p->engine,
2714 								     &p->time) + 1;
2715 			else
2716 				p->time = ktime_get();
2717 			p->runtime = -intel_context_get_total_runtime_ns(ce);
2718 		}
2719 
2720 		err = (*fn)(ps);
2721 		if (igt_live_test_end(&t))
2722 			err = -EIO;
2723 
2724 		for (idx = 0; idx < nengines; idx++) {
2725 			struct perf_stats *p = &stats[idx];
2726 			struct intel_context *ce = ps->ce[idx];
2727 			int integer, decimal;
2728 			u64 busy, dt, now;
2729 
2730 			if (p->busy)
2731 				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2732 									       &now),
2733 						    p->busy - 1);
2734 			else
2735 				now = ktime_get();
2736 			p->time = ktime_sub(now, p->time);
2737 
2738 			err = switch_to_kernel_sync(ce, err);
2739 			p->runtime += intel_context_get_total_runtime_ns(ce);
2740 			intel_engine_pm_put(p->engine);
2741 
2742 			busy = 100 * ktime_to_ns(p->busy);
2743 			dt = ktime_to_ns(p->time);
2744 			if (dt) {
2745 				integer = div64_u64(busy, dt);
2746 				busy -= integer * dt;
2747 				decimal = div64_u64(100 * busy, dt);
2748 			} else {
2749 				integer = 0;
2750 				decimal = 0;
2751 			}
2752 
2753 			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2754 				name, p->engine->name, ce->timeline->seqno,
2755 				integer, decimal,
2756 				div_u64(p->runtime, 1000 * 1000),
2757 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2758 		}
2759 	}
2760 
2761 out:
2762 	for (idx = 0; idx < nengines; idx++) {
2763 		if (IS_ERR_OR_NULL(ps->ce[idx]))
2764 			break;
2765 
2766 		intel_context_unpin(ps->ce[idx]);
2767 		intel_context_put(ps->ce[idx]);
2768 	}
2769 	kfree(ps);
2770 
2771 	cpu_latency_qos_remove_request(&qos);
2772 	kfree(stats);
2773 	return err;
2774 }
2775 
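/*
 * The p_*() workloads mirror the s_*() variants above, but each instance
 * creates its own context on a single engine and is run from its own kthread
 * (see perf_parallel_engines()): p_sync0() waits for every request,
 * p_sync1() keeps one request outstanding, and p_many() just queues requests.
 */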
2776 static int p_sync0(void *arg)
2777 {
2778 	struct perf_stats *p = arg;
2779 	struct intel_engine_cs *engine = p->engine;
2780 	struct intel_context *ce;
2781 	IGT_TIMEOUT(end_time);
2782 	unsigned long count;
2783 	bool busy;
2784 	int err = 0;
2785 
2786 	ce = intel_context_create(engine);
2787 	if (IS_ERR(ce))
2788 		return PTR_ERR(ce);
2789 
2790 	err = intel_context_pin(ce);
2791 	if (err) {
2792 		intel_context_put(ce);
2793 		return err;
2794 	}
2795 
2796 	if (intel_engine_supports_stats(engine)) {
2797 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2798 		busy = true;
2799 	} else {
2800 		p->time = ktime_get();
2801 		busy = false;
2802 	}
2803 
2804 	count = 0;
2805 	do {
2806 		struct i915_request *rq;
2807 
2808 		rq = i915_request_create(ce);
2809 		if (IS_ERR(rq)) {
2810 			err = PTR_ERR(rq);
2811 			break;
2812 		}
2813 
2814 		i915_request_get(rq);
2815 		i915_request_add(rq);
2816 
2817 		err = 0;
2818 		if (i915_request_wait(rq, 0, HZ) < 0)
2819 			err = -ETIME;
2820 		i915_request_put(rq);
2821 		if (err)
2822 			break;
2823 
2824 		count++;
2825 	} while (!__igt_timeout(end_time, NULL));
2826 
2827 	if (busy) {
2828 		ktime_t now;
2829 
2830 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2831 				    p->busy);
2832 		p->time = ktime_sub(now, p->time);
2833 	} else {
2834 		p->time = ktime_sub(ktime_get(), p->time);
2835 	}
2836 
2837 	err = switch_to_kernel_sync(ce, err);
2838 	p->runtime = intel_context_get_total_runtime_ns(ce);
2839 	p->count = count;
2840 
2841 	intel_context_unpin(ce);
2842 	intel_context_put(ce);
2843 	return err;
2844 }
2845 
2846 static int p_sync1(void *arg)
2847 {
2848 	struct perf_stats *p = arg;
2849 	struct intel_engine_cs *engine = p->engine;
2850 	struct i915_request *prev = NULL;
2851 	struct intel_context *ce;
2852 	IGT_TIMEOUT(end_time);
2853 	unsigned long count;
2854 	bool busy;
2855 	int err = 0;
2856 
2857 	ce = intel_context_create(engine);
2858 	if (IS_ERR(ce))
2859 		return PTR_ERR(ce);
2860 
2861 	err = intel_context_pin(ce);
2862 	if (err) {
2863 		intel_context_put(ce);
2864 		return err;
2865 	}
2866 
2867 	if (intel_engine_supports_stats(engine)) {
2868 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2869 		busy = true;
2870 	} else {
2871 		p->time = ktime_get();
2872 		busy = false;
2873 	}
2874 
2875 	count = 0;
2876 	do {
2877 		struct i915_request *rq;
2878 
2879 		rq = i915_request_create(ce);
2880 		if (IS_ERR(rq)) {
2881 			err = PTR_ERR(rq);
2882 			break;
2883 		}
2884 
2885 		i915_request_get(rq);
2886 		i915_request_add(rq);
2887 
2888 		err = 0;
2889 		if (prev && i915_request_wait(prev, 0, HZ) < 0)
2890 			err = -ETIME;
2891 		i915_request_put(prev);
2892 		prev = rq;
2893 		if (err)
2894 			break;
2895 
2896 		count++;
2897 	} while (!__igt_timeout(end_time, NULL));
2898 	i915_request_put(prev);
2899 
2900 	if (busy) {
2901 		ktime_t now;
2902 
2903 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2904 				    p->busy);
2905 		p->time = ktime_sub(now, p->time);
2906 	} else {
2907 		p->time = ktime_sub(ktime_get(), p->time);
2908 	}
2909 
2910 	err = switch_to_kernel_sync(ce, err);
2911 	p->runtime = intel_context_get_total_runtime_ns(ce);
2912 	p->count = count;
2913 
2914 	intel_context_unpin(ce);
2915 	intel_context_put(ce);
2916 	return err;
2917 }
2918 
2919 static int p_many(void *arg)
2920 {
2921 	struct perf_stats *p = arg;
2922 	struct intel_engine_cs *engine = p->engine;
2923 	struct intel_context *ce;
2924 	IGT_TIMEOUT(end_time);
2925 	unsigned long count;
2926 	int err = 0;
2927 	bool busy;
2928 
2929 	ce = intel_context_create(engine);
2930 	if (IS_ERR(ce))
2931 		return PTR_ERR(ce);
2932 
2933 	err = intel_context_pin(ce);
2934 	if (err) {
2935 		intel_context_put(ce);
2936 		return err;
2937 	}
2938 
2939 	if (intel_engine_supports_stats(engine)) {
2940 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2941 		busy = true;
2942 	} else {
2943 		p->time = ktime_get();
2944 		busy = false;
2945 	}
2946 
2947 	count = 0;
2948 	do {
2949 		struct i915_request *rq;
2950 
2951 		rq = i915_request_create(ce);
2952 		if (IS_ERR(rq)) {
2953 			err = PTR_ERR(rq);
2954 			break;
2955 		}
2956 
2957 		i915_request_add(rq);
2958 		count++;
2959 	} while (!__igt_timeout(end_time, NULL));
2960 
2961 	if (busy) {
2962 		ktime_t now;
2963 
2964 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2965 				    p->busy);
2966 		p->time = ktime_sub(now, p->time);
2967 	} else {
2968 		p->time = ktime_sub(ktime_get(), p->time);
2969 	}
2970 
2971 	err = switch_to_kernel_sync(ce, err);
2972 	p->runtime = intel_context_get_total_runtime_ns(ce);
2973 	p->count = count;
2974 
2975 	intel_context_unpin(ce);
2976 	intel_context_put(ce);
2977 	return err;
2978 }
2979 
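/*
 * Spawn one kthread per uabi engine, run the selected workload on all
 * engines concurrently, then report per-engine request count, busyness and
 * runtime.
 */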
2980 static int perf_parallel_engines(void *arg)
2981 {
2982 	struct drm_i915_private *i915 = arg;
2983 	static int (* const func[])(void *arg) = {
2984 		p_sync0,
2985 		p_sync1,
2986 		p_many,
2987 		NULL,
2988 	};
2989 	const unsigned int nengines = num_uabi_engines(i915);
2990 	struct intel_engine_cs *engine;
2991 	int (* const *fn)(void *arg);
2992 	struct pm_qos_request qos;
2993 	struct {
2994 		struct perf_stats p;
2995 		struct task_struct *tsk;
2996 	} *engines;
2997 	int err = 0;
2998 
2999 	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3000 	if (!engines)
3001 		return -ENOMEM;
3002 
3003 	cpu_latency_qos_add_request(&qos, 0);
3004 
3005 	for (fn = func; *fn; fn++) {
3006 		char name[KSYM_NAME_LEN];
3007 		struct igt_live_test t;
3008 		unsigned int idx;
3009 
3010 		snprintf(name, sizeof(name), "%ps", *fn);
3011 		err = igt_live_test_begin(&t, i915, __func__, name);
3012 		if (err)
3013 			break;
3014 
3015 		atomic_set(&i915->selftest.counter, nengines);
3016 
3017 		idx = 0;
3018 		for_each_uabi_engine(engine, i915) {
3019 			intel_engine_pm_get(engine);
3020 
3021 			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3022 			engines[idx].p.engine = engine;
3023 
3024 			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3025 						       "igt:%s", engine->name);
3026 			if (IS_ERR(engines[idx].tsk)) {
3027 				err = PTR_ERR(engines[idx].tsk);
3028 				intel_engine_pm_put(engine);
3029 				break;
3030 			}
3031 			get_task_struct(engines[idx++].tsk);
3032 		}
3033 
3034 		yield(); /* start all threads before we kthread_stop() */
3035 
3036 		idx = 0;
3037 		for_each_uabi_engine(engine, i915) {
3038 			int status;
3039 
3040 			if (IS_ERR(engines[idx].tsk))
3041 				break;
3042 
3043 			status = kthread_stop(engines[idx].tsk);
3044 			if (status && !err)
3045 				err = status;
3046 
3047 			intel_engine_pm_put(engine);
3048 			put_task_struct(engines[idx++].tsk);
3049 		}
3050 
3051 		if (igt_live_test_end(&t))
3052 			err = -EIO;
3053 		if (err)
3054 			break;
3055 
3056 		idx = 0;
3057 		for_each_uabi_engine(engine, i915) {
3058 			struct perf_stats *p = &engines[idx].p;
3059 			u64 busy = 100 * ktime_to_ns(p->busy);
3060 			u64 dt = ktime_to_ns(p->time);
3061 			int integer, decimal;
3062 
3063 			if (dt) {
3064 				integer = div64_u64(busy, dt);
3065 				busy -= integer * dt;
3066 				decimal = div64_u64(100 * busy, dt);
3067 			} else {
3068 				integer = 0;
3069 				decimal = 0;
3070 			}
3071 
3072 			GEM_BUG_ON(engine != p->engine);
3073 			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3074 				name, engine->name, p->count, integer, decimal,
3075 				div_u64(p->runtime, 1000 * 1000),
3076 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3077 			idx++;
3078 		}
3079 	}
3080 
3081 	cpu_latency_qos_remove_request(&qos);
3082 	kfree(engines);
3083 	return err;
3084 }
3085 
3086 int i915_request_perf_selftests(struct drm_i915_private *i915)
3087 {
3088 	static const struct i915_subtest tests[] = {
3089 		SUBTEST(perf_request_latency),
3090 		SUBTEST(perf_series_engines),
3091 		SUBTEST(perf_parallel_engines),
3092 	};
3093 
3094 	if (intel_gt_is_wedged(to_gt(i915)))
3095 		return 0;
3096 
3097 	return i915_subtests(tests, i915);
3098 }
3099