xref: /linux/drivers/gpu/drm/i915/selftests/i915_request.c (revision 4d5e3b06e1fc1428be14cd4ebe3b37c1bb34f95d)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28 
29 #include "gem/i915_gem_internal.h"
30 #include "gem/i915_gem_pm.h"
31 #include "gem/selftests/mock_context.h"
32 
33 #include "gt/intel_engine_heartbeat.h"
34 #include "gt/intel_engine_pm.h"
35 #include "gt/intel_engine_user.h"
36 #include "gt/intel_gt.h"
37 #include "gt/intel_gt_clock_utils.h"
38 #include "gt/intel_gt_requests.h"
39 #include "gt/selftest_engine_heartbeat.h"
40 
41 #include "i915_random.h"
42 #include "i915_selftest.h"
43 #include "igt_flush_test.h"
44 #include "igt_live_test.h"
45 #include "igt_spinner.h"
46 #include "lib_sw_fence.h"
47 
48 #include "mock_drm.h"
49 #include "mock_gem_device.h"
50 
51 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
52 {
53 	struct intel_engine_cs *engine;
54 	unsigned int count;
55 
56 	count = 0;
57 	for_each_uabi_engine(engine, i915)
58 		count++;
59 
60 	return count;
61 }
62 
63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
64 {
65 	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
66 }
67 
68 static int igt_add_request(void *arg)
69 {
70 	struct drm_i915_private *i915 = arg;
71 	struct i915_request *request;
72 
73 	/* Basic preliminary test to create a request and let it loose! */
74 
75 	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
76 	if (!request)
77 		return -ENOMEM;
78 
79 	i915_request_add(request);
80 
81 	return 0;
82 }
83 
84 static int igt_wait_request(void *arg)
85 {
86 	const long T = HZ / 4;
87 	struct drm_i915_private *i915 = arg;
88 	struct i915_request *request;
89 	int err = -EINVAL;
90 
91 	/* Submit a request, then wait upon it */
92 
93 	request = mock_request(rcs0(i915)->kernel_context, T);
94 	if (!request)
95 		return -ENOMEM;
96 
97 	i915_request_get(request);
98 
99 	if (i915_request_wait(request, 0, 0) != -ETIME) {
100 		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
101 		goto out_request;
102 	}
103 
104 	if (i915_request_wait(request, 0, T) != -ETIME) {
105 		pr_err("request wait succeeded (expected timeout before submit!)\n");
106 		goto out_request;
107 	}
108 
109 	if (i915_request_completed(request)) {
110 		pr_err("request completed before submit!!\n");
111 		goto out_request;
112 	}
113 
114 	i915_request_add(request);
115 
116 	if (i915_request_wait(request, 0, 0) != -ETIME) {
117 		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
118 		goto out_request;
119 	}
120 
121 	if (i915_request_completed(request)) {
122 		pr_err("request completed immediately!\n");
123 		goto out_request;
124 	}
125 
126 	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
127 		pr_err("request wait succeeded (expected timeout!)\n");
128 		goto out_request;
129 	}
130 
131 	if (i915_request_wait(request, 0, T) == -ETIME) {
132 		pr_err("request wait timed out!\n");
133 		goto out_request;
134 	}
135 
136 	if (!i915_request_completed(request)) {
137 		pr_err("request not complete after waiting!\n");
138 		goto out_request;
139 	}
140 
141 	if (i915_request_wait(request, 0, T) == -ETIME) {
142 		pr_err("request wait timed out when already complete!\n");
143 		goto out_request;
144 	}
145 
146 	err = 0;
147 out_request:
148 	i915_request_put(request);
149 	mock_device_flush(i915);
150 	return err;
151 }
152 
153 static int igt_fence_wait(void *arg)
154 {
155 	const long T = HZ / 4;
156 	struct drm_i915_private *i915 = arg;
157 	struct i915_request *request;
158 	int err = -EINVAL;
159 
160 	/* Submit a request, treat it as a fence and wait upon it */
161 
162 	request = mock_request(rcs0(i915)->kernel_context, T);
163 	if (!request)
164 		return -ENOMEM;
165 
166 	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
167 		pr_err("fence wait success before submit (expected timeout)!\n");
168 		goto out;
169 	}
170 
171 	i915_request_add(request);
172 
173 	if (dma_fence_is_signaled(&request->fence)) {
174 		pr_err("fence signaled immediately!\n");
175 		goto out;
176 	}
177 
178 	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
179 		pr_err("fence wait success after submit (expected timeout)!\n");
180 		goto out;
181 	}
182 
183 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
184 		pr_err("fence wait timed out (expected success)!\n");
185 		goto out;
186 	}
187 
188 	if (!dma_fence_is_signaled(&request->fence)) {
189 		pr_err("fence unsignaled after waiting!\n");
190 		goto out;
191 	}
192 
193 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
194 		pr_err("fence wait timed out when complete (expected success)!\n");
195 		goto out;
196 	}
197 
198 	err = 0;
199 out:
200 	mock_device_flush(i915);
201 	return err;
202 }
203 
204 static int igt_request_rewind(void *arg)
205 {
206 	struct drm_i915_private *i915 = arg;
207 	struct i915_request *request, *vip;
208 	struct i915_gem_context *ctx[2];
209 	struct intel_context *ce;
210 	int err = -EINVAL;
211 
212 	ctx[0] = mock_context(i915, "A");
213 	if (!ctx[0]) {
214 		err = -ENOMEM;
215 		goto err_ctx_0;
216 	}
217 
218 	ce = i915_gem_context_get_engine(ctx[0], RCS0);
219 	GEM_BUG_ON(IS_ERR(ce));
220 	request = mock_request(ce, 2 * HZ);
221 	intel_context_put(ce);
222 	if (!request) {
223 		err = -ENOMEM;
224 		goto err_context_0;
225 	}
226 
227 	i915_request_get(request);
228 	i915_request_add(request);
229 
230 	ctx[1] = mock_context(i915, "B");
231 	if (!ctx[1]) {
232 		err = -ENOMEM;
233 		goto err_ctx_1;
234 	}
235 
236 	ce = i915_gem_context_get_engine(ctx[1], RCS0);
237 	GEM_BUG_ON(IS_ERR(ce));
238 	vip = mock_request(ce, 0);
239 	intel_context_put(ce);
240 	if (!vip) {
241 		err = -ENOMEM;
242 		goto err_context_1;
243 	}
244 
245 	/* Simulate preemption by manual reordering */
246 	if (!mock_cancel_request(request)) {
247 		pr_err("failed to cancel request (already executed)!\n");
248 		i915_request_add(vip);
249 		goto err_context_1;
250 	}
251 	i915_request_get(vip);
252 	i915_request_add(vip);
253 	rcu_read_lock();
254 	request->engine->submit_request(request);
255 	rcu_read_unlock();
256 
257 
258 	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
259 		pr_err("timed out waiting for high priority request\n");
260 		goto err;
261 	}
262 
263 	if (i915_request_completed(request)) {
264 		pr_err("low priority request already completed\n");
265 		goto err;
266 	}
267 
268 	err = 0;
269 err:
270 	i915_request_put(vip);
271 err_context_1:
272 	mock_context_close(ctx[1]);
273 err_ctx_1:
274 	i915_request_put(request);
275 err_context_0:
276 	mock_context_close(ctx[0]);
277 err_ctx_0:
278 	mock_device_flush(i915);
279 	return err;
280 }
281 
282 struct smoketest {
283 	struct intel_engine_cs *engine;
284 	struct i915_gem_context **contexts;
285 	atomic_long_t num_waits, num_fences;
286 	int ncontexts, max_batch;
287 	struct i915_request *(*request_alloc)(struct intel_context *ce);
288 };
289 
290 static struct i915_request *
291 __mock_request_alloc(struct intel_context *ce)
292 {
293 	return mock_request(ce, 0);
294 }
295 
/*
 * smoketest::request_alloc backend for real hardware: hand back whatever
 * intel_context_create_request() returns for this context.
 */
static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	struct i915_request *rq = intel_context_create_request(ce);

	return rq;
}
301 
302 static int __igt_breadcrumbs_smoketest(void *arg)
303 {
304 	struct smoketest *t = arg;
305 	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
306 	const unsigned int total = 4 * t->ncontexts + 1;
307 	unsigned int num_waits = 0, num_fences = 0;
308 	struct i915_request **requests;
309 	I915_RND_STATE(prng);
310 	unsigned int *order;
311 	int err = 0;
312 
313 	/*
314 	 * A very simple test to catch the most egregious of list handling bugs.
315 	 *
316 	 * At its heart, we simply create oodles of requests running across
317 	 * multiple kthreads and enable signaling on them, for the sole purpose
318 	 * of stressing our breadcrumb handling. The only inspection we do is
319 	 * that the fences were marked as signaled.
320 	 */
321 
322 	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
323 	if (!requests)
324 		return -ENOMEM;
325 
326 	order = i915_random_order(total, &prng);
327 	if (!order) {
328 		err = -ENOMEM;
329 		goto out_requests;
330 	}
331 
332 	while (!kthread_should_stop()) {
333 		struct i915_sw_fence *submit, *wait;
334 		unsigned int n, count;
335 
336 		submit = heap_fence_create(GFP_KERNEL);
337 		if (!submit) {
338 			err = -ENOMEM;
339 			break;
340 		}
341 
342 		wait = heap_fence_create(GFP_KERNEL);
343 		if (!wait) {
344 			i915_sw_fence_commit(submit);
345 			heap_fence_put(submit);
346 			err = -ENOMEM;
347 			break;
348 		}
349 
350 		i915_random_reorder(order, total, &prng);
351 		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
352 
353 		for (n = 0; n < count; n++) {
354 			struct i915_gem_context *ctx =
355 				t->contexts[order[n] % t->ncontexts];
356 			struct i915_request *rq;
357 			struct intel_context *ce;
358 
359 			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
360 			GEM_BUG_ON(IS_ERR(ce));
361 			rq = t->request_alloc(ce);
362 			intel_context_put(ce);
363 			if (IS_ERR(rq)) {
364 				err = PTR_ERR(rq);
365 				count = n;
366 				break;
367 			}
368 
369 			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
370 							       submit,
371 							       GFP_KERNEL);
372 
373 			requests[n] = i915_request_get(rq);
374 			i915_request_add(rq);
375 
376 			if (err >= 0)
377 				err = i915_sw_fence_await_dma_fence(wait,
378 								    &rq->fence,
379 								    0,
380 								    GFP_KERNEL);
381 
382 			if (err < 0) {
383 				i915_request_put(rq);
384 				count = n;
385 				break;
386 			}
387 		}
388 
389 		i915_sw_fence_commit(submit);
390 		i915_sw_fence_commit(wait);
391 
392 		if (!wait_event_timeout(wait->wait,
393 					i915_sw_fence_done(wait),
394 					5 * HZ)) {
395 			struct i915_request *rq = requests[count - 1];
396 
397 			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
398 			       atomic_read(&wait->pending), count,
399 			       rq->fence.context, rq->fence.seqno,
400 			       t->engine->name);
401 			GEM_TRACE_DUMP();
402 
403 			intel_gt_set_wedged(t->engine->gt);
404 			GEM_BUG_ON(!i915_request_completed(rq));
405 			i915_sw_fence_wait(wait);
406 			err = -EIO;
407 		}
408 
409 		for (n = 0; n < count; n++) {
410 			struct i915_request *rq = requests[n];
411 
412 			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
413 				      &rq->fence.flags)) {
414 				pr_err("%llu:%llu was not signaled!\n",
415 				       rq->fence.context, rq->fence.seqno);
416 				err = -EINVAL;
417 			}
418 
419 			i915_request_put(rq);
420 		}
421 
422 		heap_fence_put(wait);
423 		heap_fence_put(submit);
424 
425 		if (err < 0)
426 			break;
427 
428 		num_fences += count;
429 		num_waits++;
430 
431 		cond_resched();
432 	}
433 
434 	atomic_long_add(num_fences, &t->num_fences);
435 	atomic_long_add(num_waits, &t->num_waits);
436 
437 	kfree(order);
438 out_requests:
439 	kfree(requests);
440 	return err;
441 }
442 
443 static int mock_breadcrumbs_smoketest(void *arg)
444 {
445 	struct drm_i915_private *i915 = arg;
446 	struct smoketest t = {
447 		.engine = rcs0(i915),
448 		.ncontexts = 1024,
449 		.max_batch = 1024,
450 		.request_alloc = __mock_request_alloc
451 	};
452 	unsigned int ncpus = num_online_cpus();
453 	struct task_struct **threads;
454 	unsigned int n;
455 	int ret = 0;
456 
457 	/*
458 	 * Smoketest our breadcrumb/signal handling for requests across multiple
459 	 * threads. A very simple test to only catch the most egregious of bugs.
460 	 * See __igt_breadcrumbs_smoketest();
461 	 */
462 
463 	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
464 	if (!threads)
465 		return -ENOMEM;
466 
467 	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
468 	if (!t.contexts) {
469 		ret = -ENOMEM;
470 		goto out_threads;
471 	}
472 
473 	for (n = 0; n < t.ncontexts; n++) {
474 		t.contexts[n] = mock_context(t.engine->i915, "mock");
475 		if (!t.contexts[n]) {
476 			ret = -ENOMEM;
477 			goto out_contexts;
478 		}
479 	}
480 
481 	for (n = 0; n < ncpus; n++) {
482 		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
483 					 &t, "igt/%d", n);
484 		if (IS_ERR(threads[n])) {
485 			ret = PTR_ERR(threads[n]);
486 			ncpus = n;
487 			break;
488 		}
489 
490 		get_task_struct(threads[n]);
491 	}
492 
493 	yield(); /* start all threads before we begin */
494 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
495 
496 	for (n = 0; n < ncpus; n++) {
497 		int err;
498 
499 		err = kthread_stop(threads[n]);
500 		if (err < 0 && !ret)
501 			ret = err;
502 
503 		put_task_struct(threads[n]);
504 	}
505 	pr_info("Completed %lu waits for %lu fence across %d cpus\n",
506 		atomic_long_read(&t.num_waits),
507 		atomic_long_read(&t.num_fences),
508 		ncpus);
509 
510 out_contexts:
511 	for (n = 0; n < t.ncontexts; n++) {
512 		if (!t.contexts[n])
513 			break;
514 		mock_context_close(t.contexts[n]);
515 	}
516 	kfree(t.contexts);
517 out_threads:
518 	kfree(threads);
519 	return ret;
520 }
521 
522 int i915_request_mock_selftests(void)
523 {
524 	static const struct i915_subtest tests[] = {
525 		SUBTEST(igt_add_request),
526 		SUBTEST(igt_wait_request),
527 		SUBTEST(igt_fence_wait),
528 		SUBTEST(igt_request_rewind),
529 		SUBTEST(mock_breadcrumbs_smoketest),
530 	};
531 	struct drm_i915_private *i915;
532 	intel_wakeref_t wakeref;
533 	int err = 0;
534 
535 	i915 = mock_gem_device();
536 	if (!i915)
537 		return -ENOMEM;
538 
539 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
540 		err = i915_subtests(tests, i915);
541 
542 	mock_destroy_device(i915);
543 
544 	return err;
545 }
546 
547 static int live_nop_request(void *arg)
548 {
549 	struct drm_i915_private *i915 = arg;
550 	struct intel_engine_cs *engine;
551 	struct igt_live_test t;
552 	int err = -ENODEV;
553 
554 	/*
555 	 * Submit various sized batches of empty requests, to each engine
556 	 * (individually), and wait for the batch to complete. We can check
557 	 * the overhead of submitting requests to the hardware.
558 	 */
559 
560 	for_each_uabi_engine(engine, i915) {
561 		unsigned long n, prime;
562 		IGT_TIMEOUT(end_time);
563 		ktime_t times[2] = {};
564 
565 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
566 		if (err)
567 			return err;
568 
569 		intel_engine_pm_get(engine);
570 		for_each_prime_number_from(prime, 1, 8192) {
571 			struct i915_request *request = NULL;
572 
573 			times[1] = ktime_get_raw();
574 
575 			for (n = 0; n < prime; n++) {
576 				i915_request_put(request);
577 				request = i915_request_create(engine->kernel_context);
578 				if (IS_ERR(request))
579 					return PTR_ERR(request);
580 
581 				/*
582 				 * This space is left intentionally blank.
583 				 *
584 				 * We do not actually want to perform any
585 				 * action with this request, we just want
586 				 * to measure the latency in allocation
587 				 * and submission of our breadcrumbs -
588 				 * ensuring that the bare request is sufficient
589 				 * for the system to work (i.e. proper HEAD
590 				 * tracking of the rings, interrupt handling,
591 				 * etc). It also gives us the lowest bounds
592 				 * for latency.
593 				 */
594 
595 				i915_request_get(request);
596 				i915_request_add(request);
597 			}
598 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
599 			i915_request_put(request);
600 
601 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
602 			if (prime == 1)
603 				times[0] = times[1];
604 
605 			if (__igt_timeout(end_time, NULL))
606 				break;
607 		}
608 		intel_engine_pm_put(engine);
609 
610 		err = igt_live_test_end(&t);
611 		if (err)
612 			return err;
613 
614 		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
615 			engine->name,
616 			ktime_to_ns(times[0]),
617 			prime, div64_u64(ktime_to_ns(times[1]), prime));
618 	}
619 
620 	return err;
621 }
622 
623 static int __cancel_inactive(struct intel_engine_cs *engine)
624 {
625 	struct intel_context *ce;
626 	struct igt_spinner spin;
627 	struct i915_request *rq;
628 	int err = 0;
629 
630 	if (igt_spinner_init(&spin, engine->gt))
631 		return -ENOMEM;
632 
633 	ce = intel_context_create(engine);
634 	if (IS_ERR(ce)) {
635 		err = PTR_ERR(ce);
636 		goto out_spin;
637 	}
638 
639 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
640 	if (IS_ERR(rq)) {
641 		err = PTR_ERR(rq);
642 		goto out_ce;
643 	}
644 
645 	pr_debug("%s: Cancelling inactive request\n", engine->name);
646 	i915_request_cancel(rq, -EINTR);
647 	i915_request_get(rq);
648 	i915_request_add(rq);
649 
650 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
651 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
652 
653 		pr_err("%s: Failed to cancel inactive request\n", engine->name);
654 		intel_engine_dump(engine, &p, "%s\n", engine->name);
655 		err = -ETIME;
656 		goto out_rq;
657 	}
658 
659 	if (rq->fence.error != -EINTR) {
660 		pr_err("%s: fence not cancelled (%u)\n",
661 		       engine->name, rq->fence.error);
662 		err = -EINVAL;
663 	}
664 
665 out_rq:
666 	i915_request_put(rq);
667 out_ce:
668 	intel_context_put(ce);
669 out_spin:
670 	igt_spinner_fini(&spin);
671 	if (err)
672 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
673 	return err;
674 }
675 
676 static int __cancel_active(struct intel_engine_cs *engine)
677 {
678 	struct intel_context *ce;
679 	struct igt_spinner spin;
680 	struct i915_request *rq;
681 	int err = 0;
682 
683 	if (igt_spinner_init(&spin, engine->gt))
684 		return -ENOMEM;
685 
686 	ce = intel_context_create(engine);
687 	if (IS_ERR(ce)) {
688 		err = PTR_ERR(ce);
689 		goto out_spin;
690 	}
691 
692 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
693 	if (IS_ERR(rq)) {
694 		err = PTR_ERR(rq);
695 		goto out_ce;
696 	}
697 
698 	pr_debug("%s: Cancelling active request\n", engine->name);
699 	i915_request_get(rq);
700 	i915_request_add(rq);
701 	if (!igt_wait_for_spinner(&spin, rq)) {
702 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
703 
704 		pr_err("Failed to start spinner on %s\n", engine->name);
705 		intel_engine_dump(engine, &p, "%s\n", engine->name);
706 		err = -ETIME;
707 		goto out_rq;
708 	}
709 	i915_request_cancel(rq, -EINTR);
710 
711 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
712 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
713 
714 		pr_err("%s: Failed to cancel active request\n", engine->name);
715 		intel_engine_dump(engine, &p, "%s\n", engine->name);
716 		err = -ETIME;
717 		goto out_rq;
718 	}
719 
720 	if (rq->fence.error != -EINTR) {
721 		pr_err("%s: fence not cancelled (%u)\n",
722 		       engine->name, rq->fence.error);
723 		err = -EINVAL;
724 	}
725 
726 out_rq:
727 	i915_request_put(rq);
728 out_ce:
729 	intel_context_put(ce);
730 out_spin:
731 	igt_spinner_fini(&spin);
732 	if (err)
733 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
734 	return err;
735 }
736 
737 static int __cancel_completed(struct intel_engine_cs *engine)
738 {
739 	struct intel_context *ce;
740 	struct igt_spinner spin;
741 	struct i915_request *rq;
742 	int err = 0;
743 
744 	if (igt_spinner_init(&spin, engine->gt))
745 		return -ENOMEM;
746 
747 	ce = intel_context_create(engine);
748 	if (IS_ERR(ce)) {
749 		err = PTR_ERR(ce);
750 		goto out_spin;
751 	}
752 
753 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
754 	if (IS_ERR(rq)) {
755 		err = PTR_ERR(rq);
756 		goto out_ce;
757 	}
758 	igt_spinner_end(&spin);
759 	i915_request_get(rq);
760 	i915_request_add(rq);
761 
762 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
763 		err = -ETIME;
764 		goto out_rq;
765 	}
766 
767 	pr_debug("%s: Cancelling completed request\n", engine->name);
768 	i915_request_cancel(rq, -EINTR);
769 	if (rq->fence.error) {
770 		pr_err("%s: fence not cancelled (%u)\n",
771 		       engine->name, rq->fence.error);
772 		err = -EINVAL;
773 	}
774 
775 out_rq:
776 	i915_request_put(rq);
777 out_ce:
778 	intel_context_put(ce);
779 out_spin:
780 	igt_spinner_fini(&spin);
781 	if (err)
782 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
783 	return err;
784 }
785 
786 /*
787  * Test to prove a non-preemptable request can be cancelled and a subsequent
788  * request on the same context can successfully complete after cancellation.
789  *
790  * Testing methodology is to create a non-preemptible request and submit it,
791  * wait for spinner to start, create a NOP request and submit it, cancel the
792  * spinner, wait for spinner to complete and verify it failed with an error,
793  * finally wait for NOP request to complete verify it succeeded without an
794  * error. Preemption timeout also reduced / restored so test runs in a timely
795  * maner.
796  */
797 static int __cancel_reset(struct drm_i915_private *i915,
798 			  struct intel_engine_cs *engine)
799 {
800 	struct intel_context *ce;
801 	struct igt_spinner spin;
802 	struct i915_request *rq, *nop;
803 	unsigned long preempt_timeout_ms;
804 	int err = 0;
805 
806 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
807 	    !intel_has_reset_engine(engine->gt))
808 		return 0;
809 
810 	preempt_timeout_ms = engine->props.preempt_timeout_ms;
811 	engine->props.preempt_timeout_ms = 100;
812 
813 	if (igt_spinner_init(&spin, engine->gt))
814 		goto out_restore;
815 
816 	ce = intel_context_create(engine);
817 	if (IS_ERR(ce)) {
818 		err = PTR_ERR(ce);
819 		goto out_spin;
820 	}
821 
822 	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
823 	if (IS_ERR(rq)) {
824 		err = PTR_ERR(rq);
825 		goto out_ce;
826 	}
827 
828 	pr_debug("%s: Cancelling active non-preemptable request\n",
829 		 engine->name);
830 	i915_request_get(rq);
831 	i915_request_add(rq);
832 	if (!igt_wait_for_spinner(&spin, rq)) {
833 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
834 
835 		pr_err("Failed to start spinner on %s\n", engine->name);
836 		intel_engine_dump(engine, &p, "%s\n", engine->name);
837 		err = -ETIME;
838 		goto out_rq;
839 	}
840 
841 	nop = intel_context_create_request(ce);
842 	if (IS_ERR(nop))
843 		goto out_rq;
844 	i915_request_get(nop);
845 	i915_request_add(nop);
846 
847 	i915_request_cancel(rq, -EINTR);
848 
849 	if (i915_request_wait(rq, 0, HZ) < 0) {
850 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
851 
852 		pr_err("%s: Failed to cancel hung request\n", engine->name);
853 		intel_engine_dump(engine, &p, "%s\n", engine->name);
854 		err = -ETIME;
855 		goto out_nop;
856 	}
857 
858 	if (rq->fence.error != -EINTR) {
859 		pr_err("%s: fence not cancelled (%u)\n",
860 		       engine->name, rq->fence.error);
861 		err = -EINVAL;
862 		goto out_nop;
863 	}
864 
865 	if (i915_request_wait(nop, 0, HZ) < 0) {
866 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
867 
868 		pr_err("%s: Failed to complete nop request\n", engine->name);
869 		intel_engine_dump(engine, &p, "%s\n", engine->name);
870 		err = -ETIME;
871 		goto out_nop;
872 	}
873 
874 	if (nop->fence.error != 0) {
875 		pr_err("%s: Nop request errored (%u)\n",
876 		       engine->name, nop->fence.error);
877 		err = -EINVAL;
878 	}
879 
880 out_nop:
881 	i915_request_put(nop);
882 out_rq:
883 	i915_request_put(rq);
884 out_ce:
885 	intel_context_put(ce);
886 out_spin:
887 	igt_spinner_fini(&spin);
888 out_restore:
889 	engine->props.preempt_timeout_ms = preempt_timeout_ms;
890 	if (err)
891 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
892 	return err;
893 }
894 
895 static int live_cancel_request(void *arg)
896 {
897 	struct drm_i915_private *i915 = arg;
898 	struct intel_engine_cs *engine;
899 
900 	/*
901 	 * Check cancellation of requests. We expect to be able to immediately
902 	 * cancel active requests, even if they are currently on the GPU.
903 	 */
904 
905 	for_each_uabi_engine(engine, i915) {
906 		struct igt_live_test t;
907 		int err, err2;
908 
909 		if (!intel_engine_has_preemption(engine))
910 			continue;
911 
912 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
913 		if (err)
914 			return err;
915 
916 		err = __cancel_inactive(engine);
917 		if (err == 0)
918 			err = __cancel_active(engine);
919 		if (err == 0)
920 			err = __cancel_completed(engine);
921 
922 		err2 = igt_live_test_end(&t);
923 		if (err)
924 			return err;
925 		if (err2)
926 			return err2;
927 
928 		/* Expects reset so call outside of igt_live_test_* */
929 		err = __cancel_reset(i915, engine);
930 		if (err)
931 			return err;
932 
933 		if (igt_flush_test(i915))
934 			return -EIO;
935 	}
936 
937 	return 0;
938 }
939 
940 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
941 {
942 	struct drm_i915_gem_object *obj;
943 	struct i915_vma *vma;
944 	u32 *cmd;
945 	int err;
946 
947 	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
948 	if (IS_ERR(obj))
949 		return ERR_CAST(obj);
950 
951 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
952 	if (IS_ERR(cmd)) {
953 		err = PTR_ERR(cmd);
954 		goto err;
955 	}
956 
957 	*cmd = MI_BATCH_BUFFER_END;
958 
959 	__i915_gem_object_flush_map(obj, 0, 64);
960 	i915_gem_object_unpin_map(obj);
961 
962 	intel_gt_chipset_flush(to_gt(i915));
963 
964 	vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL);
965 	if (IS_ERR(vma)) {
966 		err = PTR_ERR(vma);
967 		goto err;
968 	}
969 
970 	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
971 	if (err)
972 		goto err;
973 
974 	/* Force the wait wait now to avoid including it in the benchmark */
975 	err = i915_vma_sync(vma);
976 	if (err)
977 		goto err_pin;
978 
979 	return vma;
980 
981 err_pin:
982 	i915_vma_unpin(vma);
983 err:
984 	i915_gem_object_put(obj);
985 	return ERR_PTR(err);
986 }
987 
988 static struct i915_request *
989 empty_request(struct intel_engine_cs *engine,
990 	      struct i915_vma *batch)
991 {
992 	struct i915_request *request;
993 	int err;
994 
995 	request = i915_request_create(engine->kernel_context);
996 	if (IS_ERR(request))
997 		return request;
998 
999 	err = engine->emit_bb_start(request,
1000 				    batch->node.start,
1001 				    batch->node.size,
1002 				    I915_DISPATCH_SECURE);
1003 	if (err)
1004 		goto out_request;
1005 
1006 	i915_request_get(request);
1007 out_request:
1008 	i915_request_add(request);
1009 	return err ? ERR_PTR(err) : request;
1010 }
1011 
1012 static int live_empty_request(void *arg)
1013 {
1014 	struct drm_i915_private *i915 = arg;
1015 	struct intel_engine_cs *engine;
1016 	struct igt_live_test t;
1017 	struct i915_vma *batch;
1018 	int err = 0;
1019 
1020 	/*
1021 	 * Submit various sized batches of empty requests, to each engine
1022 	 * (individually), and wait for the batch to complete. We can check
1023 	 * the overhead of submitting requests to the hardware.
1024 	 */
1025 
1026 	batch = empty_batch(i915);
1027 	if (IS_ERR(batch))
1028 		return PTR_ERR(batch);
1029 
1030 	for_each_uabi_engine(engine, i915) {
1031 		IGT_TIMEOUT(end_time);
1032 		struct i915_request *request;
1033 		unsigned long n, prime;
1034 		ktime_t times[2] = {};
1035 
1036 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
1037 		if (err)
1038 			goto out_batch;
1039 
1040 		intel_engine_pm_get(engine);
1041 
1042 		/* Warmup / preload */
1043 		request = empty_request(engine, batch);
1044 		if (IS_ERR(request)) {
1045 			err = PTR_ERR(request);
1046 			intel_engine_pm_put(engine);
1047 			goto out_batch;
1048 		}
1049 		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1050 
1051 		for_each_prime_number_from(prime, 1, 8192) {
1052 			times[1] = ktime_get_raw();
1053 
1054 			for (n = 0; n < prime; n++) {
1055 				i915_request_put(request);
1056 				request = empty_request(engine, batch);
1057 				if (IS_ERR(request)) {
1058 					err = PTR_ERR(request);
1059 					intel_engine_pm_put(engine);
1060 					goto out_batch;
1061 				}
1062 			}
1063 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1064 
1065 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
1066 			if (prime == 1)
1067 				times[0] = times[1];
1068 
1069 			if (__igt_timeout(end_time, NULL))
1070 				break;
1071 		}
1072 		i915_request_put(request);
1073 		intel_engine_pm_put(engine);
1074 
1075 		err = igt_live_test_end(&t);
1076 		if (err)
1077 			goto out_batch;
1078 
1079 		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1080 			engine->name,
1081 			ktime_to_ns(times[0]),
1082 			prime, div64_u64(ktime_to_ns(times[1]), prime));
1083 	}
1084 
1085 out_batch:
1086 	i915_vma_unpin(batch);
1087 	i915_vma_put(batch);
1088 	return err;
1089 }
1090 
1091 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
1092 {
1093 	struct drm_i915_gem_object *obj;
1094 	const int ver = GRAPHICS_VER(i915);
1095 	struct i915_vma *vma;
1096 	u32 *cmd;
1097 	int err;
1098 
1099 	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
1100 	if (IS_ERR(obj))
1101 		return ERR_CAST(obj);
1102 
1103 	vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL);
1104 	if (IS_ERR(vma)) {
1105 		err = PTR_ERR(vma);
1106 		goto err;
1107 	}
1108 
1109 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
1110 	if (err)
1111 		goto err;
1112 
1113 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1114 	if (IS_ERR(cmd)) {
1115 		err = PTR_ERR(cmd);
1116 		goto err;
1117 	}
1118 
1119 	if (ver >= 8) {
1120 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1121 		*cmd++ = lower_32_bits(vma->node.start);
1122 		*cmd++ = upper_32_bits(vma->node.start);
1123 	} else if (ver >= 6) {
1124 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1125 		*cmd++ = lower_32_bits(vma->node.start);
1126 	} else {
1127 		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1128 		*cmd++ = lower_32_bits(vma->node.start);
1129 	}
1130 	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1131 
1132 	__i915_gem_object_flush_map(obj, 0, 64);
1133 	i915_gem_object_unpin_map(obj);
1134 
1135 	intel_gt_chipset_flush(to_gt(i915));
1136 
1137 	return vma;
1138 
1139 err:
1140 	i915_gem_object_put(obj);
1141 	return ERR_PTR(err);
1142 }
1143 
1144 static int recursive_batch_resolve(struct i915_vma *batch)
1145 {
1146 	u32 *cmd;
1147 
1148 	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1149 	if (IS_ERR(cmd))
1150 		return PTR_ERR(cmd);
1151 
1152 	*cmd = MI_BATCH_BUFFER_END;
1153 
1154 	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1155 	i915_gem_object_unpin_map(batch->obj);
1156 
1157 	intel_gt_chipset_flush(batch->vm->gt);
1158 
1159 	return 0;
1160 }
1161 
1162 static int live_all_engines(void *arg)
1163 {
1164 	struct drm_i915_private *i915 = arg;
1165 	const unsigned int nengines = num_uabi_engines(i915);
1166 	struct intel_engine_cs *engine;
1167 	struct i915_request **request;
1168 	struct igt_live_test t;
1169 	struct i915_vma *batch;
1170 	unsigned int idx;
1171 	int err;
1172 
1173 	/*
1174 	 * Check we can submit requests to all engines simultaneously. We
1175 	 * send a recursive batch to each engine - checking that we don't
1176 	 * block doing so, and that they don't complete too soon.
1177 	 */
1178 
1179 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1180 	if (!request)
1181 		return -ENOMEM;
1182 
1183 	err = igt_live_test_begin(&t, i915, __func__, "");
1184 	if (err)
1185 		goto out_free;
1186 
1187 	batch = recursive_batch(i915);
1188 	if (IS_ERR(batch)) {
1189 		err = PTR_ERR(batch);
1190 		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1191 		goto out_free;
1192 	}
1193 
1194 	i915_vma_lock(batch);
1195 
1196 	idx = 0;
1197 	for_each_uabi_engine(engine, i915) {
1198 		request[idx] = intel_engine_create_kernel_request(engine);
1199 		if (IS_ERR(request[idx])) {
1200 			err = PTR_ERR(request[idx]);
1201 			pr_err("%s: Request allocation failed with err=%d\n",
1202 			       __func__, err);
1203 			goto out_request;
1204 		}
1205 
1206 		err = i915_request_await_object(request[idx], batch->obj, 0);
1207 		if (err == 0)
1208 			err = i915_vma_move_to_active(batch, request[idx], 0);
1209 		GEM_BUG_ON(err);
1210 
1211 		err = engine->emit_bb_start(request[idx],
1212 					    batch->node.start,
1213 					    batch->node.size,
1214 					    0);
1215 		GEM_BUG_ON(err);
1216 		request[idx]->batch = batch;
1217 
1218 		i915_request_get(request[idx]);
1219 		i915_request_add(request[idx]);
1220 		idx++;
1221 	}
1222 
1223 	i915_vma_unlock(batch);
1224 
1225 	idx = 0;
1226 	for_each_uabi_engine(engine, i915) {
1227 		if (i915_request_completed(request[idx])) {
1228 			pr_err("%s(%s): request completed too early!\n",
1229 			       __func__, engine->name);
1230 			err = -EINVAL;
1231 			goto out_request;
1232 		}
1233 		idx++;
1234 	}
1235 
1236 	err = recursive_batch_resolve(batch);
1237 	if (err) {
1238 		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1239 		goto out_request;
1240 	}
1241 
1242 	idx = 0;
1243 	for_each_uabi_engine(engine, i915) {
1244 		long timeout;
1245 
1246 		timeout = i915_request_wait(request[idx], 0,
1247 					    MAX_SCHEDULE_TIMEOUT);
1248 		if (timeout < 0) {
1249 			err = timeout;
1250 			pr_err("%s: error waiting for request on %s, err=%d\n",
1251 			       __func__, engine->name, err);
1252 			goto out_request;
1253 		}
1254 
1255 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1256 		i915_request_put(request[idx]);
1257 		request[idx] = NULL;
1258 		idx++;
1259 	}
1260 
1261 	err = igt_live_test_end(&t);
1262 
1263 out_request:
1264 	idx = 0;
1265 	for_each_uabi_engine(engine, i915) {
1266 		if (request[idx])
1267 			i915_request_put(request[idx]);
1268 		idx++;
1269 	}
1270 	i915_vma_unpin(batch);
1271 	i915_vma_put(batch);
1272 out_free:
1273 	kfree(request);
1274 	return err;
1275 }
1276 
1277 static int live_sequential_engines(void *arg)
1278 {
1279 	struct drm_i915_private *i915 = arg;
1280 	const unsigned int nengines = num_uabi_engines(i915);
1281 	struct i915_request **request;
1282 	struct i915_request *prev = NULL;
1283 	struct intel_engine_cs *engine;
1284 	struct igt_live_test t;
1285 	unsigned int idx;
1286 	int err;
1287 
1288 	/*
1289 	 * Check we can submit requests to all engines sequentially, such
1290 	 * that each successive request waits for the earlier ones. This
1291 	 * tests that we don't execute requests out of order, even though
1292 	 * they are running on independent engines.
1293 	 */
1294 
1295 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1296 	if (!request)
1297 		return -ENOMEM;
1298 
1299 	err = igt_live_test_begin(&t, i915, __func__, "");
1300 	if (err)
1301 		goto out_free;
1302 
1303 	idx = 0;
1304 	for_each_uabi_engine(engine, i915) {
1305 		struct i915_vma *batch;
1306 
1307 		batch = recursive_batch(i915);
1308 		if (IS_ERR(batch)) {
1309 			err = PTR_ERR(batch);
1310 			pr_err("%s: Unable to create batch for %s, err=%d\n",
1311 			       __func__, engine->name, err);
1312 			goto out_free;
1313 		}
1314 
1315 		i915_vma_lock(batch);
1316 		request[idx] = intel_engine_create_kernel_request(engine);
1317 		if (IS_ERR(request[idx])) {
1318 			err = PTR_ERR(request[idx]);
1319 			pr_err("%s: Request allocation failed for %s with err=%d\n",
1320 			       __func__, engine->name, err);
1321 			goto out_unlock;
1322 		}
1323 
1324 		if (prev) {
1325 			err = i915_request_await_dma_fence(request[idx],
1326 							   &prev->fence);
1327 			if (err) {
1328 				i915_request_add(request[idx]);
1329 				pr_err("%s: Request await failed for %s with err=%d\n",
1330 				       __func__, engine->name, err);
1331 				goto out_unlock;
1332 			}
1333 		}
1334 
1335 		err = i915_request_await_object(request[idx],
1336 						batch->obj, false);
1337 		if (err == 0)
1338 			err = i915_vma_move_to_active(batch, request[idx], 0);
1339 		GEM_BUG_ON(err);
1340 
1341 		err = engine->emit_bb_start(request[idx],
1342 					    batch->node.start,
1343 					    batch->node.size,
1344 					    0);
1345 		GEM_BUG_ON(err);
1346 		request[idx]->batch = batch;
1347 
1348 		i915_request_get(request[idx]);
1349 		i915_request_add(request[idx]);
1350 
1351 		prev = request[idx];
1352 		idx++;
1353 
1354 out_unlock:
1355 		i915_vma_unlock(batch);
1356 		if (err)
1357 			goto out_request;
1358 	}
1359 
1360 	idx = 0;
1361 	for_each_uabi_engine(engine, i915) {
1362 		long timeout;
1363 
1364 		if (i915_request_completed(request[idx])) {
1365 			pr_err("%s(%s): request completed too early!\n",
1366 			       __func__, engine->name);
1367 			err = -EINVAL;
1368 			goto out_request;
1369 		}
1370 
1371 		err = recursive_batch_resolve(request[idx]->batch);
1372 		if (err) {
1373 			pr_err("%s: failed to resolve batch, err=%d\n",
1374 			       __func__, err);
1375 			goto out_request;
1376 		}
1377 
1378 		timeout = i915_request_wait(request[idx], 0,
1379 					    MAX_SCHEDULE_TIMEOUT);
1380 		if (timeout < 0) {
1381 			err = timeout;
1382 			pr_err("%s: error waiting for request on %s, err=%d\n",
1383 			       __func__, engine->name, err);
1384 			goto out_request;
1385 		}
1386 
1387 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1388 		idx++;
1389 	}
1390 
1391 	err = igt_live_test_end(&t);
1392 
1393 out_request:
1394 	idx = 0;
1395 	for_each_uabi_engine(engine, i915) {
1396 		u32 *cmd;
1397 
1398 		if (!request[idx])
1399 			break;
1400 
1401 		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1402 						       I915_MAP_WC);
1403 		if (!IS_ERR(cmd)) {
1404 			*cmd = MI_BATCH_BUFFER_END;
1405 
1406 			__i915_gem_object_flush_map(request[idx]->batch->obj,
1407 						    0, sizeof(*cmd));
1408 			i915_gem_object_unpin_map(request[idx]->batch->obj);
1409 
1410 			intel_gt_chipset_flush(engine->gt);
1411 		}
1412 
1413 		i915_vma_put(request[idx]->batch);
1414 		i915_request_put(request[idx]);
1415 		idx++;
1416 	}
1417 out_free:
1418 	kfree(request);
1419 	return err;
1420 }
1421 
1422 static int __live_parallel_engine1(void *arg)
1423 {
1424 	struct intel_engine_cs *engine = arg;
1425 	IGT_TIMEOUT(end_time);
1426 	unsigned long count;
1427 	int err = 0;
1428 
1429 	count = 0;
1430 	intel_engine_pm_get(engine);
1431 	do {
1432 		struct i915_request *rq;
1433 
1434 		rq = i915_request_create(engine->kernel_context);
1435 		if (IS_ERR(rq)) {
1436 			err = PTR_ERR(rq);
1437 			break;
1438 		}
1439 
1440 		i915_request_get(rq);
1441 		i915_request_add(rq);
1442 
1443 		err = 0;
1444 		if (i915_request_wait(rq, 0, HZ) < 0)
1445 			err = -ETIME;
1446 		i915_request_put(rq);
1447 		if (err)
1448 			break;
1449 
1450 		count++;
1451 	} while (!__igt_timeout(end_time, NULL));
1452 	intel_engine_pm_put(engine);
1453 
1454 	pr_info("%s: %lu request + sync\n", engine->name, count);
1455 	return err;
1456 }
1457 
1458 static int __live_parallel_engineN(void *arg)
1459 {
1460 	struct intel_engine_cs *engine = arg;
1461 	IGT_TIMEOUT(end_time);
1462 	unsigned long count;
1463 	int err = 0;
1464 
1465 	count = 0;
1466 	intel_engine_pm_get(engine);
1467 	do {
1468 		struct i915_request *rq;
1469 
1470 		rq = i915_request_create(engine->kernel_context);
1471 		if (IS_ERR(rq)) {
1472 			err = PTR_ERR(rq);
1473 			break;
1474 		}
1475 
1476 		i915_request_add(rq);
1477 		count++;
1478 	} while (!__igt_timeout(end_time, NULL));
1479 	intel_engine_pm_put(engine);
1480 
1481 	pr_info("%s: %lu requests\n", engine->name, count);
1482 	return err;
1483 }
1484 
1485 static bool wake_all(struct drm_i915_private *i915)
1486 {
1487 	if (atomic_dec_and_test(&i915->selftest.counter)) {
1488 		wake_up_var(&i915->selftest.counter);
1489 		return true;
1490 	}
1491 
1492 	return false;
1493 }
1494 
1495 static int wait_for_all(struct drm_i915_private *i915)
1496 {
1497 	if (wake_all(i915))
1498 		return 0;
1499 
1500 	if (wait_var_event_timeout(&i915->selftest.counter,
1501 				   !atomic_read(&i915->selftest.counter),
1502 				   i915_selftest.timeout_jiffies))
1503 		return 0;
1504 
1505 	return -ETIME;
1506 }
1507 
1508 static int __live_parallel_spin(void *arg)
1509 {
1510 	struct intel_engine_cs *engine = arg;
1511 	struct igt_spinner spin;
1512 	struct i915_request *rq;
1513 	int err = 0;
1514 
1515 	/*
1516 	 * Create a spinner running for eternity on each engine. If a second
1517 	 * spinner is incorrectly placed on the same engine, it will not be
1518 	 * able to start in time.
1519 	 */
1520 
1521 	if (igt_spinner_init(&spin, engine->gt)) {
1522 		wake_all(engine->i915);
1523 		return -ENOMEM;
1524 	}
1525 
1526 	intel_engine_pm_get(engine);
1527 	rq = igt_spinner_create_request(&spin,
1528 					engine->kernel_context,
1529 					MI_NOOP); /* no preemption */
1530 	intel_engine_pm_put(engine);
1531 	if (IS_ERR(rq)) {
1532 		err = PTR_ERR(rq);
1533 		if (err == -ENODEV)
1534 			err = 0;
1535 		wake_all(engine->i915);
1536 		goto out_spin;
1537 	}
1538 
1539 	i915_request_get(rq);
1540 	i915_request_add(rq);
1541 	if (igt_wait_for_spinner(&spin, rq)) {
1542 		/* Occupy this engine for the whole test */
1543 		err = wait_for_all(engine->i915);
1544 	} else {
1545 		pr_err("Failed to start spinner on %s\n", engine->name);
1546 		err = -EINVAL;
1547 	}
1548 	igt_spinner_end(&spin);
1549 
1550 	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1551 		err = -EIO;
1552 	i915_request_put(rq);
1553 
1554 out_spin:
1555 	igt_spinner_fini(&spin);
1556 	return err;
1557 }
1558 
1559 static int live_parallel_engines(void *arg)
1560 {
1561 	struct drm_i915_private *i915 = arg;
1562 	static int (* const func[])(void *arg) = {
1563 		__live_parallel_engine1,
1564 		__live_parallel_engineN,
1565 		__live_parallel_spin,
1566 		NULL,
1567 	};
1568 	const unsigned int nengines = num_uabi_engines(i915);
1569 	struct intel_engine_cs *engine;
1570 	int (* const *fn)(void *arg);
1571 	struct task_struct **tsk;
1572 	int err = 0;
1573 
1574 	/*
1575 	 * Check we can submit requests to all engines concurrently. This
1576 	 * tests that we load up the system maximally.
1577 	 */
1578 
1579 	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1580 	if (!tsk)
1581 		return -ENOMEM;
1582 
1583 	for (fn = func; !err && *fn; fn++) {
1584 		char name[KSYM_NAME_LEN];
1585 		struct igt_live_test t;
1586 		unsigned int idx;
1587 
1588 		snprintf(name, sizeof(name), "%ps", *fn);
1589 		err = igt_live_test_begin(&t, i915, __func__, name);
1590 		if (err)
1591 			break;
1592 
1593 		atomic_set(&i915->selftest.counter, nengines);
1594 
1595 		idx = 0;
1596 		for_each_uabi_engine(engine, i915) {
1597 			tsk[idx] = kthread_run(*fn, engine,
1598 					       "igt/parallel:%s",
1599 					       engine->name);
1600 			if (IS_ERR(tsk[idx])) {
1601 				err = PTR_ERR(tsk[idx]);
1602 				break;
1603 			}
1604 			get_task_struct(tsk[idx++]);
1605 		}
1606 
1607 		yield(); /* start all threads before we kthread_stop() */
1608 
1609 		idx = 0;
1610 		for_each_uabi_engine(engine, i915) {
1611 			int status;
1612 
1613 			if (IS_ERR(tsk[idx]))
1614 				break;
1615 
1616 			status = kthread_stop(tsk[idx]);
1617 			if (status && !err)
1618 				err = status;
1619 
1620 			put_task_struct(tsk[idx++]);
1621 		}
1622 
1623 		if (igt_live_test_end(&t))
1624 			err = -EIO;
1625 	}
1626 
1627 	kfree(tsk);
1628 	return err;
1629 }
1630 
1631 static int
1632 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1633 {
1634 	struct i915_request *rq;
1635 	int ret;
1636 
1637 	/*
1638 	 * Before execlists, all contexts share the same ringbuffer. With
1639 	 * execlists, each context/engine has a separate ringbuffer and
1640 	 * for the purposes of this test, inexhaustible.
1641 	 *
1642 	 * For the global ringbuffer though, we have to be very careful
1643 	 * that we do not wrap while preventing the execution of requests
1644 	 * with a unsignaled fence.
1645 	 */
1646 	if (HAS_EXECLISTS(ctx->i915))
1647 		return INT_MAX;
1648 
1649 	rq = igt_request_alloc(ctx, engine);
1650 	if (IS_ERR(rq)) {
1651 		ret = PTR_ERR(rq);
1652 	} else {
1653 		int sz;
1654 
1655 		ret = rq->ring->size - rq->reserved_space;
1656 		i915_request_add(rq);
1657 
1658 		sz = rq->ring->emit - rq->head;
1659 		if (sz < 0)
1660 			sz += rq->ring->size;
1661 		ret /= sz;
1662 		ret /= 2; /* leave half spare, in case of emergency! */
1663 	}
1664 
1665 	return ret;
1666 }
1667 
1668 static int live_breadcrumbs_smoketest(void *arg)
1669 {
1670 	struct drm_i915_private *i915 = arg;
1671 	const unsigned int nengines = num_uabi_engines(i915);
1672 	const unsigned int ncpus = num_online_cpus();
1673 	unsigned long num_waits, num_fences;
1674 	struct intel_engine_cs *engine;
1675 	struct task_struct **threads;
1676 	struct igt_live_test live;
1677 	intel_wakeref_t wakeref;
1678 	struct smoketest *smoke;
1679 	unsigned int n, idx;
1680 	struct file *file;
1681 	int ret = 0;
1682 
1683 	/*
1684 	 * Smoketest our breadcrumb/signal handling for requests across multiple
1685 	 * threads. A very simple test to only catch the most egregious of bugs.
1686 	 * See __igt_breadcrumbs_smoketest();
1687 	 *
1688 	 * On real hardware this time.
1689 	 */
1690 
1691 	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1692 
1693 	file = mock_file(i915);
1694 	if (IS_ERR(file)) {
1695 		ret = PTR_ERR(file);
1696 		goto out_rpm;
1697 	}
1698 
1699 	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1700 	if (!smoke) {
1701 		ret = -ENOMEM;
1702 		goto out_file;
1703 	}
1704 
1705 	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1706 	if (!threads) {
1707 		ret = -ENOMEM;
1708 		goto out_smoke;
1709 	}
1710 
1711 	smoke[0].request_alloc = __live_request_alloc;
1712 	smoke[0].ncontexts = 64;
1713 	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1714 				    sizeof(*smoke[0].contexts),
1715 				    GFP_KERNEL);
1716 	if (!smoke[0].contexts) {
1717 		ret = -ENOMEM;
1718 		goto out_threads;
1719 	}
1720 
1721 	for (n = 0; n < smoke[0].ncontexts; n++) {
1722 		smoke[0].contexts[n] = live_context(i915, file);
1723 		if (IS_ERR(smoke[0].contexts[n])) {
1724 			ret = PTR_ERR(smoke[0].contexts[n]);
1725 			goto out_contexts;
1726 		}
1727 	}
1728 
1729 	ret = igt_live_test_begin(&live, i915, __func__, "");
1730 	if (ret)
1731 		goto out_contexts;
1732 
1733 	idx = 0;
1734 	for_each_uabi_engine(engine, i915) {
1735 		smoke[idx] = smoke[0];
1736 		smoke[idx].engine = engine;
1737 		smoke[idx].max_batch =
1738 			max_batches(smoke[0].contexts[0], engine);
1739 		if (smoke[idx].max_batch < 0) {
1740 			ret = smoke[idx].max_batch;
1741 			goto out_flush;
1742 		}
1743 		/* One ring interleaved between requests from all cpus */
1744 		smoke[idx].max_batch /= num_online_cpus() + 1;
1745 		pr_debug("Limiting batches to %d requests on %s\n",
1746 			 smoke[idx].max_batch, engine->name);
1747 
1748 		for (n = 0; n < ncpus; n++) {
1749 			struct task_struct *tsk;
1750 
1751 			tsk = kthread_run(__igt_breadcrumbs_smoketest,
1752 					  &smoke[idx], "igt/%d.%d", idx, n);
1753 			if (IS_ERR(tsk)) {
1754 				ret = PTR_ERR(tsk);
1755 				goto out_flush;
1756 			}
1757 
1758 			get_task_struct(tsk);
1759 			threads[idx * ncpus + n] = tsk;
1760 		}
1761 
1762 		idx++;
1763 	}
1764 
1765 	yield(); /* start all threads before we begin */
1766 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1767 
1768 out_flush:
1769 	idx = 0;
1770 	num_waits = 0;
1771 	num_fences = 0;
1772 	for_each_uabi_engine(engine, i915) {
1773 		for (n = 0; n < ncpus; n++) {
1774 			struct task_struct *tsk = threads[idx * ncpus + n];
1775 			int err;
1776 
1777 			if (!tsk)
1778 				continue;
1779 
1780 			err = kthread_stop(tsk);
1781 			if (err < 0 && !ret)
1782 				ret = err;
1783 
1784 			put_task_struct(tsk);
1785 		}
1786 
1787 		num_waits += atomic_long_read(&smoke[idx].num_waits);
1788 		num_fences += atomic_long_read(&smoke[idx].num_fences);
1789 		idx++;
1790 	}
1791 	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1792 		num_waits, num_fences, idx, ncpus);
1793 
1794 	ret = igt_live_test_end(&live) ?: ret;
1795 out_contexts:
1796 	kfree(smoke[0].contexts);
1797 out_threads:
1798 	kfree(threads);
1799 out_smoke:
1800 	kfree(smoke);
1801 out_file:
1802 	fput(file);
1803 out_rpm:
1804 	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1805 
1806 	return ret;
1807 }
1808 
1809 int i915_request_live_selftests(struct drm_i915_private *i915)
1810 {
1811 	static const struct i915_subtest tests[] = {
1812 		SUBTEST(live_nop_request),
1813 		SUBTEST(live_all_engines),
1814 		SUBTEST(live_sequential_engines),
1815 		SUBTEST(live_parallel_engines),
1816 		SUBTEST(live_empty_request),
1817 		SUBTEST(live_cancel_request),
1818 		SUBTEST(live_breadcrumbs_smoketest),
1819 	};
1820 
1821 	if (intel_gt_is_wedged(to_gt(i915)))
1822 		return 0;
1823 
1824 	return i915_subtests(tests, i915);
1825 }
1826 
1827 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1828 {
1829 	struct i915_request *rq;
1830 	struct dma_fence *fence;
1831 
1832 	rq = intel_engine_create_kernel_request(ce->engine);
1833 	if (IS_ERR(rq))
1834 		return PTR_ERR(rq);
1835 
1836 	fence = i915_active_fence_get(&ce->timeline->last_request);
1837 	if (fence) {
1838 		i915_request_await_dma_fence(rq, fence);
1839 		dma_fence_put(fence);
1840 	}
1841 
1842 	rq = i915_request_get(rq);
1843 	i915_request_add(rq);
1844 	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1845 		err = -ETIME;
1846 	i915_request_put(rq);
1847 
1848 	while (!err && !intel_engine_is_idle(ce->engine))
1849 		intel_engine_flush_submission(ce->engine);
1850 
1851 	return err;
1852 }
1853 
/*
 * Per-engine performance record.
 *
 * NOTE(review): the users of this struct are outside this part of the file;
 * the field notes below are inferred from names and types only - confirm
 * against the code that fills them in.
 */
struct perf_stats {
	struct intel_engine_cs *engine; /* engine the record refers to */
	unsigned long count; /* presumably a count of completed iterations */
	ktime_t time; /* presumably elapsed wall time */
	ktime_t busy; /* presumably engine busy time */
	u64 runtime; /* presumably accumulated context runtime; units unknown */
};
1861 
/*
 * Set of contexts used by a performance run.
 *
 * NOTE(review): the users are outside this part of the file; presumably
 * @nengines is the number of entries in the trailing @ce array - confirm.
 */
struct perf_series {
	struct drm_i915_private *i915; /* device the series runs on */
	unsigned int nengines; /* presumably the length of ce[] */
	struct intel_context *ce[]; /* flexible array of contexts */
};
1867 
/*
 * Three-way comparison of two u32 values, for use with sort().
 *
 * A plain subtraction wraps in unsigned arithmetic and reports the wrong
 * sign once the operands differ by more than INT_MAX, so compare
 * explicitly.
 *
 * Returns a negative value, zero or a positive value when *A is less than,
 * equal to or greater than *B.
 */
static int cmp_u32(const void *A, const void *B)
{
	const u32 *a = A, *b = B;

	return (*a > *b) - (*a < *b);
}
1874 
1875 static u32 trifilter(u32 *a)
1876 {
1877 	u64 sum;
1878 
1879 #define TF_COUNT 5
1880 	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1881 
1882 	sum = mul_u32_u32(a[2], 2);
1883 	sum += a[1];
1884 	sum += a[3];
1885 
1886 	GEM_BUG_ON(sum > U32_MAX);
1887 	return sum;
1888 #define TF_BIAS 2
1889 }
1890 
1891 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1892 {
1893 	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1894 
1895 	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1896 }
1897 
1898 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1899 {
1900 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1901 	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1902 	*cs++ = offset;
1903 	*cs++ = 0;
1904 
1905 	return cs;
1906 }
1907 
1908 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1909 {
1910 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1911 	*cs++ = offset;
1912 	*cs++ = 0;
1913 	*cs++ = value;
1914 
1915 	return cs;
1916 }
1917 
1918 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1919 {
1920 	*cs++ = MI_SEMAPHORE_WAIT |
1921 		MI_SEMAPHORE_GLOBAL_GTT |
1922 		MI_SEMAPHORE_POLL |
1923 		mode;
1924 	*cs++ = value;
1925 	*cs++ = offset;
1926 	*cs++ = 0;
1927 
1928 	return cs;
1929 }
1930 
1931 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1932 {
1933 	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1934 }
1935 
1936 static void semaphore_set(u32 *sema, u32 value)
1937 {
1938 	WRITE_ONCE(*sema, value);
1939 	wmb(); /* flush the update to the cache, and beyond */
1940 }
1941 
1942 static u32 *hwsp_scratch(const struct intel_context *ce)
1943 {
1944 	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1945 }
1946 
1947 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1948 {
1949 	return (i915_ggtt_offset(ce->engine->status_page.vma) +
1950 		offset_in_page(dw));
1951 }
1952 
1953 static int measure_semaphore_response(struct intel_context *ce)
1954 {
1955 	u32 *sema = hwsp_scratch(ce);
1956 	const u32 offset = hwsp_offset(ce, sema);
1957 	u32 elapsed[TF_COUNT], cycles;
1958 	struct i915_request *rq;
1959 	u32 *cs;
1960 	int err;
1961 	int i;
1962 
1963 	/*
1964 	 * Measure how many cycles it takes for the HW to detect the change
1965 	 * in a semaphore value.
1966 	 *
1967 	 *    A: read CS_TIMESTAMP from CPU
1968 	 *    poke semaphore
1969 	 *    B: read CS_TIMESTAMP on GPU
1970 	 *
1971 	 * Semaphore latency: B - A
1972 	 */
1973 
1974 	semaphore_set(sema, -1);
1975 
1976 	rq = i915_request_create(ce);
1977 	if (IS_ERR(rq))
1978 		return PTR_ERR(rq);
1979 
1980 	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1981 	if (IS_ERR(cs)) {
1982 		i915_request_add(rq);
1983 		err = PTR_ERR(cs);
1984 		goto err;
1985 	}
1986 
1987 	cs = emit_store_dw(cs, offset, 0);
1988 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1989 		cs = emit_semaphore_poll_until(cs, offset, i);
1990 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1991 		cs = emit_store_dw(cs, offset, 0);
1992 	}
1993 
1994 	intel_ring_advance(rq, cs);
1995 	i915_request_add(rq);
1996 
1997 	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1998 		err = -EIO;
1999 		goto err;
2000 	}
2001 
2002 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2003 		preempt_disable();
2004 		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2005 		semaphore_set(sema, i);
2006 		preempt_enable();
2007 
2008 		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2009 			err = -EIO;
2010 			goto err;
2011 		}
2012 
2013 		elapsed[i - 1] = sema[i] - cycles;
2014 	}
2015 
2016 	cycles = trifilter(elapsed);
2017 	pr_info("%s: semaphore response %d cycles, %lluns\n",
2018 		ce->engine->name, cycles >> TF_BIAS,
2019 		cycles_to_ns(ce->engine, cycles));
2020 
2021 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2022 
2023 err:
2024 	intel_gt_set_wedged(ce->engine->gt);
2025 	return err;
2026 }
2027 
2028 static int measure_idle_dispatch(struct intel_context *ce)
2029 {
2030 	u32 *sema = hwsp_scratch(ce);
2031 	const u32 offset = hwsp_offset(ce, sema);
2032 	u32 elapsed[TF_COUNT], cycles;
2033 	u32 *cs;
2034 	int err;
2035 	int i;
2036 
2037 	/*
2038 	 * Measure how long it takes for us to submit a request while the
2039 	 * engine is idle, but is resting in our context.
2040 	 *
2041 	 *    A: read CS_TIMESTAMP from CPU
2042 	 *    submit request
2043 	 *    B: read CS_TIMESTAMP on GPU
2044 	 *
2045 	 * Submission latency: B - A
2046 	 */
2047 
2048 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2049 		struct i915_request *rq;
2050 
2051 		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2052 		if (err)
2053 			return err;
2054 
2055 		rq = i915_request_create(ce);
2056 		if (IS_ERR(rq)) {
2057 			err = PTR_ERR(rq);
2058 			goto err;
2059 		}
2060 
2061 		cs = intel_ring_begin(rq, 4);
2062 		if (IS_ERR(cs)) {
2063 			i915_request_add(rq);
2064 			err = PTR_ERR(cs);
2065 			goto err;
2066 		}
2067 
2068 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2069 
2070 		intel_ring_advance(rq, cs);
2071 
2072 		preempt_disable();
2073 		local_bh_disable();
2074 		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2075 		i915_request_add(rq);
2076 		local_bh_enable();
2077 		preempt_enable();
2078 	}
2079 
2080 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2081 	if (err)
2082 		goto err;
2083 
2084 	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2085 		elapsed[i] = sema[i] - elapsed[i];
2086 
2087 	cycles = trifilter(elapsed);
2088 	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2089 		ce->engine->name, cycles >> TF_BIAS,
2090 		cycles_to_ns(ce->engine, cycles));
2091 
2092 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2093 
2094 err:
2095 	intel_gt_set_wedged(ce->engine->gt);
2096 	return err;
2097 }
2098 
2099 static int measure_busy_dispatch(struct intel_context *ce)
2100 {
2101 	u32 *sema = hwsp_scratch(ce);
2102 	const u32 offset = hwsp_offset(ce, sema);
2103 	u32 elapsed[TF_COUNT + 1], cycles;
2104 	u32 *cs;
2105 	int err;
2106 	int i;
2107 
2108 	/*
2109 	 * Measure how long it takes for us to submit a request while the
2110 	 * engine is busy, polling on a semaphore in our context. With
2111 	 * direct submission, this will include the cost of a lite restore.
2112 	 *
2113 	 *    A: read CS_TIMESTAMP from CPU
2114 	 *    submit request
2115 	 *    B: read CS_TIMESTAMP on GPU
2116 	 *
2117 	 * Submission latency: B - A
2118 	 */
2119 
2120 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2121 		struct i915_request *rq;
2122 
2123 		rq = i915_request_create(ce);
2124 		if (IS_ERR(rq)) {
2125 			err = PTR_ERR(rq);
2126 			goto err;
2127 		}
2128 
2129 		cs = intel_ring_begin(rq, 12);
2130 		if (IS_ERR(cs)) {
2131 			i915_request_add(rq);
2132 			err = PTR_ERR(cs);
2133 			goto err;
2134 		}
2135 
2136 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2137 		cs = emit_semaphore_poll_until(cs, offset, i);
2138 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2139 
2140 		intel_ring_advance(rq, cs);
2141 
2142 		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2143 			err = -EIO;
2144 			goto err;
2145 		}
2146 
2147 		preempt_disable();
2148 		local_bh_disable();
2149 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2150 		i915_request_add(rq);
2151 		local_bh_enable();
2152 		semaphore_set(sema, i - 1);
2153 		preempt_enable();
2154 	}
2155 
2156 	wait_for(READ_ONCE(sema[i - 1]), 500);
2157 	semaphore_set(sema, i - 1);
2158 
2159 	for (i = 1; i <= TF_COUNT; i++) {
2160 		GEM_BUG_ON(sema[i] == -1);
2161 		elapsed[i - 1] = sema[i] - elapsed[i];
2162 	}
2163 
2164 	cycles = trifilter(elapsed);
2165 	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2166 		ce->engine->name, cycles >> TF_BIAS,
2167 		cycles_to_ns(ce->engine, cycles));
2168 
2169 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2170 
2171 err:
2172 	intel_gt_set_wedged(ce->engine->gt);
2173 	return err;
2174 }
2175 
2176 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2177 {
2178 	const u32 offset =
2179 		i915_ggtt_offset(engine->status_page.vma) +
2180 		offset_in_page(sema);
2181 	struct i915_request *rq;
2182 	u32 *cs;
2183 
2184 	rq = i915_request_create(engine->kernel_context);
2185 	if (IS_ERR(rq))
2186 		return PTR_ERR(rq);
2187 
2188 	cs = intel_ring_begin(rq, 4);
2189 	if (IS_ERR(cs)) {
2190 		i915_request_add(rq);
2191 		return PTR_ERR(cs);
2192 	}
2193 
2194 	cs = emit_semaphore_poll(cs, mode, value, offset);
2195 
2196 	intel_ring_advance(rq, cs);
2197 	i915_request_add(rq);
2198 
2199 	return 0;
2200 }
2201 
2202 static int measure_inter_request(struct intel_context *ce)
2203 {
2204 	u32 *sema = hwsp_scratch(ce);
2205 	const u32 offset = hwsp_offset(ce, sema);
2206 	u32 elapsed[TF_COUNT + 1], cycles;
2207 	struct i915_sw_fence *submit;
2208 	int i, err;
2209 
2210 	/*
2211 	 * Measure how long it takes to advance from one request into the
2212 	 * next. Between each request we flush the GPU caches to memory,
2213 	 * update the breadcrumbs, and then invalidate those caches.
2214 	 * We queue up all the requests to be submitted in one batch so
2215 	 * it should be one set of contiguous measurements.
2216 	 *
2217 	 *    A: read CS_TIMESTAMP on GPU
2218 	 *    advance request
2219 	 *    B: read CS_TIMESTAMP on GPU
2220 	 *
2221 	 * Request latency: B - A
2222 	 */
2223 
2224 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2225 	if (err)
2226 		return err;
2227 
2228 	submit = heap_fence_create(GFP_KERNEL);
2229 	if (!submit) {
2230 		semaphore_set(sema, 1);
2231 		return -ENOMEM;
2232 	}
2233 
2234 	intel_engine_flush_submission(ce->engine);
2235 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2236 		struct i915_request *rq;
2237 		u32 *cs;
2238 
2239 		rq = i915_request_create(ce);
2240 		if (IS_ERR(rq)) {
2241 			err = PTR_ERR(rq);
2242 			goto err_submit;
2243 		}
2244 
2245 		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2246 						       submit,
2247 						       GFP_KERNEL);
2248 		if (err < 0) {
2249 			i915_request_add(rq);
2250 			goto err_submit;
2251 		}
2252 
2253 		cs = intel_ring_begin(rq, 4);
2254 		if (IS_ERR(cs)) {
2255 			i915_request_add(rq);
2256 			err = PTR_ERR(cs);
2257 			goto err_submit;
2258 		}
2259 
2260 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2261 
2262 		intel_ring_advance(rq, cs);
2263 		i915_request_add(rq);
2264 	}
2265 	i915_sw_fence_commit(submit);
2266 	intel_engine_flush_submission(ce->engine);
2267 	heap_fence_put(submit);
2268 
2269 	semaphore_set(sema, 1);
2270 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2271 	if (err)
2272 		goto err;
2273 
2274 	for (i = 1; i <= TF_COUNT; i++)
2275 		elapsed[i - 1] = sema[i + 1] - sema[i];
2276 
2277 	cycles = trifilter(elapsed);
2278 	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2279 		ce->engine->name, cycles >> TF_BIAS,
2280 		cycles_to_ns(ce->engine, cycles));
2281 
2282 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2283 
2284 err_submit:
2285 	i915_sw_fence_commit(submit);
2286 	heap_fence_put(submit);
2287 	semaphore_set(sema, 1);
2288 err:
2289 	intel_gt_set_wedged(ce->engine->gt);
2290 	return err;
2291 }
2292 
/*
 * Measure how long the engine takes to switch from a request in one context
 * to a request in another, and report the filtered result with pr_info().
 *
 * @ce: context under test; requests alternate between @ce and the kernel
 *      context of ce->engine.
 *
 * Returns 0 on success or a negative error code. A failure of the initial
 * plug() is returned as is; every later failure wedges the GT before
 * returning.
 */
static int measure_context_switch(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	struct i915_request *fence = NULL;
	/*
	 * Sized TF_COUNT + 1 so that the submission loop queues one more pair
	 * of requests than there are samples; only the first TF_COUNT entries
	 * are filled in below.
	 */
	u32 elapsed[TF_COUNT + 1], cycles;
	int i, j, err;
	u32 *cs;

	/*
	 * Measure how long it takes to advance from one request in one
	 * context to a request in another context. This allows us to
	 * measure how long the context save/restore take, along with all
	 * the inter-context setup we require.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    switch context
	 *    B: read CS_TIMESTAMP on GPU
	 *
	 * Context switch latency: B - A
	 */

	/*
	 * NOTE(review): plug() is defined outside this view; presumably it
	 * stalls the engine on the semaphore until sema is written below.
	 */
	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
	if (err)
		return err;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct intel_context *arr[] = {
			ce, ce->engine->kernel_context
		};
		/*
		 * Pair i uses two consecutive u32 slots starting 2 * i slots
		 * past offset (assuming offset is the GPU address of sema[0],
		 * these are sema[2 * i] and sema[2 * i + 1]).
		 */
		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);

		for (j = 0; j < ARRAY_SIZE(arr); j++) {
			struct i915_request *rq;

			rq = i915_request_create(arr[j]);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto err_fence;
			}

			/* Order each request after the one queued before it */
			if (fence) {
				err = i915_request_await_dma_fence(rq,
								   &fence->fence);
				if (err) {
					i915_request_add(rq);
					goto err_fence;
				}
			}

			cs = intel_ring_begin(rq, 4);
			if (IS_ERR(cs)) {
				i915_request_add(rq);
				err = PTR_ERR(cs);
				goto err_fence;
			}

			cs = emit_timestamp_store(cs, ce, addr);
			addr += sizeof(u32);

			intel_ring_advance(rq, cs);

			/* Swap our reference from the previous request to rq */
			i915_request_put(fence);
			fence = i915_request_get(rq);

			i915_request_add(rq);
		}
	}
	i915_request_put(fence);
	intel_engine_flush_submission(ce->engine);

	/* Everything is queued; write the semaphore and wait for idle */
	semaphore_set(sema, 1);
	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/*
	 * Sample i is the second slot of pair i subtracted from the first
	 * slot of pair i + 1, i.e. the step from the kernel-context request
	 * to the following @ce request.
	 */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];

	cycles = trifilter(elapsed);
	pr_info("%s: context switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err_fence:
	/* Drop the reference held on the last queued request */
	i915_request_put(fence);
	semaphore_set(sema, 1);
err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2386 
/*
 * Measure preemption dispatch and preemption switch latency on ce->engine
 * and report both filtered results with pr_info().
 *
 * @ce: context whose requests get preempted by requests submitted on the
 *      kernel context of ce->engine.
 *
 * Returns 0 when the engine has no preemption support or on success,
 * otherwise a negative error code after wedging the GT.
 */
static int measure_preemption(struct intel_context *ce)
{
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * We measure two latencies while triggering preemption. The first
	 * latency is how long it takes for us to submit a preempting request.
	 * The second latency is how it takes for us to return from the
	 * preemption back to the original context.
	 *
	 *    A: read CS_TIMESTAMP from CPU
	 *    submit preemption
	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
	 *    context switch
	 *    C: read CS_TIMESTAMP on GPU (in original context)
	 *
	 * Preemption dispatch latency: B - A
	 * Preemption switch latency: C - B
	 */

	if (!intel_engine_has_preemption(ce->engine))
		return 0;

	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		/*
		 * Iteration i owns two u32 slots starting 2 * i slots past
		 * offset (assuming offset is the GPU address of sema[0],
		 * these are sema[2 * i] and sema[2 * i + 1]).
		 */
		u32 addr = offset + 2 * i * sizeof(u32);
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/*
		 * Victim request in @ce: mark the first slot with -1, poll on
		 * the dword at offset against the value i, then store a
		 * timestamp in the second slot (C).
		 * NOTE(review): the emit_*() helpers are defined outside this
		 * view; the description follows their names and arguments.
		 */
		cs = emit_store_dw(cs, addr, -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));

		intel_ring_advance(rq, cs);
		i915_request_add(rq);

		/* Wait for the -1 marker, i.e. for the victim to be running */
		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
			err = -EIO;
			goto err;
		}

		rq = i915_request_create(ce->engine->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 8);
		if (IS_ERR(cs)) {
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/*
		 * Preempting request in the kernel context: store a timestamp
		 * over the -1 marker (B), then write i to the dword the
		 * victim is polling.
		 */
		cs = emit_timestamp_store(cs, ce, addr);
		cs = emit_store_dw(cs, offset, i);

		intel_ring_advance(rq, cs);
		rq->sched.attr.priority = I915_PRIORITY_BARRIER;

		/* A: CPU-side timestamp taken right before submission */
		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		i915_request_add(rq);
	}

	/*
	 * i is now ARRAY_SIZE(elapsed) + 1, so sema[2 * i - 2] is the first
	 * slot of the final iteration; wait until its -1 marker has been
	 * replaced by the preempting request's timestamp.
	 */
	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
		err = -EIO;
		goto err;
	}

	/* Dispatch latency: B - A (elapsed[] still holds the CPU reads) */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	/* Switch latency: C - B */
	for (i = 1; i <= TF_COUNT; i++)
		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];

	cycles = trifilter(elapsed);
	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2494 
2495 struct signal_cb {
2496 	struct dma_fence_cb base;
2497 	bool seen;
2498 };
2499 
2500 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2501 {
2502 	struct signal_cb *s = container_of(cb, typeof(*s), base);
2503 
2504 	smp_store_mb(s->seen, true); /* be safe, be strong */
2505 }
2506 
2507 static int measure_completion(struct intel_context *ce)
2508 {
2509 	u32 *sema = hwsp_scratch(ce);
2510 	const u32 offset = hwsp_offset(ce, sema);
2511 	u32 elapsed[TF_COUNT], cycles;
2512 	u32 *cs;
2513 	int err;
2514 	int i;
2515 
2516 	/*
2517 	 * Measure how long it takes for the signal (interrupt) to be
2518 	 * sent from the GPU to be processed by the CPU.
2519 	 *
2520 	 *    A: read CS_TIMESTAMP on GPU
2521 	 *    signal
2522 	 *    B: read CS_TIMESTAMP from CPU
2523 	 *
2524 	 * Completion latency: B - A
2525 	 */
2526 
2527 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2528 		struct signal_cb cb = { .seen = false };
2529 		struct i915_request *rq;
2530 
2531 		rq = i915_request_create(ce);
2532 		if (IS_ERR(rq)) {
2533 			err = PTR_ERR(rq);
2534 			goto err;
2535 		}
2536 
2537 		cs = intel_ring_begin(rq, 12);
2538 		if (IS_ERR(cs)) {
2539 			i915_request_add(rq);
2540 			err = PTR_ERR(cs);
2541 			goto err;
2542 		}
2543 
2544 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2545 		cs = emit_semaphore_poll_until(cs, offset, i);
2546 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2547 
2548 		intel_ring_advance(rq, cs);
2549 
2550 		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2551 		i915_request_add(rq);
2552 
2553 		intel_engine_flush_submission(ce->engine);
2554 		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2555 			err = -EIO;
2556 			goto err;
2557 		}
2558 
2559 		preempt_disable();
2560 		semaphore_set(sema, i);
2561 		while (!READ_ONCE(cb.seen))
2562 			cpu_relax();
2563 
2564 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2565 		preempt_enable();
2566 	}
2567 
2568 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2569 	if (err)
2570 		goto err;
2571 
2572 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2573 		GEM_BUG_ON(sema[i + 1] == -1);
2574 		elapsed[i] = elapsed[i] - sema[i + 1];
2575 	}
2576 
2577 	cycles = trifilter(elapsed);
2578 	pr_info("%s: completion latency %d cycles, %lluns\n",
2579 		ce->engine->name, cycles >> TF_BIAS,
2580 		cycles_to_ns(ce->engine, cycles));
2581 
2582 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2583 
2584 err:
2585 	intel_gt_set_wedged(ce->engine->gt);
2586 	return err;
2587 }
2588 
2589 static void rps_pin(struct intel_gt *gt)
2590 {
2591 	/* Pin the frequency to max */
2592 	atomic_inc(&gt->rps.num_waiters);
2593 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2594 
2595 	mutex_lock(&gt->rps.lock);
2596 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2597 	mutex_unlock(&gt->rps.lock);
2598 }
2599 
2600 static void rps_unpin(struct intel_gt *gt)
2601 {
2602 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2603 	atomic_dec(&gt->rps.num_waiters);
2604 }
2605 
2606 static int perf_request_latency(void *arg)
2607 {
2608 	struct drm_i915_private *i915 = arg;
2609 	struct intel_engine_cs *engine;
2610 	struct pm_qos_request qos;
2611 	int err = 0;
2612 
2613 	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2614 		return 0;
2615 
2616 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2617 
2618 	for_each_uabi_engine(engine, i915) {
2619 		struct intel_context *ce;
2620 
2621 		ce = intel_context_create(engine);
2622 		if (IS_ERR(ce)) {
2623 			err = PTR_ERR(ce);
2624 			goto out;
2625 		}
2626 
2627 		err = intel_context_pin(ce);
2628 		if (err) {
2629 			intel_context_put(ce);
2630 			goto out;
2631 		}
2632 
2633 		st_engine_heartbeat_disable(engine);
2634 		rps_pin(engine->gt);
2635 
2636 		if (err == 0)
2637 			err = measure_semaphore_response(ce);
2638 		if (err == 0)
2639 			err = measure_idle_dispatch(ce);
2640 		if (err == 0)
2641 			err = measure_busy_dispatch(ce);
2642 		if (err == 0)
2643 			err = measure_inter_request(ce);
2644 		if (err == 0)
2645 			err = measure_context_switch(ce);
2646 		if (err == 0)
2647 			err = measure_preemption(ce);
2648 		if (err == 0)
2649 			err = measure_completion(ce);
2650 
2651 		rps_unpin(engine->gt);
2652 		st_engine_heartbeat_enable(engine);
2653 
2654 		intel_context_unpin(ce);
2655 		intel_context_put(ce);
2656 		if (err)
2657 			goto out;
2658 	}
2659 
2660 out:
2661 	if (igt_flush_test(i915))
2662 		err = -EIO;
2663 
2664 	cpu_latency_qos_remove_request(&qos);
2665 	return err;
2666 }
2667 
2668 static int s_sync0(void *arg)
2669 {
2670 	struct perf_series *ps = arg;
2671 	IGT_TIMEOUT(end_time);
2672 	unsigned int idx = 0;
2673 	int err = 0;
2674 
2675 	GEM_BUG_ON(!ps->nengines);
2676 	do {
2677 		struct i915_request *rq;
2678 
2679 		rq = i915_request_create(ps->ce[idx]);
2680 		if (IS_ERR(rq)) {
2681 			err = PTR_ERR(rq);
2682 			break;
2683 		}
2684 
2685 		i915_request_get(rq);
2686 		i915_request_add(rq);
2687 
2688 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2689 			err = -ETIME;
2690 		i915_request_put(rq);
2691 		if (err)
2692 			break;
2693 
2694 		if (++idx == ps->nengines)
2695 			idx = 0;
2696 	} while (!__igt_timeout(end_time, NULL));
2697 
2698 	return err;
2699 }
2700 
2701 static int s_sync1(void *arg)
2702 {
2703 	struct perf_series *ps = arg;
2704 	struct i915_request *prev = NULL;
2705 	IGT_TIMEOUT(end_time);
2706 	unsigned int idx = 0;
2707 	int err = 0;
2708 
2709 	GEM_BUG_ON(!ps->nengines);
2710 	do {
2711 		struct i915_request *rq;
2712 
2713 		rq = i915_request_create(ps->ce[idx]);
2714 		if (IS_ERR(rq)) {
2715 			err = PTR_ERR(rq);
2716 			break;
2717 		}
2718 
2719 		i915_request_get(rq);
2720 		i915_request_add(rq);
2721 
2722 		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2723 			err = -ETIME;
2724 		i915_request_put(prev);
2725 		prev = rq;
2726 		if (err)
2727 			break;
2728 
2729 		if (++idx == ps->nengines)
2730 			idx = 0;
2731 	} while (!__igt_timeout(end_time, NULL));
2732 	i915_request_put(prev);
2733 
2734 	return err;
2735 }
2736 
2737 static int s_many(void *arg)
2738 {
2739 	struct perf_series *ps = arg;
2740 	IGT_TIMEOUT(end_time);
2741 	unsigned int idx = 0;
2742 
2743 	GEM_BUG_ON(!ps->nengines);
2744 	do {
2745 		struct i915_request *rq;
2746 
2747 		rq = i915_request_create(ps->ce[idx]);
2748 		if (IS_ERR(rq))
2749 			return PTR_ERR(rq);
2750 
2751 		i915_request_add(rq);
2752 
2753 		if (++idx == ps->nengines)
2754 			idx = 0;
2755 	} while (!__igt_timeout(end_time, NULL));
2756 
2757 	return 0;
2758 }
2759 
2760 static int perf_series_engines(void *arg)
2761 {
2762 	struct drm_i915_private *i915 = arg;
2763 	static int (* const func[])(void *arg) = {
2764 		s_sync0,
2765 		s_sync1,
2766 		s_many,
2767 		NULL,
2768 	};
2769 	const unsigned int nengines = num_uabi_engines(i915);
2770 	struct intel_engine_cs *engine;
2771 	int (* const *fn)(void *arg);
2772 	struct pm_qos_request qos;
2773 	struct perf_stats *stats;
2774 	struct perf_series *ps;
2775 	unsigned int idx;
2776 	int err = 0;
2777 
2778 	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2779 	if (!stats)
2780 		return -ENOMEM;
2781 
2782 	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2783 	if (!ps) {
2784 		kfree(stats);
2785 		return -ENOMEM;
2786 	}
2787 
2788 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2789 
2790 	ps->i915 = i915;
2791 	ps->nengines = nengines;
2792 
2793 	idx = 0;
2794 	for_each_uabi_engine(engine, i915) {
2795 		struct intel_context *ce;
2796 
2797 		ce = intel_context_create(engine);
2798 		if (IS_ERR(ce)) {
2799 			err = PTR_ERR(ce);
2800 			goto out;
2801 		}
2802 
2803 		err = intel_context_pin(ce);
2804 		if (err) {
2805 			intel_context_put(ce);
2806 			goto out;
2807 		}
2808 
2809 		ps->ce[idx++] = ce;
2810 	}
2811 	GEM_BUG_ON(idx != ps->nengines);
2812 
2813 	for (fn = func; *fn && !err; fn++) {
2814 		char name[KSYM_NAME_LEN];
2815 		struct igt_live_test t;
2816 
2817 		snprintf(name, sizeof(name), "%ps", *fn);
2818 		err = igt_live_test_begin(&t, i915, __func__, name);
2819 		if (err)
2820 			break;
2821 
2822 		for (idx = 0; idx < nengines; idx++) {
2823 			struct perf_stats *p =
2824 				memset(&stats[idx], 0, sizeof(stats[idx]));
2825 			struct intel_context *ce = ps->ce[idx];
2826 
2827 			p->engine = ps->ce[idx]->engine;
2828 			intel_engine_pm_get(p->engine);
2829 
2830 			if (intel_engine_supports_stats(p->engine))
2831 				p->busy = intel_engine_get_busy_time(p->engine,
2832 								     &p->time) + 1;
2833 			else
2834 				p->time = ktime_get();
2835 			p->runtime = -intel_context_get_total_runtime_ns(ce);
2836 		}
2837 
2838 		err = (*fn)(ps);
2839 		if (igt_live_test_end(&t))
2840 			err = -EIO;
2841 
2842 		for (idx = 0; idx < nengines; idx++) {
2843 			struct perf_stats *p = &stats[idx];
2844 			struct intel_context *ce = ps->ce[idx];
2845 			int integer, decimal;
2846 			u64 busy, dt, now;
2847 
2848 			if (p->busy)
2849 				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2850 									       &now),
2851 						    p->busy - 1);
2852 			else
2853 				now = ktime_get();
2854 			p->time = ktime_sub(now, p->time);
2855 
2856 			err = switch_to_kernel_sync(ce, err);
2857 			p->runtime += intel_context_get_total_runtime_ns(ce);
2858 			intel_engine_pm_put(p->engine);
2859 
2860 			busy = 100 * ktime_to_ns(p->busy);
2861 			dt = ktime_to_ns(p->time);
2862 			if (dt) {
2863 				integer = div64_u64(busy, dt);
2864 				busy -= integer * dt;
2865 				decimal = div64_u64(100 * busy, dt);
2866 			} else {
2867 				integer = 0;
2868 				decimal = 0;
2869 			}
2870 
2871 			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2872 				name, p->engine->name, ce->timeline->seqno,
2873 				integer, decimal,
2874 				div_u64(p->runtime, 1000 * 1000),
2875 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2876 		}
2877 	}
2878 
2879 out:
2880 	for (idx = 0; idx < nengines; idx++) {
2881 		if (IS_ERR_OR_NULL(ps->ce[idx]))
2882 			break;
2883 
2884 		intel_context_unpin(ps->ce[idx]);
2885 		intel_context_put(ps->ce[idx]);
2886 	}
2887 	kfree(ps);
2888 
2889 	cpu_latency_qos_remove_request(&qos);
2890 	kfree(stats);
2891 	return err;
2892 }
2893 
2894 static int p_sync0(void *arg)
2895 {
2896 	struct perf_stats *p = arg;
2897 	struct intel_engine_cs *engine = p->engine;
2898 	struct intel_context *ce;
2899 	IGT_TIMEOUT(end_time);
2900 	unsigned long count;
2901 	bool busy;
2902 	int err = 0;
2903 
2904 	ce = intel_context_create(engine);
2905 	if (IS_ERR(ce))
2906 		return PTR_ERR(ce);
2907 
2908 	err = intel_context_pin(ce);
2909 	if (err) {
2910 		intel_context_put(ce);
2911 		return err;
2912 	}
2913 
2914 	if (intel_engine_supports_stats(engine)) {
2915 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2916 		busy = true;
2917 	} else {
2918 		p->time = ktime_get();
2919 		busy = false;
2920 	}
2921 
2922 	count = 0;
2923 	do {
2924 		struct i915_request *rq;
2925 
2926 		rq = i915_request_create(ce);
2927 		if (IS_ERR(rq)) {
2928 			err = PTR_ERR(rq);
2929 			break;
2930 		}
2931 
2932 		i915_request_get(rq);
2933 		i915_request_add(rq);
2934 
2935 		err = 0;
2936 		if (i915_request_wait(rq, 0, HZ) < 0)
2937 			err = -ETIME;
2938 		i915_request_put(rq);
2939 		if (err)
2940 			break;
2941 
2942 		count++;
2943 	} while (!__igt_timeout(end_time, NULL));
2944 
2945 	if (busy) {
2946 		ktime_t now;
2947 
2948 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2949 				    p->busy);
2950 		p->time = ktime_sub(now, p->time);
2951 	} else {
2952 		p->time = ktime_sub(ktime_get(), p->time);
2953 	}
2954 
2955 	err = switch_to_kernel_sync(ce, err);
2956 	p->runtime = intel_context_get_total_runtime_ns(ce);
2957 	p->count = count;
2958 
2959 	intel_context_unpin(ce);
2960 	intel_context_put(ce);
2961 	return err;
2962 }
2963 
2964 static int p_sync1(void *arg)
2965 {
2966 	struct perf_stats *p = arg;
2967 	struct intel_engine_cs *engine = p->engine;
2968 	struct i915_request *prev = NULL;
2969 	struct intel_context *ce;
2970 	IGT_TIMEOUT(end_time);
2971 	unsigned long count;
2972 	bool busy;
2973 	int err = 0;
2974 
2975 	ce = intel_context_create(engine);
2976 	if (IS_ERR(ce))
2977 		return PTR_ERR(ce);
2978 
2979 	err = intel_context_pin(ce);
2980 	if (err) {
2981 		intel_context_put(ce);
2982 		return err;
2983 	}
2984 
2985 	if (intel_engine_supports_stats(engine)) {
2986 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2987 		busy = true;
2988 	} else {
2989 		p->time = ktime_get();
2990 		busy = false;
2991 	}
2992 
2993 	count = 0;
2994 	do {
2995 		struct i915_request *rq;
2996 
2997 		rq = i915_request_create(ce);
2998 		if (IS_ERR(rq)) {
2999 			err = PTR_ERR(rq);
3000 			break;
3001 		}
3002 
3003 		i915_request_get(rq);
3004 		i915_request_add(rq);
3005 
3006 		err = 0;
3007 		if (prev && i915_request_wait(prev, 0, HZ) < 0)
3008 			err = -ETIME;
3009 		i915_request_put(prev);
3010 		prev = rq;
3011 		if (err)
3012 			break;
3013 
3014 		count++;
3015 	} while (!__igt_timeout(end_time, NULL));
3016 	i915_request_put(prev);
3017 
3018 	if (busy) {
3019 		ktime_t now;
3020 
3021 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3022 				    p->busy);
3023 		p->time = ktime_sub(now, p->time);
3024 	} else {
3025 		p->time = ktime_sub(ktime_get(), p->time);
3026 	}
3027 
3028 	err = switch_to_kernel_sync(ce, err);
3029 	p->runtime = intel_context_get_total_runtime_ns(ce);
3030 	p->count = count;
3031 
3032 	intel_context_unpin(ce);
3033 	intel_context_put(ce);
3034 	return err;
3035 }
3036 
3037 static int p_many(void *arg)
3038 {
3039 	struct perf_stats *p = arg;
3040 	struct intel_engine_cs *engine = p->engine;
3041 	struct intel_context *ce;
3042 	IGT_TIMEOUT(end_time);
3043 	unsigned long count;
3044 	int err = 0;
3045 	bool busy;
3046 
3047 	ce = intel_context_create(engine);
3048 	if (IS_ERR(ce))
3049 		return PTR_ERR(ce);
3050 
3051 	err = intel_context_pin(ce);
3052 	if (err) {
3053 		intel_context_put(ce);
3054 		return err;
3055 	}
3056 
3057 	if (intel_engine_supports_stats(engine)) {
3058 		p->busy = intel_engine_get_busy_time(engine, &p->time);
3059 		busy = true;
3060 	} else {
3061 		p->time = ktime_get();
3062 		busy = false;
3063 	}
3064 
3065 	count = 0;
3066 	do {
3067 		struct i915_request *rq;
3068 
3069 		rq = i915_request_create(ce);
3070 		if (IS_ERR(rq)) {
3071 			err = PTR_ERR(rq);
3072 			break;
3073 		}
3074 
3075 		i915_request_add(rq);
3076 		count++;
3077 	} while (!__igt_timeout(end_time, NULL));
3078 
3079 	if (busy) {
3080 		ktime_t now;
3081 
3082 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3083 				    p->busy);
3084 		p->time = ktime_sub(now, p->time);
3085 	} else {
3086 		p->time = ktime_sub(ktime_get(), p->time);
3087 	}
3088 
3089 	err = switch_to_kernel_sync(ce, err);
3090 	p->runtime = intel_context_get_total_runtime_ns(ce);
3091 	p->count = count;
3092 
3093 	intel_context_unpin(ce);
3094 	intel_context_put(ce);
3095 	return err;
3096 }
3097 
/*
 * Run each parallel workload (p_sync0, p_sync1, p_many) on all uabi engines
 * at once, one kthread per engine, and print per-engine statistics (count,
 * busyness, context runtime and walltime) after each workload.
 *
 * Returns 0 on success or a negative error code; the first failing workload
 * stops the test.
 */
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	/* Per-engine slot: the worker's result buffer and its task */
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		/*
		 * NOTE(review): nothing in this function or in the p_*()
		 * workers reads i915->selftest.counter - confirm it is needed.
		 */
		atomic_set(&i915->selftest.counter, nengines);

		/* Spawn one worker per engine, each holding an engine-pm ref */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			/* Keep the task alive until we kthread_stop() it */
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		/*
		 * Collect the workers. The p_*() functions do not poll for a
		 * stop request; they return when their own timeout expires,
		 * and kthread_stop() hands back their return value.
		 */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			/* A failed kthread_run() marks the end of the workers */
			if (IS_ERR(engines[idx].tsk))
				break;

			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			/* Busyness as a percentage with two decimal places */
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}
3203 
3204 int i915_request_perf_selftests(struct drm_i915_private *i915)
3205 {
3206 	static const struct i915_subtest tests[] = {
3207 		SUBTEST(perf_request_latency),
3208 		SUBTEST(perf_series_engines),
3209 		SUBTEST(perf_parallel_engines),
3210 	};
3211 
3212 	if (intel_gt_is_wedged(to_gt(i915)))
3213 		return 0;
3214 
3215 	return i915_subtests(tests, i915);
3216 }
3217