xref: /linux/drivers/gpu/drm/i915/selftests/i915_request.c (revision f6e8dc9edf963dbc99085e54f6ced6da9daa6100)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/pm_qos.h>
26 #include <linux/prime_numbers.h>
27 #include <linux/sort.h>
28 
29 #include <drm/drm_print.h>
30 
31 #include "gem/i915_gem_internal.h"
32 #include "gem/i915_gem_pm.h"
33 #include "gem/selftests/mock_context.h"
34 #include "gt/intel_engine_heartbeat.h"
35 #include "gt/intel_engine_pm.h"
36 #include "gt/intel_engine_user.h"
37 #include "gt/intel_gt.h"
38 #include "gt/intel_gt_clock_utils.h"
39 #include "gt/intel_gt_requests.h"
40 #include "gt/selftest_engine_heartbeat.h"
41 
42 #include "i915_random.h"
43 #include "i915_selftest.h"
44 #include "i915_wait_util.h"
45 #include "igt_flush_test.h"
46 #include "igt_live_test.h"
47 #include "igt_spinner.h"
48 #include "lib_sw_fence.h"
49 #include "mock_drm.h"
50 #include "mock_gem_device.h"
51 
52 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
53 {
54 	struct intel_engine_cs *engine;
55 	unsigned int count;
56 
57 	count = 0;
58 	for_each_uabi_engine(engine, i915)
59 		count++;
60 
61 	return count;
62 }
63 
64 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
65 {
66 	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
67 }
68 
69 static int igt_add_request(void *arg)
70 {
71 	struct drm_i915_private *i915 = arg;
72 	struct i915_request *request;
73 
74 	/* Basic preliminary test to create a request and let it loose! */
75 
76 	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
77 	if (IS_ERR(request))
78 		return PTR_ERR(request);
79 
80 	i915_request_add(request);
81 
82 	return 0;
83 }
84 
85 static int igt_wait_request(void *arg)
86 {
87 	const long T = HZ / 4;
88 	struct drm_i915_private *i915 = arg;
89 	struct i915_request *request;
90 	int err = -EINVAL;
91 
92 	/* Submit a request, then wait upon it */
93 
94 	request = mock_request(rcs0(i915)->kernel_context, T);
95 	if (IS_ERR(request))
96 		return PTR_ERR(request);
97 
98 	i915_request_get(request);
99 
100 	if (i915_request_wait(request, 0, 0) != -ETIME) {
101 		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
102 		goto out_request;
103 	}
104 
105 	if (i915_request_wait(request, 0, T) != -ETIME) {
106 		pr_err("request wait succeeded (expected timeout before submit!)\n");
107 		goto out_request;
108 	}
109 
110 	if (i915_request_completed(request)) {
111 		pr_err("request completed before submit!!\n");
112 		goto out_request;
113 	}
114 
115 	i915_request_add(request);
116 
117 	if (i915_request_wait(request, 0, 0) != -ETIME) {
118 		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
119 		goto out_request;
120 	}
121 
122 	if (i915_request_completed(request)) {
123 		pr_err("request completed immediately!\n");
124 		goto out_request;
125 	}
126 
127 	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
128 		pr_err("request wait succeeded (expected timeout!)\n");
129 		goto out_request;
130 	}
131 
132 	if (i915_request_wait(request, 0, T) == -ETIME) {
133 		pr_err("request wait timed out!\n");
134 		goto out_request;
135 	}
136 
137 	if (!i915_request_completed(request)) {
138 		pr_err("request not complete after waiting!\n");
139 		goto out_request;
140 	}
141 
142 	if (i915_request_wait(request, 0, T) == -ETIME) {
143 		pr_err("request wait timed out when already complete!\n");
144 		goto out_request;
145 	}
146 
147 	err = 0;
148 out_request:
149 	i915_request_put(request);
150 	mock_device_flush(i915);
151 	return err;
152 }
153 
154 static int igt_fence_wait(void *arg)
155 {
156 	const long T = HZ / 4;
157 	struct drm_i915_private *i915 = arg;
158 	struct i915_request *request;
159 	int err = -EINVAL;
160 
161 	/* Submit a request, treat it as a fence and wait upon it */
162 
163 	request = mock_request(rcs0(i915)->kernel_context, T);
164 	if (IS_ERR(request))
165 		return PTR_ERR(request);
166 
167 	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
168 		pr_err("fence wait success before submit (expected timeout)!\n");
169 		goto out;
170 	}
171 
172 	i915_request_add(request);
173 
174 	if (dma_fence_is_signaled(&request->fence)) {
175 		pr_err("fence signaled immediately!\n");
176 		goto out;
177 	}
178 
179 	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
180 		pr_err("fence wait success after submit (expected timeout)!\n");
181 		goto out;
182 	}
183 
184 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
185 		pr_err("fence wait timed out (expected success)!\n");
186 		goto out;
187 	}
188 
189 	if (!dma_fence_is_signaled(&request->fence)) {
190 		pr_err("fence unsignaled after waiting!\n");
191 		goto out;
192 	}
193 
194 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
195 		pr_err("fence wait timed out when complete (expected success)!\n");
196 		goto out;
197 	}
198 
199 	err = 0;
200 out:
201 	mock_device_flush(i915);
202 	return err;
203 }
204 
205 static int igt_request_rewind(void *arg)
206 {
207 	struct drm_i915_private *i915 = arg;
208 	struct i915_request *request, *vip;
209 	struct i915_gem_context *ctx[2];
210 	struct intel_context *ce;
211 	int err = -EINVAL;
212 
213 	ctx[0] = mock_context(i915, "A");
214 	if (!ctx[0]) {
215 		err = -ENOMEM;
216 		goto err_ctx_0;
217 	}
218 
219 	ce = i915_gem_context_get_engine(ctx[0], RCS0);
220 	GEM_BUG_ON(IS_ERR(ce));
221 	request = mock_request(ce, 2 * HZ);
222 	intel_context_put(ce);
223 	if (IS_ERR(request)) {
224 		err = PTR_ERR(request);
225 		goto err_context_0;
226 	}
227 
228 	i915_request_get(request);
229 	i915_request_add(request);
230 
231 	ctx[1] = mock_context(i915, "B");
232 	if (!ctx[1]) {
233 		err = -ENOMEM;
234 		goto err_ctx_1;
235 	}
236 
237 	ce = i915_gem_context_get_engine(ctx[1], RCS0);
238 	GEM_BUG_ON(IS_ERR(ce));
239 	vip = mock_request(ce, 0);
240 	intel_context_put(ce);
241 	if (IS_ERR(vip)) {
242 		err = PTR_ERR(vip);
243 		goto err_context_1;
244 	}
245 
246 	/* Simulate preemption by manual reordering */
247 	if (!mock_cancel_request(request)) {
248 		pr_err("failed to cancel request (already executed)!\n");
249 		i915_request_add(vip);
250 		goto err_context_1;
251 	}
252 	i915_request_get(vip);
253 	i915_request_add(vip);
254 	rcu_read_lock();
255 	request->engine->submit_request(request);
256 	rcu_read_unlock();
257 
258 
259 	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
260 		pr_err("timed out waiting for high priority request\n");
261 		goto err;
262 	}
263 
264 	if (i915_request_completed(request)) {
265 		pr_err("low priority request already completed\n");
266 		goto err;
267 	}
268 
269 	err = 0;
270 err:
271 	i915_request_put(vip);
272 err_context_1:
273 	mock_context_close(ctx[1]);
274 err_ctx_1:
275 	i915_request_put(request);
276 err_context_0:
277 	mock_context_close(ctx[0]);
278 err_ctx_0:
279 	mock_device_flush(i915);
280 	return err;
281 }
282 
283 struct smoketest {
284 	struct intel_engine_cs *engine;
285 	struct i915_gem_context **contexts;
286 	atomic_long_t num_waits, num_fences;
287 	int ncontexts, max_batch;
288 	struct i915_request *(*request_alloc)(struct intel_context *ce);
289 };
290 
291 static struct i915_request *
292 __mock_request_alloc(struct intel_context *ce)
293 {
294 	return mock_request(ce, 0);
295 }
296 
297 static struct i915_request *
298 __live_request_alloc(struct intel_context *ce)
299 {
300 	return intel_context_create_request(ce);
301 }
302 
303 struct smoke_thread {
304 	struct kthread_worker *worker;
305 	struct kthread_work work;
306 	struct smoketest *t;
307 	bool stop;
308 	int result;
309 };
310 
311 static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
312 {
313 	struct smoke_thread *thread = container_of(work, typeof(*thread), work);
314 	struct smoketest *t = thread->t;
315 	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
316 	const unsigned int total = 4 * t->ncontexts + 1;
317 	unsigned int num_waits = 0, num_fences = 0;
318 	struct i915_request **requests;
319 	I915_RND_STATE(prng);
320 	unsigned int *order;
321 	int err = 0;
322 
323 	/*
324 	 * A very simple test to catch the most egregious of list handling bugs.
325 	 *
326 	 * At its heart, we simply create oodles of requests running across
327 	 * multiple kthreads and enable signaling on them, for the sole purpose
328 	 * of stressing our breadcrumb handling. The only inspection we do is
329 	 * that the fences were marked as signaled.
330 	 */
331 
332 	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
333 	if (!requests) {
334 		thread->result = -ENOMEM;
335 		return;
336 	}
337 
338 	order = i915_random_order(total, &prng);
339 	if (!order) {
340 		err = -ENOMEM;
341 		goto out_requests;
342 	}
343 
344 	while (!READ_ONCE(thread->stop)) {
345 		struct i915_sw_fence *submit, *wait;
346 		unsigned int n, count;
347 
348 		submit = heap_fence_create(GFP_KERNEL);
349 		if (!submit) {
350 			err = -ENOMEM;
351 			break;
352 		}
353 
354 		wait = heap_fence_create(GFP_KERNEL);
355 		if (!wait) {
356 			i915_sw_fence_commit(submit);
357 			heap_fence_put(submit);
358 			err = -ENOMEM;
359 			break;
360 		}
361 
362 		i915_random_reorder(order, total, &prng);
363 		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
364 
365 		for (n = 0; n < count; n++) {
366 			struct i915_gem_context *ctx =
367 				t->contexts[order[n] % t->ncontexts];
368 			struct i915_request *rq;
369 			struct intel_context *ce;
370 
371 			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
372 			GEM_BUG_ON(IS_ERR(ce));
373 			rq = t->request_alloc(ce);
374 			intel_context_put(ce);
375 			if (IS_ERR(rq)) {
376 				err = PTR_ERR(rq);
377 				count = n;
378 				break;
379 			}
380 
381 			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
382 							       submit,
383 							       GFP_KERNEL);
384 
385 			requests[n] = i915_request_get(rq);
386 			i915_request_add(rq);
387 
388 			if (err >= 0)
389 				err = i915_sw_fence_await_dma_fence(wait,
390 								    &rq->fence,
391 								    0,
392 								    GFP_KERNEL);
393 
394 			if (err < 0) {
395 				i915_request_put(rq);
396 				count = n;
397 				break;
398 			}
399 		}
400 
401 		i915_sw_fence_commit(submit);
402 		i915_sw_fence_commit(wait);
403 
404 		if (!wait_event_timeout(wait->wait,
405 					i915_sw_fence_done(wait),
406 					5 * HZ)) {
407 			struct i915_request *rq = requests[count - 1];
408 
409 			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
410 			       atomic_read(&wait->pending), count,
411 			       rq->fence.context, rq->fence.seqno,
412 			       t->engine->name);
413 			GEM_TRACE_DUMP();
414 
415 			intel_gt_set_wedged(t->engine->gt);
416 			GEM_BUG_ON(!i915_request_completed(rq));
417 			i915_sw_fence_wait(wait);
418 			err = -EIO;
419 		}
420 
421 		for (n = 0; n < count; n++) {
422 			struct i915_request *rq = requests[n];
423 
424 			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
425 				      &rq->fence.flags)) {
426 				pr_err("%llu:%llu was not signaled!\n",
427 				       rq->fence.context, rq->fence.seqno);
428 				err = -EINVAL;
429 			}
430 
431 			i915_request_put(rq);
432 		}
433 
434 		heap_fence_put(wait);
435 		heap_fence_put(submit);
436 
437 		if (err < 0)
438 			break;
439 
440 		num_fences += count;
441 		num_waits++;
442 
443 		cond_resched();
444 	}
445 
446 	atomic_long_add(num_fences, &t->num_fences);
447 	atomic_long_add(num_waits, &t->num_waits);
448 
449 	kfree(order);
450 out_requests:
451 	kfree(requests);
452 	thread->result = err;
453 }
454 
455 static int mock_breadcrumbs_smoketest(void *arg)
456 {
457 	struct drm_i915_private *i915 = arg;
458 	struct smoketest t = {
459 		.engine = rcs0(i915),
460 		.ncontexts = 1024,
461 		.max_batch = 1024,
462 		.request_alloc = __mock_request_alloc
463 	};
464 	unsigned int ncpus = num_online_cpus();
465 	struct smoke_thread *threads;
466 	unsigned int n;
467 	int ret = 0;
468 
469 	/*
470 	 * Smoketest our breadcrumb/signal handling for requests across multiple
471 	 * threads. A very simple test to only catch the most egregious of bugs.
472 	 * See __igt_breadcrumbs_smoketest();
473 	 */
474 
475 	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
476 	if (!threads)
477 		return -ENOMEM;
478 
479 	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
480 	if (!t.contexts) {
481 		ret = -ENOMEM;
482 		goto out_threads;
483 	}
484 
485 	for (n = 0; n < t.ncontexts; n++) {
486 		t.contexts[n] = mock_context(t.engine->i915, "mock");
487 		if (!t.contexts[n]) {
488 			ret = -ENOMEM;
489 			goto out_contexts;
490 		}
491 	}
492 
493 	for (n = 0; n < ncpus; n++) {
494 		struct kthread_worker *worker;
495 
496 		worker = kthread_run_worker(0, "igt/%d", n);
497 		if (IS_ERR(worker)) {
498 			ret = PTR_ERR(worker);
499 			ncpus = n;
500 			break;
501 		}
502 
503 		threads[n].worker = worker;
504 		threads[n].t = &t;
505 		threads[n].stop = false;
506 		threads[n].result = 0;
507 
508 		kthread_init_work(&threads[n].work,
509 				  __igt_breadcrumbs_smoketest);
510 		kthread_queue_work(worker, &threads[n].work);
511 	}
512 
513 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
514 
515 	for (n = 0; n < ncpus; n++) {
516 		int err;
517 
518 		WRITE_ONCE(threads[n].stop, true);
519 		kthread_flush_work(&threads[n].work);
520 		err = READ_ONCE(threads[n].result);
521 		if (err < 0 && !ret)
522 			ret = err;
523 
524 		kthread_destroy_worker(threads[n].worker);
525 	}
526 	pr_info("Completed %lu waits for %lu fence across %d cpus\n",
527 		atomic_long_read(&t.num_waits),
528 		atomic_long_read(&t.num_fences),
529 		ncpus);
530 
531 out_contexts:
532 	for (n = 0; n < t.ncontexts; n++) {
533 		if (!t.contexts[n])
534 			break;
535 		mock_context_close(t.contexts[n]);
536 	}
537 	kfree(t.contexts);
538 out_threads:
539 	kfree(threads);
540 	return ret;
541 }
542 
543 int i915_request_mock_selftests(void)
544 {
545 	static const struct i915_subtest tests[] = {
546 		SUBTEST(igt_add_request),
547 		SUBTEST(igt_wait_request),
548 		SUBTEST(igt_fence_wait),
549 		SUBTEST(igt_request_rewind),
550 		SUBTEST(mock_breadcrumbs_smoketest),
551 	};
552 	struct drm_i915_private *i915;
553 	intel_wakeref_t wakeref;
554 	int err = 0;
555 
556 	i915 = mock_gem_device();
557 	if (!i915)
558 		return -ENOMEM;
559 
560 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
561 		err = i915_subtests(tests, i915);
562 
563 	mock_destroy_device(i915);
564 
565 	return err;
566 }
567 
568 static int live_nop_request(void *arg)
569 {
570 	struct drm_i915_private *i915 = arg;
571 	struct intel_engine_cs *engine;
572 	struct igt_live_test t;
573 	int err = -ENODEV;
574 
575 	/*
576 	 * Submit various sized batches of empty requests, to each engine
577 	 * (individually), and wait for the batch to complete. We can check
578 	 * the overhead of submitting requests to the hardware.
579 	 */
580 
581 	for_each_uabi_engine(engine, i915) {
582 		unsigned long n, prime;
583 		IGT_TIMEOUT(end_time);
584 		ktime_t times[2] = {};
585 
586 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
587 		if (err)
588 			return err;
589 
590 		intel_engine_pm_get(engine);
591 		for_each_prime_number_from(prime, 1, 8192) {
592 			struct i915_request *request = NULL;
593 
594 			times[1] = ktime_get_raw();
595 
596 			for (n = 0; n < prime; n++) {
597 				i915_request_put(request);
598 				request = i915_request_create(engine->kernel_context);
599 				if (IS_ERR(request))
600 					return PTR_ERR(request);
601 
602 				/*
603 				 * This space is left intentionally blank.
604 				 *
605 				 * We do not actually want to perform any
606 				 * action with this request, we just want
607 				 * to measure the latency in allocation
608 				 * and submission of our breadcrumbs -
609 				 * ensuring that the bare request is sufficient
610 				 * for the system to work (i.e. proper HEAD
611 				 * tracking of the rings, interrupt handling,
612 				 * etc). It also gives us the lowest bounds
613 				 * for latency.
614 				 */
615 
616 				i915_request_get(request);
617 				i915_request_add(request);
618 			}
619 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
620 			i915_request_put(request);
621 
622 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
623 			if (prime == 1)
624 				times[0] = times[1];
625 
626 			if (__igt_timeout(end_time, NULL))
627 				break;
628 		}
629 		intel_engine_pm_put(engine);
630 
631 		err = igt_live_test_end(&t);
632 		if (err)
633 			return err;
634 
635 		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
636 			engine->name,
637 			ktime_to_ns(times[0]),
638 			prime, div64_u64(ktime_to_ns(times[1]), prime));
639 	}
640 
641 	return err;
642 }
643 
644 static int __cancel_inactive(struct intel_engine_cs *engine)
645 {
646 	struct intel_context *ce;
647 	struct igt_spinner spin;
648 	struct i915_request *rq;
649 	int err = 0;
650 
651 	if (igt_spinner_init(&spin, engine->gt))
652 		return -ENOMEM;
653 
654 	ce = intel_context_create(engine);
655 	if (IS_ERR(ce)) {
656 		err = PTR_ERR(ce);
657 		goto out_spin;
658 	}
659 
660 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
661 	if (IS_ERR(rq)) {
662 		err = PTR_ERR(rq);
663 		goto out_ce;
664 	}
665 
666 	pr_debug("%s: Cancelling inactive request\n", engine->name);
667 	i915_request_cancel(rq, -EINTR);
668 	i915_request_get(rq);
669 	i915_request_add(rq);
670 
671 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
672 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
673 
674 		pr_err("%s: Failed to cancel inactive request\n", engine->name);
675 		intel_engine_dump(engine, &p, "%s\n", engine->name);
676 		err = -ETIME;
677 		goto out_rq;
678 	}
679 
680 	if (rq->fence.error != -EINTR) {
681 		pr_err("%s: fence not cancelled (%u)\n",
682 		       engine->name, rq->fence.error);
683 		err = -EINVAL;
684 	}
685 
686 out_rq:
687 	i915_request_put(rq);
688 out_ce:
689 	intel_context_put(ce);
690 out_spin:
691 	igt_spinner_fini(&spin);
692 	if (err)
693 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
694 	return err;
695 }
696 
697 static int __cancel_active(struct intel_engine_cs *engine)
698 {
699 	struct intel_context *ce;
700 	struct igt_spinner spin;
701 	struct i915_request *rq;
702 	int err = 0;
703 
704 	if (igt_spinner_init(&spin, engine->gt))
705 		return -ENOMEM;
706 
707 	ce = intel_context_create(engine);
708 	if (IS_ERR(ce)) {
709 		err = PTR_ERR(ce);
710 		goto out_spin;
711 	}
712 
713 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
714 	if (IS_ERR(rq)) {
715 		err = PTR_ERR(rq);
716 		goto out_ce;
717 	}
718 
719 	pr_debug("%s: Cancelling active request\n", engine->name);
720 	i915_request_get(rq);
721 	i915_request_add(rq);
722 	if (!igt_wait_for_spinner(&spin, rq)) {
723 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
724 
725 		pr_err("Failed to start spinner on %s\n", engine->name);
726 		intel_engine_dump(engine, &p, "%s\n", engine->name);
727 		err = -ETIME;
728 		goto out_rq;
729 	}
730 	i915_request_cancel(rq, -EINTR);
731 
732 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
733 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
734 
735 		pr_err("%s: Failed to cancel active request\n", engine->name);
736 		intel_engine_dump(engine, &p, "%s\n", engine->name);
737 		err = -ETIME;
738 		goto out_rq;
739 	}
740 
741 	if (rq->fence.error != -EINTR) {
742 		pr_err("%s: fence not cancelled (%u)\n",
743 		       engine->name, rq->fence.error);
744 		err = -EINVAL;
745 	}
746 
747 out_rq:
748 	i915_request_put(rq);
749 out_ce:
750 	intel_context_put(ce);
751 out_spin:
752 	igt_spinner_fini(&spin);
753 	if (err)
754 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
755 	return err;
756 }
757 
758 static int __cancel_completed(struct intel_engine_cs *engine)
759 {
760 	struct intel_context *ce;
761 	struct igt_spinner spin;
762 	struct i915_request *rq;
763 	int err = 0;
764 
765 	if (igt_spinner_init(&spin, engine->gt))
766 		return -ENOMEM;
767 
768 	ce = intel_context_create(engine);
769 	if (IS_ERR(ce)) {
770 		err = PTR_ERR(ce);
771 		goto out_spin;
772 	}
773 
774 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
775 	if (IS_ERR(rq)) {
776 		err = PTR_ERR(rq);
777 		goto out_ce;
778 	}
779 	igt_spinner_end(&spin);
780 	i915_request_get(rq);
781 	i915_request_add(rq);
782 
783 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
784 		err = -ETIME;
785 		goto out_rq;
786 	}
787 
788 	pr_debug("%s: Cancelling completed request\n", engine->name);
789 	i915_request_cancel(rq, -EINTR);
790 	if (rq->fence.error) {
791 		pr_err("%s: fence not cancelled (%u)\n",
792 		       engine->name, rq->fence.error);
793 		err = -EINVAL;
794 	}
795 
796 out_rq:
797 	i915_request_put(rq);
798 out_ce:
799 	intel_context_put(ce);
800 out_spin:
801 	igt_spinner_fini(&spin);
802 	if (err)
803 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
804 	return err;
805 }
806 
807 /*
808  * Test to prove a non-preemptable request can be cancelled and a subsequent
809  * request on the same context can successfully complete after cancellation.
810  *
811  * Testing methodology is to create a non-preemptible request and submit it,
812  * wait for spinner to start, create a NOP request and submit it, cancel the
813  * spinner, wait for spinner to complete and verify it failed with an error,
814  * finally wait for NOP request to complete verify it succeeded without an
815  * error. Preemption timeout also reduced / restored so test runs in a timely
816  * maner.
817  */
818 static int __cancel_reset(struct drm_i915_private *i915,
819 			  struct intel_engine_cs *engine)
820 {
821 	struct intel_context *ce;
822 	struct igt_spinner spin;
823 	struct i915_request *rq, *nop;
824 	unsigned long preempt_timeout_ms;
825 	int err = 0;
826 
827 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
828 	    !intel_has_reset_engine(engine->gt))
829 		return 0;
830 
831 	preempt_timeout_ms = engine->props.preempt_timeout_ms;
832 	engine->props.preempt_timeout_ms = 100;
833 
834 	if (igt_spinner_init(&spin, engine->gt))
835 		goto out_restore;
836 
837 	ce = intel_context_create(engine);
838 	if (IS_ERR(ce)) {
839 		err = PTR_ERR(ce);
840 		goto out_spin;
841 	}
842 
843 	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
844 	if (IS_ERR(rq)) {
845 		err = PTR_ERR(rq);
846 		goto out_ce;
847 	}
848 
849 	pr_debug("%s: Cancelling active non-preemptable request\n",
850 		 engine->name);
851 	i915_request_get(rq);
852 	i915_request_add(rq);
853 	if (!igt_wait_for_spinner(&spin, rq)) {
854 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
855 
856 		pr_err("Failed to start spinner on %s\n", engine->name);
857 		intel_engine_dump(engine, &p, "%s\n", engine->name);
858 		err = -ETIME;
859 		goto out_rq;
860 	}
861 
862 	nop = intel_context_create_request(ce);
863 	if (IS_ERR(nop))
864 		goto out_rq;
865 	i915_request_get(nop);
866 	i915_request_add(nop);
867 
868 	i915_request_cancel(rq, -EINTR);
869 
870 	if (i915_request_wait(rq, 0, HZ) < 0) {
871 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
872 
873 		pr_err("%s: Failed to cancel hung request\n", engine->name);
874 		intel_engine_dump(engine, &p, "%s\n", engine->name);
875 		err = -ETIME;
876 		goto out_nop;
877 	}
878 
879 	if (rq->fence.error != -EINTR) {
880 		pr_err("%s: fence not cancelled (%u)\n",
881 		       engine->name, rq->fence.error);
882 		err = -EINVAL;
883 		goto out_nop;
884 	}
885 
886 	if (i915_request_wait(nop, 0, HZ) < 0) {
887 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
888 
889 		pr_err("%s: Failed to complete nop request\n", engine->name);
890 		intel_engine_dump(engine, &p, "%s\n", engine->name);
891 		err = -ETIME;
892 		goto out_nop;
893 	}
894 
895 	if (nop->fence.error != 0) {
896 		pr_err("%s: Nop request errored (%u)\n",
897 		       engine->name, nop->fence.error);
898 		err = -EINVAL;
899 	}
900 
901 out_nop:
902 	i915_request_put(nop);
903 out_rq:
904 	i915_request_put(rq);
905 out_ce:
906 	intel_context_put(ce);
907 out_spin:
908 	igt_spinner_fini(&spin);
909 out_restore:
910 	engine->props.preempt_timeout_ms = preempt_timeout_ms;
911 	if (err)
912 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
913 	return err;
914 }
915 
916 static int live_cancel_request(void *arg)
917 {
918 	struct drm_i915_private *i915 = arg;
919 	struct intel_engine_cs *engine;
920 
921 	/*
922 	 * Check cancellation of requests. We expect to be able to immediately
923 	 * cancel active requests, even if they are currently on the GPU.
924 	 */
925 
926 	for_each_uabi_engine(engine, i915) {
927 		struct igt_live_test t;
928 		int err, err2;
929 
930 		if (!intel_engine_has_preemption(engine))
931 			continue;
932 
933 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
934 		if (err)
935 			return err;
936 
937 		err = __cancel_inactive(engine);
938 		if (err == 0)
939 			err = __cancel_active(engine);
940 		if (err == 0)
941 			err = __cancel_completed(engine);
942 
943 		err2 = igt_live_test_end(&t);
944 		if (err)
945 			return err;
946 		if (err2)
947 			return err2;
948 
949 		/* Expects reset so call outside of igt_live_test_* */
950 		err = __cancel_reset(i915, engine);
951 		if (err)
952 			return err;
953 
954 		if (igt_flush_test(i915))
955 			return -EIO;
956 	}
957 
958 	return 0;
959 }
960 
961 static struct i915_vma *empty_batch(struct intel_gt *gt)
962 {
963 	struct drm_i915_gem_object *obj;
964 	struct i915_vma *vma;
965 	u32 *cmd;
966 	int err;
967 
968 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
969 	if (IS_ERR(obj))
970 		return ERR_CAST(obj);
971 
972 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
973 	if (IS_ERR(cmd)) {
974 		err = PTR_ERR(cmd);
975 		goto err;
976 	}
977 
978 	*cmd = MI_BATCH_BUFFER_END;
979 
980 	__i915_gem_object_flush_map(obj, 0, 64);
981 	i915_gem_object_unpin_map(obj);
982 
983 	intel_gt_chipset_flush(gt);
984 
985 	vma = i915_vma_instance(obj, gt->vm, NULL);
986 	if (IS_ERR(vma)) {
987 		err = PTR_ERR(vma);
988 		goto err;
989 	}
990 
991 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
992 	if (err)
993 		goto err;
994 
995 	/* Force the wait now to avoid including it in the benchmark */
996 	err = i915_vma_sync(vma);
997 	if (err)
998 		goto err_pin;
999 
1000 	return vma;
1001 
1002 err_pin:
1003 	i915_vma_unpin(vma);
1004 err:
1005 	i915_gem_object_put(obj);
1006 	return ERR_PTR(err);
1007 }
1008 
1009 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch)
1010 {
1011 	return rq->engine->emit_bb_start(rq,
1012 					 i915_vma_offset(batch),
1013 					 i915_vma_size(batch),
1014 					 0);
1015 }
1016 
1017 static struct i915_request *
1018 empty_request(struct intel_engine_cs *engine,
1019 	      struct i915_vma *batch)
1020 {
1021 	struct i915_request *request;
1022 	int err;
1023 
1024 	request = i915_request_create(engine->kernel_context);
1025 	if (IS_ERR(request))
1026 		return request;
1027 
1028 	err = emit_bb_start(request, batch);
1029 	if (err)
1030 		goto out_request;
1031 
1032 	i915_request_get(request);
1033 out_request:
1034 	i915_request_add(request);
1035 	return err ? ERR_PTR(err) : request;
1036 }
1037 
1038 static int live_empty_request(void *arg)
1039 {
1040 	struct drm_i915_private *i915 = arg;
1041 	struct intel_engine_cs *engine;
1042 	struct igt_live_test t;
1043 	int err;
1044 
1045 	/*
1046 	 * Submit various sized batches of empty requests, to each engine
1047 	 * (individually), and wait for the batch to complete. We can check
1048 	 * the overhead of submitting requests to the hardware.
1049 	 */
1050 
1051 	for_each_uabi_engine(engine, i915) {
1052 		IGT_TIMEOUT(end_time);
1053 		struct i915_request *request;
1054 		struct i915_vma *batch;
1055 		unsigned long n, prime;
1056 		ktime_t times[2] = {};
1057 
1058 		batch = empty_batch(engine->gt);
1059 		if (IS_ERR(batch))
1060 			return PTR_ERR(batch);
1061 
1062 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
1063 		if (err)
1064 			goto out_batch;
1065 
1066 		intel_engine_pm_get(engine);
1067 
1068 		/* Warmup / preload */
1069 		request = empty_request(engine, batch);
1070 		if (IS_ERR(request)) {
1071 			err = PTR_ERR(request);
1072 			intel_engine_pm_put(engine);
1073 			goto out_batch;
1074 		}
1075 		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1076 
1077 		for_each_prime_number_from(prime, 1, 8192) {
1078 			times[1] = ktime_get_raw();
1079 
1080 			for (n = 0; n < prime; n++) {
1081 				i915_request_put(request);
1082 				request = empty_request(engine, batch);
1083 				if (IS_ERR(request)) {
1084 					err = PTR_ERR(request);
1085 					intel_engine_pm_put(engine);
1086 					goto out_batch;
1087 				}
1088 			}
1089 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1090 
1091 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
1092 			if (prime == 1)
1093 				times[0] = times[1];
1094 
1095 			if (__igt_timeout(end_time, NULL))
1096 				break;
1097 		}
1098 		i915_request_put(request);
1099 		intel_engine_pm_put(engine);
1100 
1101 		err = igt_live_test_end(&t);
1102 		if (err)
1103 			goto out_batch;
1104 
1105 		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1106 			engine->name,
1107 			ktime_to_ns(times[0]),
1108 			prime, div64_u64(ktime_to_ns(times[1]), prime));
1109 out_batch:
1110 		i915_vma_unpin(batch);
1111 		i915_vma_put(batch);
1112 		if (err)
1113 			break;
1114 	}
1115 
1116 	return err;
1117 }
1118 
1119 static struct i915_vma *recursive_batch(struct intel_gt *gt)
1120 {
1121 	struct drm_i915_gem_object *obj;
1122 	const int ver = GRAPHICS_VER(gt->i915);
1123 	struct i915_vma *vma;
1124 	u32 *cmd;
1125 	int err;
1126 
1127 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
1128 	if (IS_ERR(obj))
1129 		return ERR_CAST(obj);
1130 
1131 	vma = i915_vma_instance(obj, gt->vm, NULL);
1132 	if (IS_ERR(vma)) {
1133 		err = PTR_ERR(vma);
1134 		goto err;
1135 	}
1136 
1137 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
1138 	if (err)
1139 		goto err;
1140 
1141 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1142 	if (IS_ERR(cmd)) {
1143 		err = PTR_ERR(cmd);
1144 		goto err;
1145 	}
1146 
1147 	if (ver >= 8) {
1148 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1149 		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1150 		*cmd++ = upper_32_bits(i915_vma_offset(vma));
1151 	} else if (ver >= 6) {
1152 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1153 		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1154 	} else {
1155 		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1156 		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1157 	}
1158 	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1159 
1160 	__i915_gem_object_flush_map(obj, 0, 64);
1161 	i915_gem_object_unpin_map(obj);
1162 
1163 	intel_gt_chipset_flush(gt);
1164 
1165 	return vma;
1166 
1167 err:
1168 	i915_gem_object_put(obj);
1169 	return ERR_PTR(err);
1170 }
1171 
1172 static int recursive_batch_resolve(struct i915_vma *batch)
1173 {
1174 	u32 *cmd;
1175 
1176 	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1177 	if (IS_ERR(cmd))
1178 		return PTR_ERR(cmd);
1179 
1180 	*cmd = MI_BATCH_BUFFER_END;
1181 
1182 	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1183 	i915_gem_object_unpin_map(batch->obj);
1184 
1185 	intel_gt_chipset_flush(batch->vm->gt);
1186 
1187 	return 0;
1188 }
1189 
1190 static int live_all_engines(void *arg)
1191 {
1192 	struct drm_i915_private *i915 = arg;
1193 	const unsigned int nengines = num_uabi_engines(i915);
1194 	struct intel_engine_cs *engine;
1195 	struct i915_request **request;
1196 	struct igt_live_test t;
1197 	unsigned int idx;
1198 	int err;
1199 
1200 	/*
1201 	 * Check we can submit requests to all engines simultaneously. We
1202 	 * send a recursive batch to each engine - checking that we don't
1203 	 * block doing so, and that they don't complete too soon.
1204 	 */
1205 
1206 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1207 	if (!request)
1208 		return -ENOMEM;
1209 
1210 	err = igt_live_test_begin(&t, i915, __func__, "");
1211 	if (err)
1212 		goto out_free;
1213 
1214 	idx = 0;
1215 	for_each_uabi_engine(engine, i915) {
1216 		struct i915_vma *batch;
1217 
1218 		batch = recursive_batch(engine->gt);
1219 		if (IS_ERR(batch)) {
1220 			err = PTR_ERR(batch);
1221 			pr_err("%s: Unable to create batch, err=%d\n",
1222 			       __func__, err);
1223 			goto out_free;
1224 		}
1225 
1226 		i915_vma_lock(batch);
1227 		request[idx] = intel_engine_create_kernel_request(engine);
1228 		if (IS_ERR(request[idx])) {
1229 			err = PTR_ERR(request[idx]);
1230 			pr_err("%s: Request allocation failed with err=%d\n",
1231 			       __func__, err);
1232 			goto out_unlock;
1233 		}
1234 		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1235 
1236 		err = i915_vma_move_to_active(batch, request[idx], 0);
1237 		GEM_BUG_ON(err);
1238 
1239 		err = emit_bb_start(request[idx], batch);
1240 		GEM_BUG_ON(err);
1241 		request[idx]->batch = batch;
1242 
1243 		i915_request_get(request[idx]);
1244 		i915_request_add(request[idx]);
1245 		idx++;
1246 out_unlock:
1247 		i915_vma_unlock(batch);
1248 		if (err)
1249 			goto out_request;
1250 	}
1251 
1252 	idx = 0;
1253 	for_each_uabi_engine(engine, i915) {
1254 		if (i915_request_completed(request[idx])) {
1255 			pr_err("%s(%s): request completed too early!\n",
1256 			       __func__, engine->name);
1257 			err = -EINVAL;
1258 			goto out_request;
1259 		}
1260 		idx++;
1261 	}
1262 
1263 	idx = 0;
1264 	for_each_uabi_engine(engine, i915) {
1265 		err = recursive_batch_resolve(request[idx]->batch);
1266 		if (err) {
1267 			pr_err("%s: failed to resolve batch, err=%d\n",
1268 			       __func__, err);
1269 			goto out_request;
1270 		}
1271 		idx++;
1272 	}
1273 
1274 	idx = 0;
1275 	for_each_uabi_engine(engine, i915) {
1276 		struct i915_request *rq = request[idx];
1277 		long timeout;
1278 
1279 		timeout = i915_request_wait(rq, 0,
1280 					    MAX_SCHEDULE_TIMEOUT);
1281 		if (timeout < 0) {
1282 			err = timeout;
1283 			pr_err("%s: error waiting for request on %s, err=%d\n",
1284 			       __func__, engine->name, err);
1285 			goto out_request;
1286 		}
1287 
1288 		GEM_BUG_ON(!i915_request_completed(rq));
1289 		i915_vma_unpin(rq->batch);
1290 		i915_vma_put(rq->batch);
1291 		i915_request_put(rq);
1292 		request[idx] = NULL;
1293 		idx++;
1294 	}
1295 
1296 	err = igt_live_test_end(&t);
1297 
1298 out_request:
1299 	idx = 0;
1300 	for_each_uabi_engine(engine, i915) {
1301 		struct i915_request *rq = request[idx];
1302 
1303 		if (!rq)
1304 			continue;
1305 
1306 		if (rq->batch) {
1307 			i915_vma_unpin(rq->batch);
1308 			i915_vma_put(rq->batch);
1309 		}
1310 		i915_request_put(rq);
1311 		idx++;
1312 	}
1313 out_free:
1314 	kfree(request);
1315 	return err;
1316 }
1317 
1318 static int live_sequential_engines(void *arg)
1319 {
1320 	struct drm_i915_private *i915 = arg;
1321 	const unsigned int nengines = num_uabi_engines(i915);
1322 	struct i915_request **request;
1323 	struct i915_request *prev = NULL;
1324 	struct intel_engine_cs *engine;
1325 	struct igt_live_test t;
1326 	unsigned int idx;
1327 	int err;
1328 
1329 	/*
1330 	 * Check we can submit requests to all engines sequentially, such
1331 	 * that each successive request waits for the earlier ones. This
1332 	 * tests that we don't execute requests out of order, even though
1333 	 * they are running on independent engines.
1334 	 */
1335 
1336 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1337 	if (!request)
1338 		return -ENOMEM;
1339 
1340 	err = igt_live_test_begin(&t, i915, __func__, "");
1341 	if (err)
1342 		goto out_free;
1343 
1344 	idx = 0;
1345 	for_each_uabi_engine(engine, i915) {
1346 		struct i915_vma *batch;
1347 
1348 		batch = recursive_batch(engine->gt);
1349 		if (IS_ERR(batch)) {
1350 			err = PTR_ERR(batch);
1351 			pr_err("%s: Unable to create batch for %s, err=%d\n",
1352 			       __func__, engine->name, err);
1353 			goto out_free;
1354 		}
1355 
1356 		i915_vma_lock(batch);
1357 		request[idx] = intel_engine_create_kernel_request(engine);
1358 		if (IS_ERR(request[idx])) {
1359 			err = PTR_ERR(request[idx]);
1360 			pr_err("%s: Request allocation failed for %s with err=%d\n",
1361 			       __func__, engine->name, err);
1362 			goto out_unlock;
1363 		}
1364 		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1365 
1366 		if (prev) {
1367 			err = i915_request_await_dma_fence(request[idx],
1368 							   &prev->fence);
1369 			if (err) {
1370 				i915_request_add(request[idx]);
1371 				pr_err("%s: Request await failed for %s with err=%d\n",
1372 				       __func__, engine->name, err);
1373 				goto out_unlock;
1374 			}
1375 		}
1376 
1377 		err = i915_vma_move_to_active(batch, request[idx], 0);
1378 		GEM_BUG_ON(err);
1379 
1380 		err = emit_bb_start(request[idx], batch);
1381 		GEM_BUG_ON(err);
1382 		request[idx]->batch = batch;
1383 
1384 		i915_request_get(request[idx]);
1385 		i915_request_add(request[idx]);
1386 
1387 		prev = request[idx];
1388 		idx++;
1389 
1390 out_unlock:
1391 		i915_vma_unlock(batch);
1392 		if (err)
1393 			goto out_request;
1394 	}
1395 
1396 	idx = 0;
1397 	for_each_uabi_engine(engine, i915) {
1398 		long timeout;
1399 
1400 		if (i915_request_completed(request[idx])) {
1401 			pr_err("%s(%s): request completed too early!\n",
1402 			       __func__, engine->name);
1403 			err = -EINVAL;
1404 			goto out_request;
1405 		}
1406 
1407 		err = recursive_batch_resolve(request[idx]->batch);
1408 		if (err) {
1409 			pr_err("%s: failed to resolve batch, err=%d\n",
1410 			       __func__, err);
1411 			goto out_request;
1412 		}
1413 
1414 		timeout = i915_request_wait(request[idx], 0,
1415 					    MAX_SCHEDULE_TIMEOUT);
1416 		if (timeout < 0) {
1417 			err = timeout;
1418 			pr_err("%s: error waiting for request on %s, err=%d\n",
1419 			       __func__, engine->name, err);
1420 			goto out_request;
1421 		}
1422 
1423 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1424 		idx++;
1425 	}
1426 
1427 	err = igt_live_test_end(&t);
1428 
1429 out_request:
1430 	idx = 0;
1431 	for_each_uabi_engine(engine, i915) {
1432 		u32 *cmd;
1433 
1434 		if (!request[idx])
1435 			break;
1436 
1437 		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1438 						       I915_MAP_WC);
1439 		if (!IS_ERR(cmd)) {
1440 			*cmd = MI_BATCH_BUFFER_END;
1441 
1442 			__i915_gem_object_flush_map(request[idx]->batch->obj,
1443 						    0, sizeof(*cmd));
1444 			i915_gem_object_unpin_map(request[idx]->batch->obj);
1445 
1446 			intel_gt_chipset_flush(engine->gt);
1447 		}
1448 
1449 		i915_vma_put(request[idx]->batch);
1450 		i915_request_put(request[idx]);
1451 		idx++;
1452 	}
1453 out_free:
1454 	kfree(request);
1455 	return err;
1456 }
1457 
1458 struct parallel_thread {
1459 	struct kthread_worker *worker;
1460 	struct kthread_work work;
1461 	struct intel_engine_cs *engine;
1462 	int result;
1463 };
1464 
1465 static void __live_parallel_engine1(struct kthread_work *work)
1466 {
1467 	struct parallel_thread *thread =
1468 		container_of(work, typeof(*thread), work);
1469 	struct intel_engine_cs *engine = thread->engine;
1470 	IGT_TIMEOUT(end_time);
1471 	unsigned long count;
1472 	int err = 0;
1473 
1474 	count = 0;
1475 	intel_engine_pm_get(engine);
1476 	do {
1477 		struct i915_request *rq;
1478 
1479 		rq = i915_request_create(engine->kernel_context);
1480 		if (IS_ERR(rq)) {
1481 			err = PTR_ERR(rq);
1482 			break;
1483 		}
1484 
1485 		i915_request_get(rq);
1486 		i915_request_add(rq);
1487 
1488 		err = 0;
1489 		if (i915_request_wait(rq, 0, HZ) < 0)
1490 			err = -ETIME;
1491 		i915_request_put(rq);
1492 		if (err)
1493 			break;
1494 
1495 		count++;
1496 	} while (!__igt_timeout(end_time, NULL));
1497 	intel_engine_pm_put(engine);
1498 
1499 	pr_info("%s: %lu request + sync\n", engine->name, count);
1500 	thread->result = err;
1501 }
1502 
1503 static void __live_parallel_engineN(struct kthread_work *work)
1504 {
1505 	struct parallel_thread *thread =
1506 		container_of(work, typeof(*thread), work);
1507 	struct intel_engine_cs *engine = thread->engine;
1508 	IGT_TIMEOUT(end_time);
1509 	unsigned long count;
1510 	int err = 0;
1511 
1512 	count = 0;
1513 	intel_engine_pm_get(engine);
1514 	do {
1515 		struct i915_request *rq;
1516 
1517 		rq = i915_request_create(engine->kernel_context);
1518 		if (IS_ERR(rq)) {
1519 			err = PTR_ERR(rq);
1520 			break;
1521 		}
1522 
1523 		i915_request_add(rq);
1524 		count++;
1525 	} while (!__igt_timeout(end_time, NULL));
1526 	intel_engine_pm_put(engine);
1527 
1528 	pr_info("%s: %lu requests\n", engine->name, count);
1529 	thread->result = err;
1530 }
1531 
1532 static bool wake_all(struct drm_i915_private *i915)
1533 {
1534 	if (atomic_dec_and_test(&i915->selftest.counter)) {
1535 		wake_up_var(&i915->selftest.counter);
1536 		return true;
1537 	}
1538 
1539 	return false;
1540 }
1541 
1542 static int wait_for_all(struct drm_i915_private *i915)
1543 {
1544 	if (wake_all(i915))
1545 		return 0;
1546 
1547 	if (wait_var_event_timeout(&i915->selftest.counter,
1548 				   !atomic_read(&i915->selftest.counter),
1549 				   i915_selftest.timeout_jiffies))
1550 		return 0;
1551 
1552 	return -ETIME;
1553 }
1554 
1555 static void __live_parallel_spin(struct kthread_work *work)
1556 {
1557 	struct parallel_thread *thread =
1558 		container_of(work, typeof(*thread), work);
1559 	struct intel_engine_cs *engine = thread->engine;
1560 	struct igt_spinner spin;
1561 	struct i915_request *rq;
1562 	int err = 0;
1563 
1564 	/*
1565 	 * Create a spinner running for eternity on each engine. If a second
1566 	 * spinner is incorrectly placed on the same engine, it will not be
1567 	 * able to start in time.
1568 	 */
1569 
1570 	if (igt_spinner_init(&spin, engine->gt)) {
1571 		wake_all(engine->i915);
1572 		thread->result = -ENOMEM;
1573 		return;
1574 	}
1575 
1576 	intel_engine_pm_get(engine);
1577 	rq = igt_spinner_create_request(&spin,
1578 					engine->kernel_context,
1579 					MI_NOOP); /* no preemption */
1580 	intel_engine_pm_put(engine);
1581 	if (IS_ERR(rq)) {
1582 		err = PTR_ERR(rq);
1583 		if (err == -ENODEV)
1584 			err = 0;
1585 		wake_all(engine->i915);
1586 		goto out_spin;
1587 	}
1588 
1589 	i915_request_get(rq);
1590 	i915_request_add(rq);
1591 	if (igt_wait_for_spinner(&spin, rq)) {
1592 		/* Occupy this engine for the whole test */
1593 		err = wait_for_all(engine->i915);
1594 	} else {
1595 		pr_err("Failed to start spinner on %s\n", engine->name);
1596 		err = -EINVAL;
1597 	}
1598 	igt_spinner_end(&spin);
1599 
1600 	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1601 		err = -EIO;
1602 	i915_request_put(rq);
1603 
1604 out_spin:
1605 	igt_spinner_fini(&spin);
1606 	thread->result = err;
1607 }
1608 
1609 static int live_parallel_engines(void *arg)
1610 {
1611 	struct drm_i915_private *i915 = arg;
1612 	static void (* const func[])(struct kthread_work *) = {
1613 		__live_parallel_engine1,
1614 		__live_parallel_engineN,
1615 		__live_parallel_spin,
1616 		NULL,
1617 	};
1618 	const unsigned int nengines = num_uabi_engines(i915);
1619 	struct parallel_thread *threads;
1620 	struct intel_engine_cs *engine;
1621 	void (* const *fn)(struct kthread_work *);
1622 	int err = 0;
1623 
1624 	/*
1625 	 * Check we can submit requests to all engines concurrently. This
1626 	 * tests that we load up the system maximally.
1627 	 */
1628 
1629 	threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL);
1630 	if (!threads)
1631 		return -ENOMEM;
1632 
1633 	for (fn = func; !err && *fn; fn++) {
1634 		char name[KSYM_NAME_LEN];
1635 		struct igt_live_test t;
1636 		unsigned int idx;
1637 
1638 		snprintf(name, sizeof(name), "%ps", *fn);
1639 		err = igt_live_test_begin(&t, i915, __func__, name);
1640 		if (err)
1641 			break;
1642 
1643 		atomic_set(&i915->selftest.counter, nengines);
1644 
1645 		idx = 0;
1646 		for_each_uabi_engine(engine, i915) {
1647 			struct kthread_worker *worker;
1648 
1649 			worker = kthread_run_worker(0, "igt/parallel:%s",
1650 						       engine->name);
1651 			if (IS_ERR(worker)) {
1652 				err = PTR_ERR(worker);
1653 				break;
1654 			}
1655 
1656 			threads[idx].worker = worker;
1657 			threads[idx].result = 0;
1658 			threads[idx].engine = engine;
1659 
1660 			kthread_init_work(&threads[idx].work, *fn);
1661 			kthread_queue_work(worker, &threads[idx].work);
1662 			idx++;
1663 		}
1664 
1665 		idx = 0;
1666 		for_each_uabi_engine(engine, i915) {
1667 			int status;
1668 
1669 			if (!threads[idx].worker)
1670 				break;
1671 
1672 			kthread_flush_work(&threads[idx].work);
1673 			status = READ_ONCE(threads[idx].result);
1674 			if (status && !err)
1675 				err = status;
1676 
1677 			kthread_destroy_worker(threads[idx++].worker);
1678 		}
1679 
1680 		if (igt_live_test_end(&t))
1681 			err = -EIO;
1682 	}
1683 
1684 	kfree(threads);
1685 	return err;
1686 }
1687 
1688 static int
1689 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1690 {
1691 	struct i915_request *rq;
1692 	int ret;
1693 
1694 	/*
1695 	 * Before execlists, all contexts share the same ringbuffer. With
1696 	 * execlists, each context/engine has a separate ringbuffer and
1697 	 * for the purposes of this test, inexhaustible.
1698 	 *
1699 	 * For the global ringbuffer though, we have to be very careful
1700 	 * that we do not wrap while preventing the execution of requests
1701 	 * with a unsignaled fence.
1702 	 */
1703 	if (HAS_EXECLISTS(ctx->i915))
1704 		return INT_MAX;
1705 
1706 	rq = igt_request_alloc(ctx, engine);
1707 	if (IS_ERR(rq)) {
1708 		ret = PTR_ERR(rq);
1709 	} else {
1710 		int sz;
1711 
1712 		ret = rq->ring->size - rq->reserved_space;
1713 		i915_request_add(rq);
1714 
1715 		sz = rq->ring->emit - rq->head;
1716 		if (sz < 0)
1717 			sz += rq->ring->size;
1718 		ret /= sz;
1719 		ret /= 2; /* leave half spare, in case of emergency! */
1720 	}
1721 
1722 	return ret;
1723 }
1724 
1725 static int live_breadcrumbs_smoketest(void *arg)
1726 {
1727 	struct drm_i915_private *i915 = arg;
1728 	const unsigned int nengines = num_uabi_engines(i915);
1729 	const unsigned int ncpus = /* saturate with nengines * ncpus */
1730 		max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
1731 	unsigned long num_waits, num_fences;
1732 	struct intel_engine_cs *engine;
1733 	struct smoke_thread *threads;
1734 	struct igt_live_test live;
1735 	intel_wakeref_t wakeref;
1736 	struct smoketest *smoke;
1737 	unsigned int n, idx;
1738 	struct file *file;
1739 	int ret = 0;
1740 
1741 	/*
1742 	 * Smoketest our breadcrumb/signal handling for requests across multiple
1743 	 * threads. A very simple test to only catch the most egregious of bugs.
1744 	 * See __igt_breadcrumbs_smoketest();
1745 	 *
1746 	 * On real hardware this time.
1747 	 */
1748 
1749 	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1750 
1751 	file = mock_file(i915);
1752 	if (IS_ERR(file)) {
1753 		ret = PTR_ERR(file);
1754 		goto out_rpm;
1755 	}
1756 
1757 	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1758 	if (!smoke) {
1759 		ret = -ENOMEM;
1760 		goto out_file;
1761 	}
1762 
1763 	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1764 	if (!threads) {
1765 		ret = -ENOMEM;
1766 		goto out_smoke;
1767 	}
1768 
1769 	smoke[0].request_alloc = __live_request_alloc;
1770 	smoke[0].ncontexts = 64;
1771 	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1772 				    sizeof(*smoke[0].contexts),
1773 				    GFP_KERNEL);
1774 	if (!smoke[0].contexts) {
1775 		ret = -ENOMEM;
1776 		goto out_threads;
1777 	}
1778 
1779 	for (n = 0; n < smoke[0].ncontexts; n++) {
1780 		smoke[0].contexts[n] = live_context(i915, file);
1781 		if (IS_ERR(smoke[0].contexts[n])) {
1782 			ret = PTR_ERR(smoke[0].contexts[n]);
1783 			goto out_contexts;
1784 		}
1785 	}
1786 
1787 	ret = igt_live_test_begin(&live, i915, __func__, "");
1788 	if (ret)
1789 		goto out_contexts;
1790 
1791 	idx = 0;
1792 	for_each_uabi_engine(engine, i915) {
1793 		smoke[idx] = smoke[0];
1794 		smoke[idx].engine = engine;
1795 		smoke[idx].max_batch =
1796 			max_batches(smoke[0].contexts[0], engine);
1797 		if (smoke[idx].max_batch < 0) {
1798 			ret = smoke[idx].max_batch;
1799 			goto out_flush;
1800 		}
1801 		/* One ring interleaved between requests from all cpus */
1802 		smoke[idx].max_batch /= ncpus + 1;
1803 		pr_debug("Limiting batches to %d requests on %s\n",
1804 			 smoke[idx].max_batch, engine->name);
1805 
1806 		for (n = 0; n < ncpus; n++) {
1807 			unsigned int i = idx * ncpus + n;
1808 			struct kthread_worker *worker;
1809 
1810 			worker = kthread_run_worker(0, "igt/%d.%d", idx, n);
1811 			if (IS_ERR(worker)) {
1812 				ret = PTR_ERR(worker);
1813 				goto out_flush;
1814 			}
1815 
1816 			threads[i].worker = worker;
1817 			threads[i].t = &smoke[idx];
1818 
1819 			kthread_init_work(&threads[i].work,
1820 					  __igt_breadcrumbs_smoketest);
1821 			kthread_queue_work(worker, &threads[i].work);
1822 		}
1823 
1824 		idx++;
1825 	}
1826 
1827 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1828 
1829 out_flush:
1830 	idx = 0;
1831 	num_waits = 0;
1832 	num_fences = 0;
1833 	for_each_uabi_engine(engine, i915) {
1834 		for (n = 0; n < ncpus; n++) {
1835 			unsigned int i = idx * ncpus + n;
1836 			int err;
1837 
1838 			if (!threads[i].worker)
1839 				continue;
1840 
1841 			WRITE_ONCE(threads[i].stop, true);
1842 			kthread_flush_work(&threads[i].work);
1843 			err = READ_ONCE(threads[i].result);
1844 			if (err < 0 && !ret)
1845 				ret = err;
1846 
1847 			kthread_destroy_worker(threads[i].worker);
1848 		}
1849 
1850 		num_waits += atomic_long_read(&smoke[idx].num_waits);
1851 		num_fences += atomic_long_read(&smoke[idx].num_fences);
1852 		idx++;
1853 	}
1854 	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1855 		num_waits, num_fences, idx, ncpus);
1856 
1857 	ret = igt_live_test_end(&live) ?: ret;
1858 out_contexts:
1859 	kfree(smoke[0].contexts);
1860 out_threads:
1861 	kfree(threads);
1862 out_smoke:
1863 	kfree(smoke);
1864 out_file:
1865 	fput(file);
1866 out_rpm:
1867 	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1868 
1869 	return ret;
1870 }
1871 
1872 int i915_request_live_selftests(struct drm_i915_private *i915)
1873 {
1874 	static const struct i915_subtest tests[] = {
1875 		SUBTEST(live_nop_request),
1876 		SUBTEST(live_all_engines),
1877 		SUBTEST(live_sequential_engines),
1878 		SUBTEST(live_parallel_engines),
1879 		SUBTEST(live_empty_request),
1880 		SUBTEST(live_cancel_request),
1881 		SUBTEST(live_breadcrumbs_smoketest),
1882 	};
1883 
1884 	if (intel_gt_is_wedged(to_gt(i915)))
1885 		return 0;
1886 
1887 	return i915_live_subtests(tests, i915);
1888 }
1889 
1890 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1891 {
1892 	struct i915_request *rq;
1893 	struct dma_fence *fence;
1894 
1895 	rq = intel_engine_create_kernel_request(ce->engine);
1896 	if (IS_ERR(rq))
1897 		return PTR_ERR(rq);
1898 
1899 	fence = i915_active_fence_get(&ce->timeline->last_request);
1900 	if (fence) {
1901 		i915_request_await_dma_fence(rq, fence);
1902 		dma_fence_put(fence);
1903 	}
1904 
1905 	rq = i915_request_get(rq);
1906 	i915_request_add(rq);
1907 	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1908 		err = -ETIME;
1909 	i915_request_put(rq);
1910 
1911 	while (!err && !intel_engine_is_idle(ce->engine))
1912 		intel_engine_flush_submission(ce->engine);
1913 
1914 	return err;
1915 }
1916 
1917 struct perf_stats {
1918 	struct intel_engine_cs *engine;
1919 	unsigned long count;
1920 	ktime_t time;
1921 	ktime_t busy;
1922 	u64 runtime;
1923 };
1924 
1925 struct perf_series {
1926 	struct drm_i915_private *i915;
1927 	unsigned int nengines;
1928 	struct intel_context *ce[] __counted_by(nengines);
1929 };
1930 
1931 static int cmp_u32(const void *A, const void *B)
1932 {
1933 	const u32 *a = A, *b = B;
1934 
1935 	return *a - *b;
1936 }
1937 
1938 static u32 trifilter(u32 *a)
1939 {
1940 	u64 sum;
1941 
1942 #define TF_COUNT 5
1943 	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1944 
1945 	sum = mul_u32_u32(a[2], 2);
1946 	sum += a[1];
1947 	sum += a[3];
1948 
1949 	GEM_BUG_ON(sum > U32_MAX);
1950 	return sum;
1951 #define TF_BIAS 2
1952 }
1953 
1954 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1955 {
1956 	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1957 
1958 	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1959 }
1960 
1961 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1962 {
1963 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1964 	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1965 	*cs++ = offset;
1966 	*cs++ = 0;
1967 
1968 	return cs;
1969 }
1970 
1971 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1972 {
1973 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1974 	*cs++ = offset;
1975 	*cs++ = 0;
1976 	*cs++ = value;
1977 
1978 	return cs;
1979 }
1980 
1981 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1982 {
1983 	*cs++ = MI_SEMAPHORE_WAIT |
1984 		MI_SEMAPHORE_GLOBAL_GTT |
1985 		MI_SEMAPHORE_POLL |
1986 		mode;
1987 	*cs++ = value;
1988 	*cs++ = offset;
1989 	*cs++ = 0;
1990 
1991 	return cs;
1992 }
1993 
1994 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1995 {
1996 	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1997 }
1998 
1999 static void semaphore_set(u32 *sema, u32 value)
2000 {
2001 	WRITE_ONCE(*sema, value);
2002 	wmb(); /* flush the update to the cache, and beyond */
2003 }
2004 
2005 static u32 *hwsp_scratch(const struct intel_context *ce)
2006 {
2007 	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
2008 }
2009 
2010 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
2011 {
2012 	return (i915_ggtt_offset(ce->engine->status_page.vma) +
2013 		offset_in_page(dw));
2014 }
2015 
2016 static int measure_semaphore_response(struct intel_context *ce)
2017 {
2018 	u32 *sema = hwsp_scratch(ce);
2019 	const u32 offset = hwsp_offset(ce, sema);
2020 	u32 elapsed[TF_COUNT], cycles;
2021 	struct i915_request *rq;
2022 	u32 *cs;
2023 	int err;
2024 	int i;
2025 
2026 	/*
2027 	 * Measure how many cycles it takes for the HW to detect the change
2028 	 * in a semaphore value.
2029 	 *
2030 	 *    A: read CS_TIMESTAMP from CPU
2031 	 *    poke semaphore
2032 	 *    B: read CS_TIMESTAMP on GPU
2033 	 *
2034 	 * Semaphore latency: B - A
2035 	 */
2036 
2037 	semaphore_set(sema, -1);
2038 
2039 	rq = i915_request_create(ce);
2040 	if (IS_ERR(rq))
2041 		return PTR_ERR(rq);
2042 
2043 	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
2044 	if (IS_ERR(cs)) {
2045 		i915_request_add(rq);
2046 		err = PTR_ERR(cs);
2047 		goto err;
2048 	}
2049 
2050 	cs = emit_store_dw(cs, offset, 0);
2051 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2052 		cs = emit_semaphore_poll_until(cs, offset, i);
2053 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2054 		cs = emit_store_dw(cs, offset, 0);
2055 	}
2056 
2057 	intel_ring_advance(rq, cs);
2058 	i915_request_add(rq);
2059 
2060 	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2061 		err = -EIO;
2062 		goto err;
2063 	}
2064 
2065 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2066 		preempt_disable();
2067 		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2068 		semaphore_set(sema, i);
2069 		preempt_enable();
2070 
2071 		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2072 			err = -EIO;
2073 			goto err;
2074 		}
2075 
2076 		elapsed[i - 1] = sema[i] - cycles;
2077 	}
2078 
2079 	cycles = trifilter(elapsed);
2080 	pr_info("%s: semaphore response %d cycles, %lluns\n",
2081 		ce->engine->name, cycles >> TF_BIAS,
2082 		cycles_to_ns(ce->engine, cycles));
2083 
2084 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2085 
2086 err:
2087 	intel_gt_set_wedged(ce->engine->gt);
2088 	return err;
2089 }
2090 
2091 static int measure_idle_dispatch(struct intel_context *ce)
2092 {
2093 	u32 *sema = hwsp_scratch(ce);
2094 	const u32 offset = hwsp_offset(ce, sema);
2095 	u32 elapsed[TF_COUNT], cycles;
2096 	u32 *cs;
2097 	int err;
2098 	int i;
2099 
2100 	/*
2101 	 * Measure how long it takes for us to submit a request while the
2102 	 * engine is idle, but is resting in our context.
2103 	 *
2104 	 *    A: read CS_TIMESTAMP from CPU
2105 	 *    submit request
2106 	 *    B: read CS_TIMESTAMP on GPU
2107 	 *
2108 	 * Submission latency: B - A
2109 	 */
2110 
2111 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2112 		struct i915_request *rq;
2113 
2114 		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2115 		if (err)
2116 			return err;
2117 
2118 		rq = i915_request_create(ce);
2119 		if (IS_ERR(rq)) {
2120 			err = PTR_ERR(rq);
2121 			goto err;
2122 		}
2123 
2124 		cs = intel_ring_begin(rq, 4);
2125 		if (IS_ERR(cs)) {
2126 			i915_request_add(rq);
2127 			err = PTR_ERR(cs);
2128 			goto err;
2129 		}
2130 
2131 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2132 
2133 		intel_ring_advance(rq, cs);
2134 
2135 		preempt_disable();
2136 		local_bh_disable();
2137 		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2138 		i915_request_add(rq);
2139 		local_bh_enable();
2140 		preempt_enable();
2141 	}
2142 
2143 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2144 	if (err)
2145 		goto err;
2146 
2147 	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2148 		elapsed[i] = sema[i] - elapsed[i];
2149 
2150 	cycles = trifilter(elapsed);
2151 	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2152 		ce->engine->name, cycles >> TF_BIAS,
2153 		cycles_to_ns(ce->engine, cycles));
2154 
2155 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2156 
2157 err:
2158 	intel_gt_set_wedged(ce->engine->gt);
2159 	return err;
2160 }
2161 
2162 static int measure_busy_dispatch(struct intel_context *ce)
2163 {
2164 	u32 *sema = hwsp_scratch(ce);
2165 	const u32 offset = hwsp_offset(ce, sema);
2166 	u32 elapsed[TF_COUNT + 1], cycles;
2167 	u32 *cs;
2168 	int err;
2169 	int i;
2170 
2171 	/*
2172 	 * Measure how long it takes for us to submit a request while the
2173 	 * engine is busy, polling on a semaphore in our context. With
2174 	 * direct submission, this will include the cost of a lite restore.
2175 	 *
2176 	 *    A: read CS_TIMESTAMP from CPU
2177 	 *    submit request
2178 	 *    B: read CS_TIMESTAMP on GPU
2179 	 *
2180 	 * Submission latency: B - A
2181 	 */
2182 
2183 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2184 		struct i915_request *rq;
2185 
2186 		rq = i915_request_create(ce);
2187 		if (IS_ERR(rq)) {
2188 			err = PTR_ERR(rq);
2189 			goto err;
2190 		}
2191 
2192 		cs = intel_ring_begin(rq, 12);
2193 		if (IS_ERR(cs)) {
2194 			i915_request_add(rq);
2195 			err = PTR_ERR(cs);
2196 			goto err;
2197 		}
2198 
2199 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2200 		cs = emit_semaphore_poll_until(cs, offset, i);
2201 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2202 
2203 		intel_ring_advance(rq, cs);
2204 
2205 		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2206 			err = -EIO;
2207 			goto err;
2208 		}
2209 
2210 		preempt_disable();
2211 		local_bh_disable();
2212 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2213 		i915_request_add(rq);
2214 		local_bh_enable();
2215 		semaphore_set(sema, i - 1);
2216 		preempt_enable();
2217 	}
2218 
2219 	wait_for(READ_ONCE(sema[i - 1]), 500);
2220 	semaphore_set(sema, i - 1);
2221 
2222 	for (i = 1; i <= TF_COUNT; i++) {
2223 		GEM_BUG_ON(sema[i] == -1);
2224 		elapsed[i - 1] = sema[i] - elapsed[i];
2225 	}
2226 
2227 	cycles = trifilter(elapsed);
2228 	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2229 		ce->engine->name, cycles >> TF_BIAS,
2230 		cycles_to_ns(ce->engine, cycles));
2231 
2232 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2233 
2234 err:
2235 	intel_gt_set_wedged(ce->engine->gt);
2236 	return err;
2237 }
2238 
2239 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2240 {
2241 	const u32 offset =
2242 		i915_ggtt_offset(engine->status_page.vma) +
2243 		offset_in_page(sema);
2244 	struct i915_request *rq;
2245 	u32 *cs;
2246 
2247 	rq = i915_request_create(engine->kernel_context);
2248 	if (IS_ERR(rq))
2249 		return PTR_ERR(rq);
2250 
2251 	cs = intel_ring_begin(rq, 4);
2252 	if (IS_ERR(cs)) {
2253 		i915_request_add(rq);
2254 		return PTR_ERR(cs);
2255 	}
2256 
2257 	cs = emit_semaphore_poll(cs, mode, value, offset);
2258 
2259 	intel_ring_advance(rq, cs);
2260 	i915_request_add(rq);
2261 
2262 	return 0;
2263 }
2264 
2265 static int measure_inter_request(struct intel_context *ce)
2266 {
2267 	u32 *sema = hwsp_scratch(ce);
2268 	const u32 offset = hwsp_offset(ce, sema);
2269 	u32 elapsed[TF_COUNT + 1], cycles;
2270 	struct i915_sw_fence *submit;
2271 	int i, err;
2272 
2273 	/*
2274 	 * Measure how long it takes to advance from one request into the
2275 	 * next. Between each request we flush the GPU caches to memory,
2276 	 * update the breadcrumbs, and then invalidate those caches.
2277 	 * We queue up all the requests to be submitted in one batch so
2278 	 * it should be one set of contiguous measurements.
2279 	 *
2280 	 *    A: read CS_TIMESTAMP on GPU
2281 	 *    advance request
2282 	 *    B: read CS_TIMESTAMP on GPU
2283 	 *
2284 	 * Request latency: B - A
2285 	 */
2286 
2287 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2288 	if (err)
2289 		return err;
2290 
2291 	submit = heap_fence_create(GFP_KERNEL);
2292 	if (!submit) {
2293 		semaphore_set(sema, 1);
2294 		return -ENOMEM;
2295 	}
2296 
2297 	intel_engine_flush_submission(ce->engine);
2298 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2299 		struct i915_request *rq;
2300 		u32 *cs;
2301 
2302 		rq = i915_request_create(ce);
2303 		if (IS_ERR(rq)) {
2304 			err = PTR_ERR(rq);
2305 			goto err_submit;
2306 		}
2307 
2308 		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2309 						       submit,
2310 						       GFP_KERNEL);
2311 		if (err < 0) {
2312 			i915_request_add(rq);
2313 			goto err_submit;
2314 		}
2315 
2316 		cs = intel_ring_begin(rq, 4);
2317 		if (IS_ERR(cs)) {
2318 			i915_request_add(rq);
2319 			err = PTR_ERR(cs);
2320 			goto err_submit;
2321 		}
2322 
2323 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2324 
2325 		intel_ring_advance(rq, cs);
2326 		i915_request_add(rq);
2327 	}
2328 	i915_sw_fence_commit(submit);
2329 	intel_engine_flush_submission(ce->engine);
2330 	heap_fence_put(submit);
2331 
2332 	semaphore_set(sema, 1);
2333 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2334 	if (err)
2335 		goto err;
2336 
2337 	for (i = 1; i <= TF_COUNT; i++)
2338 		elapsed[i - 1] = sema[i + 1] - sema[i];
2339 
2340 	cycles = trifilter(elapsed);
2341 	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2342 		ce->engine->name, cycles >> TF_BIAS,
2343 		cycles_to_ns(ce->engine, cycles));
2344 
2345 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2346 
2347 err_submit:
2348 	i915_sw_fence_commit(submit);
2349 	heap_fence_put(submit);
2350 	semaphore_set(sema, 1);
2351 err:
2352 	intel_gt_set_wedged(ce->engine->gt);
2353 	return err;
2354 }
2355 
2356 static int measure_context_switch(struct intel_context *ce)
2357 {
2358 	u32 *sema = hwsp_scratch(ce);
2359 	const u32 offset = hwsp_offset(ce, sema);
2360 	struct i915_request *fence = NULL;
2361 	u32 elapsed[TF_COUNT + 1], cycles;
2362 	int i, j, err;
2363 	u32 *cs;
2364 
2365 	/*
2366 	 * Measure how long it takes to advance from one request in one
2367 	 * context to a request in another context. This allows us to
2368 	 * measure how long the context save/restore take, along with all
2369 	 * the inter-context setup we require.
2370 	 *
2371 	 *    A: read CS_TIMESTAMP on GPU
2372 	 *    switch context
2373 	 *    B: read CS_TIMESTAMP on GPU
2374 	 *
2375 	 * Context switch latency: B - A
2376 	 */
2377 
2378 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2379 	if (err)
2380 		return err;
2381 
2382 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2383 		struct intel_context *arr[] = {
2384 			ce, ce->engine->kernel_context
2385 		};
2386 		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2387 
2388 		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2389 			struct i915_request *rq;
2390 
2391 			rq = i915_request_create(arr[j]);
2392 			if (IS_ERR(rq)) {
2393 				err = PTR_ERR(rq);
2394 				goto err_fence;
2395 			}
2396 
2397 			if (fence) {
2398 				err = i915_request_await_dma_fence(rq,
2399 								   &fence->fence);
2400 				if (err) {
2401 					i915_request_add(rq);
2402 					goto err_fence;
2403 				}
2404 			}
2405 
2406 			cs = intel_ring_begin(rq, 4);
2407 			if (IS_ERR(cs)) {
2408 				i915_request_add(rq);
2409 				err = PTR_ERR(cs);
2410 				goto err_fence;
2411 			}
2412 
2413 			cs = emit_timestamp_store(cs, ce, addr);
2414 			addr += sizeof(u32);
2415 
2416 			intel_ring_advance(rq, cs);
2417 
2418 			i915_request_put(fence);
2419 			fence = i915_request_get(rq);
2420 
2421 			i915_request_add(rq);
2422 		}
2423 	}
2424 	i915_request_put(fence);
2425 	intel_engine_flush_submission(ce->engine);
2426 
2427 	semaphore_set(sema, 1);
2428 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2429 	if (err)
2430 		goto err;
2431 
2432 	for (i = 1; i <= TF_COUNT; i++)
2433 		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2434 
2435 	cycles = trifilter(elapsed);
2436 	pr_info("%s: context switch latency %d cycles, %lluns\n",
2437 		ce->engine->name, cycles >> TF_BIAS,
2438 		cycles_to_ns(ce->engine, cycles));
2439 
2440 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2441 
2442 err_fence:
2443 	i915_request_put(fence);
2444 	semaphore_set(sema, 1);
2445 err:
2446 	intel_gt_set_wedged(ce->engine->gt);
2447 	return err;
2448 }
2449 
2450 static int measure_preemption(struct intel_context *ce)
2451 {
2452 	u32 *sema = hwsp_scratch(ce);
2453 	const u32 offset = hwsp_offset(ce, sema);
2454 	u32 elapsed[TF_COUNT], cycles;
2455 	u32 *cs;
2456 	int err;
2457 	int i;
2458 
2459 	/*
2460 	 * We measure two latencies while triggering preemption. The first
2461 	 * latency is how long it takes for us to submit a preempting request.
2462 	 * The second latency is how it takes for us to return from the
2463 	 * preemption back to the original context.
2464 	 *
2465 	 *    A: read CS_TIMESTAMP from CPU
2466 	 *    submit preemption
2467 	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2468 	 *    context switch
2469 	 *    C: read CS_TIMESTAMP on GPU (in original context)
2470 	 *
2471 	 * Preemption dispatch latency: B - A
2472 	 * Preemption switch latency: C - B
2473 	 */
2474 
2475 	if (!intel_engine_has_preemption(ce->engine))
2476 		return 0;
2477 
2478 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2479 		u32 addr = offset + 2 * i * sizeof(u32);
2480 		struct i915_request *rq;
2481 
2482 		rq = i915_request_create(ce);
2483 		if (IS_ERR(rq)) {
2484 			err = PTR_ERR(rq);
2485 			goto err;
2486 		}
2487 
2488 		cs = intel_ring_begin(rq, 12);
2489 		if (IS_ERR(cs)) {
2490 			i915_request_add(rq);
2491 			err = PTR_ERR(cs);
2492 			goto err;
2493 		}
2494 
2495 		cs = emit_store_dw(cs, addr, -1);
2496 		cs = emit_semaphore_poll_until(cs, offset, i);
2497 		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2498 
2499 		intel_ring_advance(rq, cs);
2500 		i915_request_add(rq);
2501 
2502 		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2503 			err = -EIO;
2504 			goto err;
2505 		}
2506 
2507 		rq = i915_request_create(ce->engine->kernel_context);
2508 		if (IS_ERR(rq)) {
2509 			err = PTR_ERR(rq);
2510 			goto err;
2511 		}
2512 
2513 		cs = intel_ring_begin(rq, 8);
2514 		if (IS_ERR(cs)) {
2515 			i915_request_add(rq);
2516 			err = PTR_ERR(cs);
2517 			goto err;
2518 		}
2519 
2520 		cs = emit_timestamp_store(cs, ce, addr);
2521 		cs = emit_store_dw(cs, offset, i);
2522 
2523 		intel_ring_advance(rq, cs);
2524 		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2525 
2526 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2527 		i915_request_add(rq);
2528 	}
2529 
2530 	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2531 		err = -EIO;
2532 		goto err;
2533 	}
2534 
2535 	for (i = 1; i <= TF_COUNT; i++)
2536 		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2537 
2538 	cycles = trifilter(elapsed);
2539 	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2540 		ce->engine->name, cycles >> TF_BIAS,
2541 		cycles_to_ns(ce->engine, cycles));
2542 
2543 	for (i = 1; i <= TF_COUNT; i++)
2544 		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2545 
2546 	cycles = trifilter(elapsed);
2547 	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2548 		ce->engine->name, cycles >> TF_BIAS,
2549 		cycles_to_ns(ce->engine, cycles));
2550 
2551 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2552 
2553 err:
2554 	intel_gt_set_wedged(ce->engine->gt);
2555 	return err;
2556 }
2557 
2558 struct signal_cb {
2559 	struct dma_fence_cb base;
2560 	bool seen;
2561 };
2562 
2563 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2564 {
2565 	struct signal_cb *s = container_of(cb, typeof(*s), base);
2566 
2567 	smp_store_mb(s->seen, true); /* be safe, be strong */
2568 }
2569 
2570 static int measure_completion(struct intel_context *ce)
2571 {
2572 	u32 *sema = hwsp_scratch(ce);
2573 	const u32 offset = hwsp_offset(ce, sema);
2574 	u32 elapsed[TF_COUNT], cycles;
2575 	u32 *cs;
2576 	int err;
2577 	int i;
2578 
2579 	/*
2580 	 * Measure how long it takes for the signal (interrupt) to be
2581 	 * sent from the GPU to be processed by the CPU.
2582 	 *
2583 	 *    A: read CS_TIMESTAMP on GPU
2584 	 *    signal
2585 	 *    B: read CS_TIMESTAMP from CPU
2586 	 *
2587 	 * Completion latency: B - A
2588 	 */
2589 
2590 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2591 		struct signal_cb cb = { .seen = false };
2592 		struct i915_request *rq;
2593 
2594 		rq = i915_request_create(ce);
2595 		if (IS_ERR(rq)) {
2596 			err = PTR_ERR(rq);
2597 			goto err;
2598 		}
2599 
2600 		cs = intel_ring_begin(rq, 12);
2601 		if (IS_ERR(cs)) {
2602 			i915_request_add(rq);
2603 			err = PTR_ERR(cs);
2604 			goto err;
2605 		}
2606 
2607 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2608 		cs = emit_semaphore_poll_until(cs, offset, i);
2609 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2610 
2611 		intel_ring_advance(rq, cs);
2612 
2613 		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2614 		i915_request_add(rq);
2615 
2616 		intel_engine_flush_submission(ce->engine);
2617 		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2618 			err = -EIO;
2619 			goto err;
2620 		}
2621 
2622 		preempt_disable();
2623 		semaphore_set(sema, i);
2624 		while (!READ_ONCE(cb.seen))
2625 			cpu_relax();
2626 
2627 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2628 		preempt_enable();
2629 	}
2630 
2631 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2632 	if (err)
2633 		goto err;
2634 
2635 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2636 		GEM_BUG_ON(sema[i + 1] == -1);
2637 		elapsed[i] = elapsed[i] - sema[i + 1];
2638 	}
2639 
2640 	cycles = trifilter(elapsed);
2641 	pr_info("%s: completion latency %d cycles, %lluns\n",
2642 		ce->engine->name, cycles >> TF_BIAS,
2643 		cycles_to_ns(ce->engine, cycles));
2644 
2645 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2646 
2647 err:
2648 	intel_gt_set_wedged(ce->engine->gt);
2649 	return err;
2650 }
2651 
2652 static void rps_pin(struct intel_gt *gt)
2653 {
2654 	/* Pin the frequency to max */
2655 	atomic_inc(&gt->rps.num_waiters);
2656 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2657 
2658 	mutex_lock(&gt->rps.lock);
2659 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2660 	mutex_unlock(&gt->rps.lock);
2661 }
2662 
2663 static void rps_unpin(struct intel_gt *gt)
2664 {
2665 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2666 	atomic_dec(&gt->rps.num_waiters);
2667 }
2668 
2669 static int perf_request_latency(void *arg)
2670 {
2671 	struct drm_i915_private *i915 = arg;
2672 	struct intel_engine_cs *engine;
2673 	struct pm_qos_request qos;
2674 	int err = 0;
2675 
2676 	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2677 		return 0;
2678 
2679 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2680 
2681 	for_each_uabi_engine(engine, i915) {
2682 		struct intel_context *ce;
2683 
2684 		ce = intel_context_create(engine);
2685 		if (IS_ERR(ce)) {
2686 			err = PTR_ERR(ce);
2687 			goto out;
2688 		}
2689 
2690 		err = intel_context_pin(ce);
2691 		if (err) {
2692 			intel_context_put(ce);
2693 			goto out;
2694 		}
2695 
2696 		st_engine_heartbeat_disable(engine);
2697 		rps_pin(engine->gt);
2698 
2699 		if (err == 0)
2700 			err = measure_semaphore_response(ce);
2701 		if (err == 0)
2702 			err = measure_idle_dispatch(ce);
2703 		if (err == 0)
2704 			err = measure_busy_dispatch(ce);
2705 		if (err == 0)
2706 			err = measure_inter_request(ce);
2707 		if (err == 0)
2708 			err = measure_context_switch(ce);
2709 		if (err == 0)
2710 			err = measure_preemption(ce);
2711 		if (err == 0)
2712 			err = measure_completion(ce);
2713 
2714 		rps_unpin(engine->gt);
2715 		st_engine_heartbeat_enable(engine);
2716 
2717 		intel_context_unpin(ce);
2718 		intel_context_put(ce);
2719 		if (err)
2720 			goto out;
2721 	}
2722 
2723 out:
2724 	if (igt_flush_test(i915))
2725 		err = -EIO;
2726 
2727 	cpu_latency_qos_remove_request(&qos);
2728 	return err;
2729 }
2730 
2731 static int s_sync0(void *arg)
2732 {
2733 	struct perf_series *ps = arg;
2734 	IGT_TIMEOUT(end_time);
2735 	unsigned int idx = 0;
2736 	int err = 0;
2737 
2738 	GEM_BUG_ON(!ps->nengines);
2739 	do {
2740 		struct i915_request *rq;
2741 
2742 		rq = i915_request_create(ps->ce[idx]);
2743 		if (IS_ERR(rq)) {
2744 			err = PTR_ERR(rq);
2745 			break;
2746 		}
2747 
2748 		i915_request_get(rq);
2749 		i915_request_add(rq);
2750 
2751 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2752 			err = -ETIME;
2753 		i915_request_put(rq);
2754 		if (err)
2755 			break;
2756 
2757 		if (++idx == ps->nengines)
2758 			idx = 0;
2759 	} while (!__igt_timeout(end_time, NULL));
2760 
2761 	return err;
2762 }
2763 
2764 static int s_sync1(void *arg)
2765 {
2766 	struct perf_series *ps = arg;
2767 	struct i915_request *prev = NULL;
2768 	IGT_TIMEOUT(end_time);
2769 	unsigned int idx = 0;
2770 	int err = 0;
2771 
2772 	GEM_BUG_ON(!ps->nengines);
2773 	do {
2774 		struct i915_request *rq;
2775 
2776 		rq = i915_request_create(ps->ce[idx]);
2777 		if (IS_ERR(rq)) {
2778 			err = PTR_ERR(rq);
2779 			break;
2780 		}
2781 
2782 		i915_request_get(rq);
2783 		i915_request_add(rq);
2784 
2785 		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2786 			err = -ETIME;
2787 		i915_request_put(prev);
2788 		prev = rq;
2789 		if (err)
2790 			break;
2791 
2792 		if (++idx == ps->nengines)
2793 			idx = 0;
2794 	} while (!__igt_timeout(end_time, NULL));
2795 	i915_request_put(prev);
2796 
2797 	return err;
2798 }
2799 
2800 static int s_many(void *arg)
2801 {
2802 	struct perf_series *ps = arg;
2803 	IGT_TIMEOUT(end_time);
2804 	unsigned int idx = 0;
2805 
2806 	GEM_BUG_ON(!ps->nengines);
2807 	do {
2808 		struct i915_request *rq;
2809 
2810 		rq = i915_request_create(ps->ce[idx]);
2811 		if (IS_ERR(rq))
2812 			return PTR_ERR(rq);
2813 
2814 		i915_request_add(rq);
2815 
2816 		if (++idx == ps->nengines)
2817 			idx = 0;
2818 	} while (!__igt_timeout(end_time, NULL));
2819 
2820 	return 0;
2821 }
2822 
2823 static int perf_series_engines(void *arg)
2824 {
2825 	struct drm_i915_private *i915 = arg;
2826 	static int (* const func[])(void *arg) = {
2827 		s_sync0,
2828 		s_sync1,
2829 		s_many,
2830 		NULL,
2831 	};
2832 	const unsigned int nengines = num_uabi_engines(i915);
2833 	struct intel_engine_cs *engine;
2834 	int (* const *fn)(void *arg);
2835 	struct pm_qos_request qos;
2836 	struct perf_stats *stats;
2837 	struct perf_series *ps;
2838 	unsigned int idx;
2839 	int err = 0;
2840 
2841 	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2842 	if (!stats)
2843 		return -ENOMEM;
2844 
2845 	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2846 	if (!ps) {
2847 		kfree(stats);
2848 		return -ENOMEM;
2849 	}
2850 
2851 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2852 
2853 	ps->i915 = i915;
2854 	ps->nengines = nengines;
2855 
2856 	idx = 0;
2857 	for_each_uabi_engine(engine, i915) {
2858 		struct intel_context *ce;
2859 
2860 		ce = intel_context_create(engine);
2861 		if (IS_ERR(ce)) {
2862 			err = PTR_ERR(ce);
2863 			goto out;
2864 		}
2865 
2866 		err = intel_context_pin(ce);
2867 		if (err) {
2868 			intel_context_put(ce);
2869 			goto out;
2870 		}
2871 
2872 		ps->ce[idx++] = ce;
2873 	}
2874 	GEM_BUG_ON(idx != ps->nengines);
2875 
2876 	for (fn = func; *fn && !err; fn++) {
2877 		char name[KSYM_NAME_LEN];
2878 		struct igt_live_test t;
2879 
2880 		snprintf(name, sizeof(name), "%ps", *fn);
2881 		err = igt_live_test_begin(&t, i915, __func__, name);
2882 		if (err)
2883 			break;
2884 
2885 		for (idx = 0; idx < nengines; idx++) {
2886 			struct perf_stats *p =
2887 				memset(&stats[idx], 0, sizeof(stats[idx]));
2888 			struct intel_context *ce = ps->ce[idx];
2889 
2890 			p->engine = ps->ce[idx]->engine;
2891 			intel_engine_pm_get(p->engine);
2892 
2893 			if (intel_engine_supports_stats(p->engine))
2894 				p->busy = intel_engine_get_busy_time(p->engine,
2895 								     &p->time) + 1;
2896 			else
2897 				p->time = ktime_get();
2898 			p->runtime = -intel_context_get_total_runtime_ns(ce);
2899 		}
2900 
2901 		err = (*fn)(ps);
2902 		if (igt_live_test_end(&t))
2903 			err = -EIO;
2904 
2905 		for (idx = 0; idx < nengines; idx++) {
2906 			struct perf_stats *p = &stats[idx];
2907 			struct intel_context *ce = ps->ce[idx];
2908 			int integer, decimal;
2909 			u64 busy, dt, now;
2910 
2911 			if (p->busy)
2912 				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2913 									       &now),
2914 						    p->busy - 1);
2915 			else
2916 				now = ktime_get();
2917 			p->time = ktime_sub(now, p->time);
2918 
2919 			err = switch_to_kernel_sync(ce, err);
2920 			p->runtime += intel_context_get_total_runtime_ns(ce);
2921 			intel_engine_pm_put(p->engine);
2922 
2923 			busy = 100 * ktime_to_ns(p->busy);
2924 			dt = ktime_to_ns(p->time);
2925 			if (dt) {
2926 				integer = div64_u64(busy, dt);
2927 				busy -= integer * dt;
2928 				decimal = div64_u64(100 * busy, dt);
2929 			} else {
2930 				integer = 0;
2931 				decimal = 0;
2932 			}
2933 
2934 			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2935 				name, p->engine->name, ce->timeline->seqno,
2936 				integer, decimal,
2937 				div_u64(p->runtime, 1000 * 1000),
2938 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2939 		}
2940 	}
2941 
2942 out:
2943 	for (idx = 0; idx < nengines; idx++) {
2944 		if (IS_ERR_OR_NULL(ps->ce[idx]))
2945 			break;
2946 
2947 		intel_context_unpin(ps->ce[idx]);
2948 		intel_context_put(ps->ce[idx]);
2949 	}
2950 	kfree(ps);
2951 
2952 	cpu_latency_qos_remove_request(&qos);
2953 	kfree(stats);
2954 	return err;
2955 }
2956 
2957 struct p_thread {
2958 	struct perf_stats p;
2959 	struct kthread_worker *worker;
2960 	struct kthread_work work;
2961 	struct intel_engine_cs *engine;
2962 	int result;
2963 };
2964 
2965 static void p_sync0(struct kthread_work *work)
2966 {
2967 	struct p_thread *thread = container_of(work, typeof(*thread), work);
2968 	struct perf_stats *p = &thread->p;
2969 	struct intel_engine_cs *engine = p->engine;
2970 	struct intel_context *ce;
2971 	IGT_TIMEOUT(end_time);
2972 	unsigned long count;
2973 	bool busy;
2974 	int err = 0;
2975 
2976 	ce = intel_context_create(engine);
2977 	if (IS_ERR(ce)) {
2978 		thread->result = PTR_ERR(ce);
2979 		return;
2980 	}
2981 
2982 	err = intel_context_pin(ce);
2983 	if (err) {
2984 		intel_context_put(ce);
2985 		thread->result = err;
2986 		return;
2987 	}
2988 
2989 	if (intel_engine_supports_stats(engine)) {
2990 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2991 		busy = true;
2992 	} else {
2993 		p->time = ktime_get();
2994 		busy = false;
2995 	}
2996 
2997 	count = 0;
2998 	do {
2999 		struct i915_request *rq;
3000 
3001 		rq = i915_request_create(ce);
3002 		if (IS_ERR(rq)) {
3003 			err = PTR_ERR(rq);
3004 			break;
3005 		}
3006 
3007 		i915_request_get(rq);
3008 		i915_request_add(rq);
3009 
3010 		err = 0;
3011 		if (i915_request_wait(rq, 0, HZ) < 0)
3012 			err = -ETIME;
3013 		i915_request_put(rq);
3014 		if (err)
3015 			break;
3016 
3017 		count++;
3018 	} while (!__igt_timeout(end_time, NULL));
3019 
3020 	if (busy) {
3021 		ktime_t now;
3022 
3023 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3024 				    p->busy);
3025 		p->time = ktime_sub(now, p->time);
3026 	} else {
3027 		p->time = ktime_sub(ktime_get(), p->time);
3028 	}
3029 
3030 	err = switch_to_kernel_sync(ce, err);
3031 	p->runtime = intel_context_get_total_runtime_ns(ce);
3032 	p->count = count;
3033 
3034 	intel_context_unpin(ce);
3035 	intel_context_put(ce);
3036 	thread->result = err;
3037 }
3038 
3039 static void p_sync1(struct kthread_work *work)
3040 {
3041 	struct p_thread *thread = container_of(work, typeof(*thread), work);
3042 	struct perf_stats *p = &thread->p;
3043 	struct intel_engine_cs *engine = p->engine;
3044 	struct i915_request *prev = NULL;
3045 	struct intel_context *ce;
3046 	IGT_TIMEOUT(end_time);
3047 	unsigned long count;
3048 	bool busy;
3049 	int err = 0;
3050 
3051 	ce = intel_context_create(engine);
3052 	if (IS_ERR(ce)) {
3053 		thread->result = PTR_ERR(ce);
3054 		return;
3055 	}
3056 
3057 	err = intel_context_pin(ce);
3058 	if (err) {
3059 		intel_context_put(ce);
3060 		thread->result = err;
3061 		return;
3062 	}
3063 
3064 	if (intel_engine_supports_stats(engine)) {
3065 		p->busy = intel_engine_get_busy_time(engine, &p->time);
3066 		busy = true;
3067 	} else {
3068 		p->time = ktime_get();
3069 		busy = false;
3070 	}
3071 
3072 	count = 0;
3073 	do {
3074 		struct i915_request *rq;
3075 
3076 		rq = i915_request_create(ce);
3077 		if (IS_ERR(rq)) {
3078 			err = PTR_ERR(rq);
3079 			break;
3080 		}
3081 
3082 		i915_request_get(rq);
3083 		i915_request_add(rq);
3084 
3085 		err = 0;
3086 		if (prev && i915_request_wait(prev, 0, HZ) < 0)
3087 			err = -ETIME;
3088 		i915_request_put(prev);
3089 		prev = rq;
3090 		if (err)
3091 			break;
3092 
3093 		count++;
3094 	} while (!__igt_timeout(end_time, NULL));
3095 	i915_request_put(prev);
3096 
3097 	if (busy) {
3098 		ktime_t now;
3099 
3100 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3101 				    p->busy);
3102 		p->time = ktime_sub(now, p->time);
3103 	} else {
3104 		p->time = ktime_sub(ktime_get(), p->time);
3105 	}
3106 
3107 	err = switch_to_kernel_sync(ce, err);
3108 	p->runtime = intel_context_get_total_runtime_ns(ce);
3109 	p->count = count;
3110 
3111 	intel_context_unpin(ce);
3112 	intel_context_put(ce);
3113 	thread->result = err;
3114 }
3115 
3116 static void p_many(struct kthread_work *work)
3117 {
3118 	struct p_thread *thread = container_of(work, typeof(*thread), work);
3119 	struct perf_stats *p = &thread->p;
3120 	struct intel_engine_cs *engine = p->engine;
3121 	struct intel_context *ce;
3122 	IGT_TIMEOUT(end_time);
3123 	unsigned long count;
3124 	int err = 0;
3125 	bool busy;
3126 
3127 	ce = intel_context_create(engine);
3128 	if (IS_ERR(ce)) {
3129 		thread->result = PTR_ERR(ce);
3130 		return;
3131 	}
3132 
3133 	err = intel_context_pin(ce);
3134 	if (err) {
3135 		intel_context_put(ce);
3136 		thread->result = err;
3137 		return;
3138 	}
3139 
3140 	if (intel_engine_supports_stats(engine)) {
3141 		p->busy = intel_engine_get_busy_time(engine, &p->time);
3142 		busy = true;
3143 	} else {
3144 		p->time = ktime_get();
3145 		busy = false;
3146 	}
3147 
3148 	count = 0;
3149 	do {
3150 		struct i915_request *rq;
3151 
3152 		rq = i915_request_create(ce);
3153 		if (IS_ERR(rq)) {
3154 			err = PTR_ERR(rq);
3155 			break;
3156 		}
3157 
3158 		i915_request_add(rq);
3159 		count++;
3160 	} while (!__igt_timeout(end_time, NULL));
3161 
3162 	if (busy) {
3163 		ktime_t now;
3164 
3165 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3166 				    p->busy);
3167 		p->time = ktime_sub(now, p->time);
3168 	} else {
3169 		p->time = ktime_sub(ktime_get(), p->time);
3170 	}
3171 
3172 	err = switch_to_kernel_sync(ce, err);
3173 	p->runtime = intel_context_get_total_runtime_ns(ce);
3174 	p->count = count;
3175 
3176 	intel_context_unpin(ce);
3177 	intel_context_put(ce);
3178 	thread->result = err;
3179 }
3180 
3181 static int perf_parallel_engines(void *arg)
3182 {
3183 	struct drm_i915_private *i915 = arg;
3184 	static void (* const func[])(struct kthread_work *) = {
3185 		p_sync0,
3186 		p_sync1,
3187 		p_many,
3188 		NULL,
3189 	};
3190 	const unsigned int nengines = num_uabi_engines(i915);
3191 	void (* const *fn)(struct kthread_work *);
3192 	struct intel_engine_cs *engine;
3193 	struct pm_qos_request qos;
3194 	struct p_thread *engines;
3195 	int err = 0;
3196 
3197 	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3198 	if (!engines)
3199 		return -ENOMEM;
3200 
3201 	cpu_latency_qos_add_request(&qos, 0);
3202 
3203 	for (fn = func; *fn; fn++) {
3204 		char name[KSYM_NAME_LEN];
3205 		struct igt_live_test t;
3206 		unsigned int idx;
3207 
3208 		snprintf(name, sizeof(name), "%ps", *fn);
3209 		err = igt_live_test_begin(&t, i915, __func__, name);
3210 		if (err)
3211 			break;
3212 
3213 		atomic_set(&i915->selftest.counter, nengines);
3214 
3215 		idx = 0;
3216 		for_each_uabi_engine(engine, i915) {
3217 			struct kthread_worker *worker;
3218 
3219 			intel_engine_pm_get(engine);
3220 
3221 			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3222 
3223 			worker = kthread_run_worker(0, "igt:%s",
3224 						       engine->name);
3225 			if (IS_ERR(worker)) {
3226 				err = PTR_ERR(worker);
3227 				intel_engine_pm_put(engine);
3228 				break;
3229 			}
3230 			engines[idx].worker = worker;
3231 			engines[idx].result = 0;
3232 			engines[idx].p.engine = engine;
3233 			engines[idx].engine = engine;
3234 
3235 			kthread_init_work(&engines[idx].work, *fn);
3236 			kthread_queue_work(worker, &engines[idx].work);
3237 			idx++;
3238 		}
3239 
3240 		idx = 0;
3241 		for_each_uabi_engine(engine, i915) {
3242 			int status;
3243 
3244 			if (!engines[idx].worker)
3245 				break;
3246 
3247 			kthread_flush_work(&engines[idx].work);
3248 			status = READ_ONCE(engines[idx].result);
3249 			if (status && !err)
3250 				err = status;
3251 
3252 			intel_engine_pm_put(engine);
3253 
3254 			kthread_destroy_worker(engines[idx].worker);
3255 			idx++;
3256 		}
3257 
3258 		if (igt_live_test_end(&t))
3259 			err = -EIO;
3260 		if (err)
3261 			break;
3262 
3263 		idx = 0;
3264 		for_each_uabi_engine(engine, i915) {
3265 			struct perf_stats *p = &engines[idx].p;
3266 			u64 busy = 100 * ktime_to_ns(p->busy);
3267 			u64 dt = ktime_to_ns(p->time);
3268 			int integer, decimal;
3269 
3270 			if (dt) {
3271 				integer = div64_u64(busy, dt);
3272 				busy -= integer * dt;
3273 				decimal = div64_u64(100 * busy, dt);
3274 			} else {
3275 				integer = 0;
3276 				decimal = 0;
3277 			}
3278 
3279 			GEM_BUG_ON(engine != p->engine);
3280 			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3281 				name, engine->name, p->count, integer, decimal,
3282 				div_u64(p->runtime, 1000 * 1000),
3283 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3284 			idx++;
3285 		}
3286 	}
3287 
3288 	cpu_latency_qos_remove_request(&qos);
3289 	kfree(engines);
3290 	return err;
3291 }
3292 
3293 int i915_request_perf_selftests(struct drm_i915_private *i915)
3294 {
3295 	static const struct i915_subtest tests[] = {
3296 		SUBTEST(perf_request_latency),
3297 		SUBTEST(perf_series_engines),
3298 		SUBTEST(perf_parallel_engines),
3299 	};
3300 
3301 	if (intel_gt_is_wedged(to_gt(i915)))
3302 		return 0;
3303 
3304 	return i915_subtests(tests, i915);
3305 }
3306