xref: /linux/drivers/gpu/drm/i915/selftests/i915_request.c (revision 07fdad3a93756b872da7b53647715c48d0f4a2d0)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/pm_qos.h>
26 #include <linux/prime_numbers.h>
27 #include <linux/sort.h>
28 
29 #include "gem/i915_gem_internal.h"
30 #include "gem/i915_gem_pm.h"
31 #include "gem/selftests/mock_context.h"
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_clock_utils.h"
37 #include "gt/intel_gt_requests.h"
38 #include "gt/selftest_engine_heartbeat.h"
39 
40 #include "i915_random.h"
41 #include "i915_selftest.h"
42 #include "i915_wait_util.h"
43 #include "igt_flush_test.h"
44 #include "igt_live_test.h"
45 #include "igt_spinner.h"
46 #include "lib_sw_fence.h"
47 #include "mock_drm.h"
48 #include "mock_gem_device.h"
49 
50 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 {
52 	struct intel_engine_cs *engine;
53 	unsigned int count;
54 
55 	count = 0;
56 	for_each_uabi_engine(engine, i915)
57 		count++;
58 
59 	return count;
60 }
61 
62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 {
64 	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
65 }
66 
67 static int igt_add_request(void *arg)
68 {
69 	struct drm_i915_private *i915 = arg;
70 	struct i915_request *request;
71 
72 	/* Basic preliminary test to create a request and let it loose! */
73 
74 	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
75 	if (IS_ERR(request))
76 		return PTR_ERR(request);
77 
78 	i915_request_add(request);
79 
80 	return 0;
81 }
82 
83 static int igt_wait_request(void *arg)
84 {
85 	const long T = HZ / 4;
86 	struct drm_i915_private *i915 = arg;
87 	struct i915_request *request;
88 	int err = -EINVAL;
89 
90 	/* Submit a request, then wait upon it */
91 
92 	request = mock_request(rcs0(i915)->kernel_context, T);
93 	if (IS_ERR(request))
94 		return PTR_ERR(request);
95 
96 	i915_request_get(request);
97 
98 	if (i915_request_wait(request, 0, 0) != -ETIME) {
99 		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
100 		goto out_request;
101 	}
102 
103 	if (i915_request_wait(request, 0, T) != -ETIME) {
104 		pr_err("request wait succeeded (expected timeout before submit!)\n");
105 		goto out_request;
106 	}
107 
108 	if (i915_request_completed(request)) {
109 		pr_err("request completed before submit!!\n");
110 		goto out_request;
111 	}
112 
113 	i915_request_add(request);
114 
115 	if (i915_request_wait(request, 0, 0) != -ETIME) {
116 		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
117 		goto out_request;
118 	}
119 
120 	if (i915_request_completed(request)) {
121 		pr_err("request completed immediately!\n");
122 		goto out_request;
123 	}
124 
125 	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126 		pr_err("request wait succeeded (expected timeout!)\n");
127 		goto out_request;
128 	}
129 
130 	if (i915_request_wait(request, 0, T) == -ETIME) {
131 		pr_err("request wait timed out!\n");
132 		goto out_request;
133 	}
134 
135 	if (!i915_request_completed(request)) {
136 		pr_err("request not complete after waiting!\n");
137 		goto out_request;
138 	}
139 
140 	if (i915_request_wait(request, 0, T) == -ETIME) {
141 		pr_err("request wait timed out when already complete!\n");
142 		goto out_request;
143 	}
144 
145 	err = 0;
146 out_request:
147 	i915_request_put(request);
148 	mock_device_flush(i915);
149 	return err;
150 }
151 
152 static int igt_fence_wait(void *arg)
153 {
154 	const long T = HZ / 4;
155 	struct drm_i915_private *i915 = arg;
156 	struct i915_request *request;
157 	int err = -EINVAL;
158 
159 	/* Submit a request, treat it as a fence and wait upon it */
160 
161 	request = mock_request(rcs0(i915)->kernel_context, T);
162 	if (IS_ERR(request))
163 		return PTR_ERR(request);
164 
165 	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166 		pr_err("fence wait success before submit (expected timeout)!\n");
167 		goto out;
168 	}
169 
170 	i915_request_add(request);
171 
172 	if (dma_fence_is_signaled(&request->fence)) {
173 		pr_err("fence signaled immediately!\n");
174 		goto out;
175 	}
176 
177 	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178 		pr_err("fence wait success after submit (expected timeout)!\n");
179 		goto out;
180 	}
181 
182 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183 		pr_err("fence wait timed out (expected success)!\n");
184 		goto out;
185 	}
186 
187 	if (!dma_fence_is_signaled(&request->fence)) {
188 		pr_err("fence unsignaled after waiting!\n");
189 		goto out;
190 	}
191 
192 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193 		pr_err("fence wait timed out when complete (expected success)!\n");
194 		goto out;
195 	}
196 
197 	err = 0;
198 out:
199 	mock_device_flush(i915);
200 	return err;
201 }
202 
203 static int igt_request_rewind(void *arg)
204 {
205 	struct drm_i915_private *i915 = arg;
206 	struct i915_request *request, *vip;
207 	struct i915_gem_context *ctx[2];
208 	struct intel_context *ce;
209 	int err = -EINVAL;
210 
211 	ctx[0] = mock_context(i915, "A");
212 	if (!ctx[0]) {
213 		err = -ENOMEM;
214 		goto err_ctx_0;
215 	}
216 
217 	ce = i915_gem_context_get_engine(ctx[0], RCS0);
218 	GEM_BUG_ON(IS_ERR(ce));
219 	request = mock_request(ce, 2 * HZ);
220 	intel_context_put(ce);
221 	if (IS_ERR(request)) {
222 		err = PTR_ERR(request);
223 		goto err_context_0;
224 	}
225 
226 	i915_request_get(request);
227 	i915_request_add(request);
228 
229 	ctx[1] = mock_context(i915, "B");
230 	if (!ctx[1]) {
231 		err = -ENOMEM;
232 		goto err_ctx_1;
233 	}
234 
235 	ce = i915_gem_context_get_engine(ctx[1], RCS0);
236 	GEM_BUG_ON(IS_ERR(ce));
237 	vip = mock_request(ce, 0);
238 	intel_context_put(ce);
239 	if (IS_ERR(vip)) {
240 		err = PTR_ERR(vip);
241 		goto err_context_1;
242 	}
243 
244 	/* Simulate preemption by manual reordering */
245 	if (!mock_cancel_request(request)) {
246 		pr_err("failed to cancel request (already executed)!\n");
247 		i915_request_add(vip);
248 		goto err_context_1;
249 	}
250 	i915_request_get(vip);
251 	i915_request_add(vip);
252 	rcu_read_lock();
253 	request->engine->submit_request(request);
254 	rcu_read_unlock();
255 
256 
257 	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
258 		pr_err("timed out waiting for high priority request\n");
259 		goto err;
260 	}
261 
262 	if (i915_request_completed(request)) {
263 		pr_err("low priority request already completed\n");
264 		goto err;
265 	}
266 
267 	err = 0;
268 err:
269 	i915_request_put(vip);
270 err_context_1:
271 	mock_context_close(ctx[1]);
272 err_ctx_1:
273 	i915_request_put(request);
274 err_context_0:
275 	mock_context_close(ctx[0]);
276 err_ctx_0:
277 	mock_device_flush(i915);
278 	return err;
279 }
280 
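/*
 * Shared state for the breadcrumbs smoketests: each worker thread picks
 * random contexts from @contexts, allocates up to @max_batch requests on
 * @engine via @request_alloc, and accumulates its totals into @num_waits
 * and @num_fences.
 */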
281 struct smoketest {
282 	struct intel_engine_cs *engine;
283 	struct i915_gem_context **contexts;
284 	atomic_long_t num_waits, num_fences;
285 	int ncontexts, max_batch;
286 	struct i915_request *(*request_alloc)(struct intel_context *ce);
287 };
288 
289 static struct i915_request *
290 __mock_request_alloc(struct intel_context *ce)
291 {
292 	return mock_request(ce, 0);
293 }
294 
295 static struct i915_request *
296 __live_request_alloc(struct intel_context *ce)
297 {
298 	return intel_context_create_request(ce);
299 }
300 
301 struct smoke_thread {
302 	struct kthread_worker *worker;
303 	struct kthread_work work;
304 	struct smoketest *t;
305 	bool stop;
306 	int result;
307 };
308 
309 static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
310 {
311 	struct smoke_thread *thread = container_of(work, typeof(*thread), work);
312 	struct smoketest *t = thread->t;
313 	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
314 	const unsigned int total = 4 * t->ncontexts + 1;
315 	unsigned int num_waits = 0, num_fences = 0;
316 	struct i915_request **requests;
317 	I915_RND_STATE(prng);
318 	unsigned int *order;
319 	int err = 0;
320 
321 	/*
322 	 * A very simple test to catch the most egregious of list handling bugs.
323 	 *
324 	 * At its heart, we simply create oodles of requests running across
325 	 * multiple kthreads and enable signaling on them, for the sole purpose
326 	 * of stressing our breadcrumb handling. The only inspection we do is
327 	 * that the fences were marked as signaled.
328 	 */
329 
330 	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
331 	if (!requests) {
332 		thread->result = -ENOMEM;
333 		return;
334 	}
335 
336 	order = i915_random_order(total, &prng);
337 	if (!order) {
338 		err = -ENOMEM;
339 		goto out_requests;
340 	}
341 
342 	while (!READ_ONCE(thread->stop)) {
343 		struct i915_sw_fence *submit, *wait;
344 		unsigned int n, count;
345 
346 		submit = heap_fence_create(GFP_KERNEL);
347 		if (!submit) {
348 			err = -ENOMEM;
349 			break;
350 		}
351 
352 		wait = heap_fence_create(GFP_KERNEL);
353 		if (!wait) {
354 			i915_sw_fence_commit(submit);
355 			heap_fence_put(submit);
356 			err = -ENOMEM;
357 			break;
358 		}
359 
360 		i915_random_reorder(order, total, &prng);
361 		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
362 
363 		for (n = 0; n < count; n++) {
364 			struct i915_gem_context *ctx =
365 				t->contexts[order[n] % t->ncontexts];
366 			struct i915_request *rq;
367 			struct intel_context *ce;
368 
369 			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
370 			GEM_BUG_ON(IS_ERR(ce));
371 			rq = t->request_alloc(ce);
372 			intel_context_put(ce);
373 			if (IS_ERR(rq)) {
374 				err = PTR_ERR(rq);
375 				count = n;
376 				break;
377 			}
378 
379 			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
380 							       submit,
381 							       GFP_KERNEL);
382 
383 			requests[n] = i915_request_get(rq);
384 			i915_request_add(rq);
385 
386 			if (err >= 0)
387 				err = i915_sw_fence_await_dma_fence(wait,
388 								    &rq->fence,
389 								    0,
390 								    GFP_KERNEL);
391 
392 			if (err < 0) {
393 				i915_request_put(rq);
394 				count = n;
395 				break;
396 			}
397 		}
398 
399 		i915_sw_fence_commit(submit);
400 		i915_sw_fence_commit(wait);
401 
402 		if (!wait_event_timeout(wait->wait,
403 					i915_sw_fence_done(wait),
404 					5 * HZ)) {
405 			struct i915_request *rq = requests[count - 1];
406 
407 			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
408 			       atomic_read(&wait->pending), count,
409 			       rq->fence.context, rq->fence.seqno,
410 			       t->engine->name);
411 			GEM_TRACE_DUMP();
412 
413 			intel_gt_set_wedged(t->engine->gt);
414 			GEM_BUG_ON(!i915_request_completed(rq));
415 			i915_sw_fence_wait(wait);
416 			err = -EIO;
417 		}
418 
419 		for (n = 0; n < count; n++) {
420 			struct i915_request *rq = requests[n];
421 
422 			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
423 				      &rq->fence.flags)) {
424 				pr_err("%llu:%llu was not signaled!\n",
425 				       rq->fence.context, rq->fence.seqno);
426 				err = -EINVAL;
427 			}
428 
429 			i915_request_put(rq);
430 		}
431 
432 		heap_fence_put(wait);
433 		heap_fence_put(submit);
434 
435 		if (err < 0)
436 			break;
437 
438 		num_fences += count;
439 		num_waits++;
440 
441 		cond_resched();
442 	}
443 
444 	atomic_long_add(num_fences, &t->num_fences);
445 	atomic_long_add(num_waits, &t->num_waits);
446 
447 	kfree(order);
448 out_requests:
449 	kfree(requests);
450 	thread->result = err;
451 }
452 
453 static int mock_breadcrumbs_smoketest(void *arg)
454 {
455 	struct drm_i915_private *i915 = arg;
456 	struct smoketest t = {
457 		.engine = rcs0(i915),
458 		.ncontexts = 1024,
459 		.max_batch = 1024,
460 		.request_alloc = __mock_request_alloc
461 	};
462 	unsigned int ncpus = num_online_cpus();
463 	struct smoke_thread *threads;
464 	unsigned int n;
465 	int ret = 0;
466 
467 	/*
468 	 * Smoketest our breadcrumb/signal handling for requests across multiple
469 	 * threads. A very simple test to only catch the most egregious of bugs.
470 	 * See __igt_breadcrumbs_smoketest();
471 	 */
472 
473 	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
474 	if (!threads)
475 		return -ENOMEM;
476 
477 	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
478 	if (!t.contexts) {
479 		ret = -ENOMEM;
480 		goto out_threads;
481 	}
482 
483 	for (n = 0; n < t.ncontexts; n++) {
484 		t.contexts[n] = mock_context(t.engine->i915, "mock");
485 		if (!t.contexts[n]) {
486 			ret = -ENOMEM;
487 			goto out_contexts;
488 		}
489 	}
490 
491 	for (n = 0; n < ncpus; n++) {
492 		struct kthread_worker *worker;
493 
494 		worker = kthread_run_worker(0, "igt/%d", n);
495 		if (IS_ERR(worker)) {
496 			ret = PTR_ERR(worker);
497 			ncpus = n;
498 			break;
499 		}
500 
501 		threads[n].worker = worker;
502 		threads[n].t = &t;
503 		threads[n].stop = false;
504 		threads[n].result = 0;
505 
506 		kthread_init_work(&threads[n].work,
507 				  __igt_breadcrumbs_smoketest);
508 		kthread_queue_work(worker, &threads[n].work);
509 	}
510 
511 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
512 
513 	for (n = 0; n < ncpus; n++) {
514 		int err;
515 
516 		WRITE_ONCE(threads[n].stop, true);
517 		kthread_flush_work(&threads[n].work);
518 		err = READ_ONCE(threads[n].result);
519 		if (err < 0 && !ret)
520 			ret = err;
521 
522 		kthread_destroy_worker(threads[n].worker);
523 	}
524 	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
525 		atomic_long_read(&t.num_waits),
526 		atomic_long_read(&t.num_fences),
527 		ncpus);
528 
529 out_contexts:
530 	for (n = 0; n < t.ncontexts; n++) {
531 		if (!t.contexts[n])
532 			break;
533 		mock_context_close(t.contexts[n]);
534 	}
535 	kfree(t.contexts);
536 out_threads:
537 	kfree(threads);
538 	return ret;
539 }
540 
541 int i915_request_mock_selftests(void)
542 {
543 	static const struct i915_subtest tests[] = {
544 		SUBTEST(igt_add_request),
545 		SUBTEST(igt_wait_request),
546 		SUBTEST(igt_fence_wait),
547 		SUBTEST(igt_request_rewind),
548 		SUBTEST(mock_breadcrumbs_smoketest),
549 	};
550 	struct drm_i915_private *i915;
551 	intel_wakeref_t wakeref;
552 	int err = 0;
553 
554 	i915 = mock_gem_device();
555 	if (!i915)
556 		return -ENOMEM;
557 
558 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
559 		err = i915_subtests(tests, i915);
560 
561 	mock_destroy_device(i915);
562 
563 	return err;
564 }
565 
566 static int live_nop_request(void *arg)
567 {
568 	struct drm_i915_private *i915 = arg;
569 	struct intel_engine_cs *engine;
570 	struct igt_live_test t;
571 	int err = -ENODEV;
572 
573 	/*
574 	 * Submit various sized batches of empty (nop) requests, to each engine
575 	 * (individually), and wait for the batch to complete. We can check
576 	 * the overhead of submitting requests to the hardware.
577 	 */
578 
579 	for_each_uabi_engine(engine, i915) {
580 		unsigned long n, prime;
581 		IGT_TIMEOUT(end_time);
582 		ktime_t times[2] = {};
583 
584 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
585 		if (err)
586 			return err;
587 
588 		intel_engine_pm_get(engine);
589 		for_each_prime_number_from(prime, 1, 8192) {
590 			struct i915_request *request = NULL;
591 
592 			times[1] = ktime_get_raw();
593 
594 			for (n = 0; n < prime; n++) {
595 				i915_request_put(request);
596 				request = i915_request_create(engine->kernel_context);
597 				if (IS_ERR(request))
598 					return PTR_ERR(request);
599 
600 				/*
601 				 * This space is left intentionally blank.
602 				 *
603 				 * We do not actually want to perform any
604 				 * action with this request, we just want
605 				 * to measure the latency in allocation
606 				 * and submission of our breadcrumbs -
607 				 * ensuring that the bare request is sufficient
608 				 * for the system to work (i.e. proper HEAD
609 				 * tracking of the rings, interrupt handling,
610 				 * etc). It also gives us the lowest bounds
611 				 * for latency.
612 				 */
613 
614 				i915_request_get(request);
615 				i915_request_add(request);
616 			}
617 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
618 			i915_request_put(request);
619 
620 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
621 			if (prime == 1)
622 				times[0] = times[1];
623 
624 			if (__igt_timeout(end_time, NULL))
625 				break;
626 		}
627 		intel_engine_pm_put(engine);
628 
629 		err = igt_live_test_end(&t);
630 		if (err)
631 			return err;
632 
633 		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
634 			engine->name,
635 			ktime_to_ns(times[0]),
636 			prime, div64_u64(ktime_to_ns(times[1]), prime));
637 	}
638 
639 	return err;
640 }
641 
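/*
 * Request cancellation tests: __cancel_inactive() cancels a spinner before it
 * is submitted, __cancel_active() cancels it while it is spinning on the GPU,
 * and __cancel_completed() cancels it only after it has finished. In the
 * first two cases the fence must be marked with -EINTR; an already completed
 * request must keep its (zero) fence error.
 */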
642 static int __cancel_inactive(struct intel_engine_cs *engine)
643 {
644 	struct intel_context *ce;
645 	struct igt_spinner spin;
646 	struct i915_request *rq;
647 	int err = 0;
648 
649 	if (igt_spinner_init(&spin, engine->gt))
650 		return -ENOMEM;
651 
652 	ce = intel_context_create(engine);
653 	if (IS_ERR(ce)) {
654 		err = PTR_ERR(ce);
655 		goto out_spin;
656 	}
657 
658 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
659 	if (IS_ERR(rq)) {
660 		err = PTR_ERR(rq);
661 		goto out_ce;
662 	}
663 
664 	pr_debug("%s: Cancelling inactive request\n", engine->name);
665 	i915_request_cancel(rq, -EINTR);
666 	i915_request_get(rq);
667 	i915_request_add(rq);
668 
669 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
670 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
671 
672 		pr_err("%s: Failed to cancel inactive request\n", engine->name);
673 		intel_engine_dump(engine, &p, "%s\n", engine->name);
674 		err = -ETIME;
675 		goto out_rq;
676 	}
677 
678 	if (rq->fence.error != -EINTR) {
679 		pr_err("%s: fence not cancelled (%u)\n",
680 		       engine->name, rq->fence.error);
681 		err = -EINVAL;
682 	}
683 
684 out_rq:
685 	i915_request_put(rq);
686 out_ce:
687 	intel_context_put(ce);
688 out_spin:
689 	igt_spinner_fini(&spin);
690 	if (err)
691 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
692 	return err;
693 }
694 
695 static int __cancel_active(struct intel_engine_cs *engine)
696 {
697 	struct intel_context *ce;
698 	struct igt_spinner spin;
699 	struct i915_request *rq;
700 	int err = 0;
701 
702 	if (igt_spinner_init(&spin, engine->gt))
703 		return -ENOMEM;
704 
705 	ce = intel_context_create(engine);
706 	if (IS_ERR(ce)) {
707 		err = PTR_ERR(ce);
708 		goto out_spin;
709 	}
710 
711 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
712 	if (IS_ERR(rq)) {
713 		err = PTR_ERR(rq);
714 		goto out_ce;
715 	}
716 
717 	pr_debug("%s: Cancelling active request\n", engine->name);
718 	i915_request_get(rq);
719 	i915_request_add(rq);
720 	if (!igt_wait_for_spinner(&spin, rq)) {
721 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
722 
723 		pr_err("Failed to start spinner on %s\n", engine->name);
724 		intel_engine_dump(engine, &p, "%s\n", engine->name);
725 		err = -ETIME;
726 		goto out_rq;
727 	}
728 	i915_request_cancel(rq, -EINTR);
729 
730 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
731 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
732 
733 		pr_err("%s: Failed to cancel active request\n", engine->name);
734 		intel_engine_dump(engine, &p, "%s\n", engine->name);
735 		err = -ETIME;
736 		goto out_rq;
737 	}
738 
739 	if (rq->fence.error != -EINTR) {
740 		pr_err("%s: fence not cancelled (%u)\n",
741 		       engine->name, rq->fence.error);
742 		err = -EINVAL;
743 	}
744 
745 out_rq:
746 	i915_request_put(rq);
747 out_ce:
748 	intel_context_put(ce);
749 out_spin:
750 	igt_spinner_fini(&spin);
751 	if (err)
752 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
753 	return err;
754 }
755 
756 static int __cancel_completed(struct intel_engine_cs *engine)
757 {
758 	struct intel_context *ce;
759 	struct igt_spinner spin;
760 	struct i915_request *rq;
761 	int err = 0;
762 
763 	if (igt_spinner_init(&spin, engine->gt))
764 		return -ENOMEM;
765 
766 	ce = intel_context_create(engine);
767 	if (IS_ERR(ce)) {
768 		err = PTR_ERR(ce);
769 		goto out_spin;
770 	}
771 
772 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
773 	if (IS_ERR(rq)) {
774 		err = PTR_ERR(rq);
775 		goto out_ce;
776 	}
777 	igt_spinner_end(&spin);
778 	i915_request_get(rq);
779 	i915_request_add(rq);
780 
781 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
782 		err = -ETIME;
783 		goto out_rq;
784 	}
785 
786 	pr_debug("%s: Cancelling completed request\n", engine->name);
787 	i915_request_cancel(rq, -EINTR);
788 	if (rq->fence.error) {
789 		pr_err("%s: fence not cancelled (%u)\n",
790 		       engine->name, rq->fence.error);
791 		err = -EINVAL;
792 	}
793 
794 out_rq:
795 	i915_request_put(rq);
796 out_ce:
797 	intel_context_put(ce);
798 out_spin:
799 	igt_spinner_fini(&spin);
800 	if (err)
801 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
802 	return err;
803 }
804 
805 /*
806  * Test to prove a non-preemptible request can be cancelled and a subsequent
807  * request on the same context can successfully complete after cancellation.
808  *
809  * Testing methodology is to create a non-preemptible spinner request and
810  * submit it, wait for the spinner to start, create a NOP request and submit
811  * it, cancel the spinner, wait for the spinner to complete and verify that it
812  * failed with an error, and finally wait for the NOP request to complete and
813  * verify that it succeeded without an error. The preemption timeout is also
814  * reduced and then restored so that the test runs in a timely manner.
815  */
816 static int __cancel_reset(struct drm_i915_private *i915,
817 			  struct intel_engine_cs *engine)
818 {
819 	struct intel_context *ce;
820 	struct igt_spinner spin;
821 	struct i915_request *rq, *nop;
822 	unsigned long preempt_timeout_ms;
823 	int err = 0;
824 
825 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
826 	    !intel_has_reset_engine(engine->gt))
827 		return 0;
828 
829 	preempt_timeout_ms = engine->props.preempt_timeout_ms;
830 	engine->props.preempt_timeout_ms = 100;
831 
832 	if (igt_spinner_init(&spin, engine->gt)) {
833 		err = -ENOMEM;
		goto out_restore;
	}
834 
835 	ce = intel_context_create(engine);
836 	if (IS_ERR(ce)) {
837 		err = PTR_ERR(ce);
838 		goto out_spin;
839 	}
840 
841 	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
842 	if (IS_ERR(rq)) {
843 		err = PTR_ERR(rq);
844 		goto out_ce;
845 	}
846 
847 	pr_debug("%s: Cancelling active non-preemptable request\n",
848 		 engine->name);
849 	i915_request_get(rq);
850 	i915_request_add(rq);
851 	if (!igt_wait_for_spinner(&spin, rq)) {
852 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
853 
854 		pr_err("Failed to start spinner on %s\n", engine->name);
855 		intel_engine_dump(engine, &p, "%s\n", engine->name);
856 		err = -ETIME;
857 		goto out_rq;
858 	}
859 
860 	nop = intel_context_create_request(ce);
861 	if (IS_ERR(nop)) {
862 		err = PTR_ERR(nop);
		goto out_rq;
	}
863 	i915_request_get(nop);
864 	i915_request_add(nop);
865 
866 	i915_request_cancel(rq, -EINTR);
867 
868 	if (i915_request_wait(rq, 0, HZ) < 0) {
869 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
870 
871 		pr_err("%s: Failed to cancel hung request\n", engine->name);
872 		intel_engine_dump(engine, &p, "%s\n", engine->name);
873 		err = -ETIME;
874 		goto out_nop;
875 	}
876 
877 	if (rq->fence.error != -EINTR) {
878 		pr_err("%s: fence not cancelled (%u)\n",
879 		       engine->name, rq->fence.error);
880 		err = -EINVAL;
881 		goto out_nop;
882 	}
883 
884 	if (i915_request_wait(nop, 0, HZ) < 0) {
885 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
886 
887 		pr_err("%s: Failed to complete nop request\n", engine->name);
888 		intel_engine_dump(engine, &p, "%s\n", engine->name);
889 		err = -ETIME;
890 		goto out_nop;
891 	}
892 
893 	if (nop->fence.error != 0) {
894 		pr_err("%s: Nop request errored (%u)\n",
895 		       engine->name, nop->fence.error);
896 		err = -EINVAL;
897 	}
898 
899 out_nop:
900 	i915_request_put(nop);
901 out_rq:
902 	i915_request_put(rq);
903 out_ce:
904 	intel_context_put(ce);
905 out_spin:
906 	igt_spinner_fini(&spin);
907 out_restore:
908 	engine->props.preempt_timeout_ms = preempt_timeout_ms;
909 	if (err)
910 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
911 	return err;
912 }
913 
914 static int live_cancel_request(void *arg)
915 {
916 	struct drm_i915_private *i915 = arg;
917 	struct intel_engine_cs *engine;
918 
919 	/*
920 	 * Check cancellation of requests. We expect to be able to immediately
921 	 * cancel active requests, even if they are currently on the GPU.
922 	 */
923 
924 	for_each_uabi_engine(engine, i915) {
925 		struct igt_live_test t;
926 		int err, err2;
927 
928 		if (!intel_engine_has_preemption(engine))
929 			continue;
930 
931 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
932 		if (err)
933 			return err;
934 
935 		err = __cancel_inactive(engine);
936 		if (err == 0)
937 			err = __cancel_active(engine);
938 		if (err == 0)
939 			err = __cancel_completed(engine);
940 
941 		err2 = igt_live_test_end(&t);
942 		if (err)
943 			return err;
944 		if (err2)
945 			return err2;
946 
947 		/* Expects reset so call outside of igt_live_test_* */
948 		err = __cancel_reset(i915, engine);
949 		if (err)
950 			return err;
951 
952 		if (igt_flush_test(i915))
953 			return -EIO;
954 	}
955 
956 	return 0;
957 }
958 
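/*
 * Build a one-page batch containing just MI_BATCH_BUFFER_END, pinned into the
 * GT's vm and synced up front so the bind wait is not included in the
 * benchmark; empty_request() then submits it to measure the bare overhead of
 * executing a trivial batch.
 */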
959 static struct i915_vma *empty_batch(struct intel_gt *gt)
960 {
961 	struct drm_i915_gem_object *obj;
962 	struct i915_vma *vma;
963 	u32 *cmd;
964 	int err;
965 
966 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
967 	if (IS_ERR(obj))
968 		return ERR_CAST(obj);
969 
970 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
971 	if (IS_ERR(cmd)) {
972 		err = PTR_ERR(cmd);
973 		goto err;
974 	}
975 
976 	*cmd = MI_BATCH_BUFFER_END;
977 
978 	__i915_gem_object_flush_map(obj, 0, 64);
979 	i915_gem_object_unpin_map(obj);
980 
981 	intel_gt_chipset_flush(gt);
982 
983 	vma = i915_vma_instance(obj, gt->vm, NULL);
984 	if (IS_ERR(vma)) {
985 		err = PTR_ERR(vma);
986 		goto err;
987 	}
988 
989 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
990 	if (err)
991 		goto err;
992 
993 	/* Force the wait now to avoid including it in the benchmark */
994 	err = i915_vma_sync(vma);
995 	if (err)
996 		goto err_pin;
997 
998 	return vma;
999 
1000 err_pin:
1001 	i915_vma_unpin(vma);
1002 err:
1003 	i915_gem_object_put(obj);
1004 	return ERR_PTR(err);
1005 }
1006 
1007 static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch)
1008 {
1009 	return rq->engine->emit_bb_start(rq,
1010 					 i915_vma_offset(batch),
1011 					 i915_vma_size(batch),
1012 					 0);
1013 }
1014 
1015 static struct i915_request *
1016 empty_request(struct intel_engine_cs *engine,
1017 	      struct i915_vma *batch)
1018 {
1019 	struct i915_request *request;
1020 	int err;
1021 
1022 	request = i915_request_create(engine->kernel_context);
1023 	if (IS_ERR(request))
1024 		return request;
1025 
1026 	err = emit_bb_start(request, batch);
1027 	if (err)
1028 		goto out_request;
1029 
1030 	i915_request_get(request);
1031 out_request:
1032 	i915_request_add(request);
1033 	return err ? ERR_PTR(err) : request;
1034 }
1035 
1036 static int live_empty_request(void *arg)
1037 {
1038 	struct drm_i915_private *i915 = arg;
1039 	struct intel_engine_cs *engine;
1040 	struct igt_live_test t;
1041 	int err;
1042 
1043 	/*
1044 	 * Submit various sized batches of empty requests, to each engine
1045 	 * (individually), and wait for the batch to complete. We can check
1046 	 * the overhead of submitting requests to the hardware.
1047 	 */
1048 
1049 	for_each_uabi_engine(engine, i915) {
1050 		IGT_TIMEOUT(end_time);
1051 		struct i915_request *request;
1052 		struct i915_vma *batch;
1053 		unsigned long n, prime;
1054 		ktime_t times[2] = {};
1055 
1056 		batch = empty_batch(engine->gt);
1057 		if (IS_ERR(batch))
1058 			return PTR_ERR(batch);
1059 
1060 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
1061 		if (err)
1062 			goto out_batch;
1063 
1064 		intel_engine_pm_get(engine);
1065 
1066 		/* Warmup / preload */
1067 		request = empty_request(engine, batch);
1068 		if (IS_ERR(request)) {
1069 			err = PTR_ERR(request);
1070 			intel_engine_pm_put(engine);
1071 			goto out_batch;
1072 		}
1073 		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1074 
1075 		for_each_prime_number_from(prime, 1, 8192) {
1076 			times[1] = ktime_get_raw();
1077 
1078 			for (n = 0; n < prime; n++) {
1079 				i915_request_put(request);
1080 				request = empty_request(engine, batch);
1081 				if (IS_ERR(request)) {
1082 					err = PTR_ERR(request);
1083 					intel_engine_pm_put(engine);
1084 					goto out_batch;
1085 				}
1086 			}
1087 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1088 
1089 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
1090 			if (prime == 1)
1091 				times[0] = times[1];
1092 
1093 			if (__igt_timeout(end_time, NULL))
1094 				break;
1095 		}
1096 		i915_request_put(request);
1097 		intel_engine_pm_put(engine);
1098 
1099 		err = igt_live_test_end(&t);
1100 		if (err)
1101 			goto out_batch;
1102 
1103 		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1104 			engine->name,
1105 			ktime_to_ns(times[0]),
1106 			prime, div64_u64(ktime_to_ns(times[1]), prime));
1107 out_batch:
1108 		i915_vma_unpin(batch);
1109 		i915_vma_put(batch);
1110 		if (err)
1111 			break;
1112 	}
1113 
1114 	return err;
1115 }
1116 
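/*
 * Build a batch whose first instruction is an MI_BATCH_BUFFER_START that
 * jumps back to its own start, so a request executing it spins on the GPU
 * indefinitely. recursive_batch_resolve() later overwrites that first dword
 * with MI_BATCH_BUFFER_END to let the request complete.
 */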
1117 static struct i915_vma *recursive_batch(struct intel_gt *gt)
1118 {
1119 	struct drm_i915_gem_object *obj;
1120 	const int ver = GRAPHICS_VER(gt->i915);
1121 	struct i915_vma *vma;
1122 	u32 *cmd;
1123 	int err;
1124 
1125 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
1126 	if (IS_ERR(obj))
1127 		return ERR_CAST(obj);
1128 
1129 	vma = i915_vma_instance(obj, gt->vm, NULL);
1130 	if (IS_ERR(vma)) {
1131 		err = PTR_ERR(vma);
1132 		goto err;
1133 	}
1134 
1135 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
1136 	if (err)
1137 		goto err;
1138 
1139 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1140 	if (IS_ERR(cmd)) {
1141 		err = PTR_ERR(cmd);
1142 		goto err;
1143 	}
1144 
1145 	if (ver >= 8) {
1146 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1147 		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1148 		*cmd++ = upper_32_bits(i915_vma_offset(vma));
1149 	} else if (ver >= 6) {
1150 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1151 		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1152 	} else {
1153 		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1154 		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1155 	}
1156 	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1157 
1158 	__i915_gem_object_flush_map(obj, 0, 64);
1159 	i915_gem_object_unpin_map(obj);
1160 
1161 	intel_gt_chipset_flush(gt);
1162 
1163 	return vma;
1164 
1165 err:
1166 	i915_gem_object_put(obj);
1167 	return ERR_PTR(err);
1168 }
1169 
1170 static int recursive_batch_resolve(struct i915_vma *batch)
1171 {
1172 	u32 *cmd;
1173 
1174 	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1175 	if (IS_ERR(cmd))
1176 		return PTR_ERR(cmd);
1177 
1178 	*cmd = MI_BATCH_BUFFER_END;
1179 
1180 	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1181 	i915_gem_object_unpin_map(batch->obj);
1182 
1183 	intel_gt_chipset_flush(batch->vm->gt);
1184 
1185 	return 0;
1186 }
1187 
1188 static int live_all_engines(void *arg)
1189 {
1190 	struct drm_i915_private *i915 = arg;
1191 	const unsigned int nengines = num_uabi_engines(i915);
1192 	struct intel_engine_cs *engine;
1193 	struct i915_request **request;
1194 	struct igt_live_test t;
1195 	unsigned int idx;
1196 	int err;
1197 
1198 	/*
1199 	 * Check we can submit requests to all engines simultaneously. We
1200 	 * send a recursive batch to each engine - checking that we don't
1201 	 * block doing so, and that they don't complete too soon.
1202 	 */
1203 
1204 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1205 	if (!request)
1206 		return -ENOMEM;
1207 
1208 	err = igt_live_test_begin(&t, i915, __func__, "");
1209 	if (err)
1210 		goto out_free;
1211 
1212 	idx = 0;
1213 	for_each_uabi_engine(engine, i915) {
1214 		struct i915_vma *batch;
1215 
1216 		batch = recursive_batch(engine->gt);
1217 		if (IS_ERR(batch)) {
1218 			err = PTR_ERR(batch);
1219 			pr_err("%s: Unable to create batch, err=%d\n",
1220 			       __func__, err);
1221 			goto out_free;
1222 		}
1223 
1224 		i915_vma_lock(batch);
1225 		request[idx] = intel_engine_create_kernel_request(engine);
1226 		if (IS_ERR(request[idx])) {
1227 			err = PTR_ERR(request[idx]);
1228 			pr_err("%s: Request allocation failed with err=%d\n",
1229 			       __func__, err);
1230 			goto out_unlock;
1231 		}
1232 		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1233 
1234 		err = i915_vma_move_to_active(batch, request[idx], 0);
1235 		GEM_BUG_ON(err);
1236 
1237 		err = emit_bb_start(request[idx], batch);
1238 		GEM_BUG_ON(err);
1239 		request[idx]->batch = batch;
1240 
1241 		i915_request_get(request[idx]);
1242 		i915_request_add(request[idx]);
1243 		idx++;
1244 out_unlock:
1245 		i915_vma_unlock(batch);
1246 		if (err)
1247 			goto out_request;
1248 	}
1249 
1250 	idx = 0;
1251 	for_each_uabi_engine(engine, i915) {
1252 		if (i915_request_completed(request[idx])) {
1253 			pr_err("%s(%s): request completed too early!\n",
1254 			       __func__, engine->name);
1255 			err = -EINVAL;
1256 			goto out_request;
1257 		}
1258 		idx++;
1259 	}
1260 
1261 	idx = 0;
1262 	for_each_uabi_engine(engine, i915) {
1263 		err = recursive_batch_resolve(request[idx]->batch);
1264 		if (err) {
1265 			pr_err("%s: failed to resolve batch, err=%d\n",
1266 			       __func__, err);
1267 			goto out_request;
1268 		}
1269 		idx++;
1270 	}
1271 
1272 	idx = 0;
1273 	for_each_uabi_engine(engine, i915) {
1274 		struct i915_request *rq = request[idx];
1275 		long timeout;
1276 
1277 		timeout = i915_request_wait(rq, 0,
1278 					    MAX_SCHEDULE_TIMEOUT);
1279 		if (timeout < 0) {
1280 			err = timeout;
1281 			pr_err("%s: error waiting for request on %s, err=%d\n",
1282 			       __func__, engine->name, err);
1283 			goto out_request;
1284 		}
1285 
1286 		GEM_BUG_ON(!i915_request_completed(rq));
1287 		i915_vma_unpin(rq->batch);
1288 		i915_vma_put(rq->batch);
1289 		i915_request_put(rq);
1290 		request[idx] = NULL;
1291 		idx++;
1292 	}
1293 
1294 	err = igt_live_test_end(&t);
1295 
1296 out_request:
1297 	idx = 0;
1298 	for_each_uabi_engine(engine, i915) {
1299 		struct i915_request *rq = request[idx];
1300 
1301 		if (!rq)
1302 			continue;
1303 
1304 		if (rq->batch) {
1305 			i915_vma_unpin(rq->batch);
1306 			i915_vma_put(rq->batch);
1307 		}
1308 		i915_request_put(rq);
1309 		idx++;
1310 	}
1311 out_free:
1312 	kfree(request);
1313 	return err;
1314 }
1315 
1316 static int live_sequential_engines(void *arg)
1317 {
1318 	struct drm_i915_private *i915 = arg;
1319 	const unsigned int nengines = num_uabi_engines(i915);
1320 	struct i915_request **request;
1321 	struct i915_request *prev = NULL;
1322 	struct intel_engine_cs *engine;
1323 	struct igt_live_test t;
1324 	unsigned int idx;
1325 	int err;
1326 
1327 	/*
1328 	 * Check we can submit requests to all engines sequentially, such
1329 	 * that each successive request waits for the earlier ones. This
1330 	 * tests that we don't execute requests out of order, even though
1331 	 * they are running on independent engines.
1332 	 */
1333 
1334 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1335 	if (!request)
1336 		return -ENOMEM;
1337 
1338 	err = igt_live_test_begin(&t, i915, __func__, "");
1339 	if (err)
1340 		goto out_free;
1341 
1342 	idx = 0;
1343 	for_each_uabi_engine(engine, i915) {
1344 		struct i915_vma *batch;
1345 
1346 		batch = recursive_batch(engine->gt);
1347 		if (IS_ERR(batch)) {
1348 			err = PTR_ERR(batch);
1349 			pr_err("%s: Unable to create batch for %s, err=%d\n",
1350 			       __func__, engine->name, err);
1351 			goto out_free;
1352 		}
1353 
1354 		i915_vma_lock(batch);
1355 		request[idx] = intel_engine_create_kernel_request(engine);
1356 		if (IS_ERR(request[idx])) {
1357 			err = PTR_ERR(request[idx]);
1358 			pr_err("%s: Request allocation failed for %s with err=%d\n",
1359 			       __func__, engine->name, err);
1360 			goto out_unlock;
1361 		}
1362 		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1363 
1364 		if (prev) {
1365 			err = i915_request_await_dma_fence(request[idx],
1366 							   &prev->fence);
1367 			if (err) {
1368 				i915_request_add(request[idx]);
1369 				pr_err("%s: Request await failed for %s with err=%d\n",
1370 				       __func__, engine->name, err);
1371 				goto out_unlock;
1372 			}
1373 		}
1374 
1375 		err = i915_vma_move_to_active(batch, request[idx], 0);
1376 		GEM_BUG_ON(err);
1377 
1378 		err = emit_bb_start(request[idx], batch);
1379 		GEM_BUG_ON(err);
1380 		request[idx]->batch = batch;
1381 
1382 		i915_request_get(request[idx]);
1383 		i915_request_add(request[idx]);
1384 
1385 		prev = request[idx];
1386 		idx++;
1387 
1388 out_unlock:
1389 		i915_vma_unlock(batch);
1390 		if (err)
1391 			goto out_request;
1392 	}
1393 
1394 	idx = 0;
1395 	for_each_uabi_engine(engine, i915) {
1396 		long timeout;
1397 
1398 		if (i915_request_completed(request[idx])) {
1399 			pr_err("%s(%s): request completed too early!\n",
1400 			       __func__, engine->name);
1401 			err = -EINVAL;
1402 			goto out_request;
1403 		}
1404 
1405 		err = recursive_batch_resolve(request[idx]->batch);
1406 		if (err) {
1407 			pr_err("%s: failed to resolve batch, err=%d\n",
1408 			       __func__, err);
1409 			goto out_request;
1410 		}
1411 
1412 		timeout = i915_request_wait(request[idx], 0,
1413 					    MAX_SCHEDULE_TIMEOUT);
1414 		if (timeout < 0) {
1415 			err = timeout;
1416 			pr_err("%s: error waiting for request on %s, err=%d\n",
1417 			       __func__, engine->name, err);
1418 			goto out_request;
1419 		}
1420 
1421 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1422 		idx++;
1423 	}
1424 
1425 	err = igt_live_test_end(&t);
1426 
1427 out_request:
1428 	idx = 0;
1429 	for_each_uabi_engine(engine, i915) {
1430 		u32 *cmd;
1431 
1432 		if (!request[idx])
1433 			break;
1434 
1435 		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1436 						       I915_MAP_WC);
1437 		if (!IS_ERR(cmd)) {
1438 			*cmd = MI_BATCH_BUFFER_END;
1439 
1440 			__i915_gem_object_flush_map(request[idx]->batch->obj,
1441 						    0, sizeof(*cmd));
1442 			i915_gem_object_unpin_map(request[idx]->batch->obj);
1443 
1444 			intel_gt_chipset_flush(engine->gt);
1445 		}
1446 
1447 		i915_vma_put(request[idx]->batch);
1448 		i915_request_put(request[idx]);
1449 		idx++;
1450 	}
1451 out_free:
1452 	kfree(request);
1453 	return err;
1454 }
1455 
1456 struct parallel_thread {
1457 	struct kthread_worker *worker;
1458 	struct kthread_work work;
1459 	struct intel_engine_cs *engine;
1460 	int result;
1461 };
1462 
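/*
 * Worker bodies for live_parallel_engines: __live_parallel_engine1() submits
 * one request at a time and waits for each to complete before issuing the
 * next, while __live_parallel_engineN() queues requests back-to-back without
 * waiting, counting how many it can submit before the timeout.
 */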
1463 static void __live_parallel_engine1(struct kthread_work *work)
1464 {
1465 	struct parallel_thread *thread =
1466 		container_of(work, typeof(*thread), work);
1467 	struct intel_engine_cs *engine = thread->engine;
1468 	IGT_TIMEOUT(end_time);
1469 	unsigned long count;
1470 	int err = 0;
1471 
1472 	count = 0;
1473 	intel_engine_pm_get(engine);
1474 	do {
1475 		struct i915_request *rq;
1476 
1477 		rq = i915_request_create(engine->kernel_context);
1478 		if (IS_ERR(rq)) {
1479 			err = PTR_ERR(rq);
1480 			break;
1481 		}
1482 
1483 		i915_request_get(rq);
1484 		i915_request_add(rq);
1485 
1486 		err = 0;
1487 		if (i915_request_wait(rq, 0, HZ) < 0)
1488 			err = -ETIME;
1489 		i915_request_put(rq);
1490 		if (err)
1491 			break;
1492 
1493 		count++;
1494 	} while (!__igt_timeout(end_time, NULL));
1495 	intel_engine_pm_put(engine);
1496 
1497 	pr_info("%s: %lu request + sync\n", engine->name, count);
1498 	thread->result = err;
1499 }
1500 
1501 static void __live_parallel_engineN(struct kthread_work *work)
1502 {
1503 	struct parallel_thread *thread =
1504 		container_of(work, typeof(*thread), work);
1505 	struct intel_engine_cs *engine = thread->engine;
1506 	IGT_TIMEOUT(end_time);
1507 	unsigned long count;
1508 	int err = 0;
1509 
1510 	count = 0;
1511 	intel_engine_pm_get(engine);
1512 	do {
1513 		struct i915_request *rq;
1514 
1515 		rq = i915_request_create(engine->kernel_context);
1516 		if (IS_ERR(rq)) {
1517 			err = PTR_ERR(rq);
1518 			break;
1519 		}
1520 
1521 		i915_request_add(rq);
1522 		count++;
1523 	} while (!__igt_timeout(end_time, NULL));
1524 	intel_engine_pm_put(engine);
1525 
1526 	pr_info("%s: %lu requests\n", engine->name, count);
1527 	thread->result = err;
1528 }
1529 
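/*
 * A simple barrier built on i915->selftest.counter: each call to wake_all()
 * decrements the counter and the final caller wakes any waiters, while
 * wait_for_all() blocks until the counter reaches zero or the selftest
 * timeout expires.
 */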
1530 static bool wake_all(struct drm_i915_private *i915)
1531 {
1532 	if (atomic_dec_and_test(&i915->selftest.counter)) {
1533 		wake_up_var(&i915->selftest.counter);
1534 		return true;
1535 	}
1536 
1537 	return false;
1538 }
1539 
1540 static int wait_for_all(struct drm_i915_private *i915)
1541 {
1542 	if (wake_all(i915))
1543 		return 0;
1544 
1545 	if (wait_var_event_timeout(&i915->selftest.counter,
1546 				   !atomic_read(&i915->selftest.counter),
1547 				   i915_selftest.timeout_jiffies))
1548 		return 0;
1549 
1550 	return -ETIME;
1551 }
1552 
1553 static void __live_parallel_spin(struct kthread_work *work)
1554 {
1555 	struct parallel_thread *thread =
1556 		container_of(work, typeof(*thread), work);
1557 	struct intel_engine_cs *engine = thread->engine;
1558 	struct igt_spinner spin;
1559 	struct i915_request *rq;
1560 	int err = 0;
1561 
1562 	/*
1563 	 * Create a spinner running for eternity on each engine. If a second
1564 	 * spinner is incorrectly placed on the same engine, it will not be
1565 	 * able to start in time.
1566 	 */
1567 
1568 	if (igt_spinner_init(&spin, engine->gt)) {
1569 		wake_all(engine->i915);
1570 		thread->result = -ENOMEM;
1571 		return;
1572 	}
1573 
1574 	intel_engine_pm_get(engine);
1575 	rq = igt_spinner_create_request(&spin,
1576 					engine->kernel_context,
1577 					MI_NOOP); /* no preemption */
1578 	intel_engine_pm_put(engine);
1579 	if (IS_ERR(rq)) {
1580 		err = PTR_ERR(rq);
1581 		if (err == -ENODEV)
1582 			err = 0;
1583 		wake_all(engine->i915);
1584 		goto out_spin;
1585 	}
1586 
1587 	i915_request_get(rq);
1588 	i915_request_add(rq);
1589 	if (igt_wait_for_spinner(&spin, rq)) {
1590 		/* Occupy this engine for the whole test */
1591 		err = wait_for_all(engine->i915);
1592 	} else {
1593 		pr_err("Failed to start spinner on %s\n", engine->name);
1594 		err = -EINVAL;
1595 	}
1596 	igt_spinner_end(&spin);
1597 
1598 	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1599 		err = -EIO;
1600 	i915_request_put(rq);
1601 
1602 out_spin:
1603 	igt_spinner_fini(&spin);
1604 	thread->result = err;
1605 }
1606 
1607 static int live_parallel_engines(void *arg)
1608 {
1609 	struct drm_i915_private *i915 = arg;
1610 	static void (* const func[])(struct kthread_work *) = {
1611 		__live_parallel_engine1,
1612 		__live_parallel_engineN,
1613 		__live_parallel_spin,
1614 		NULL,
1615 	};
1616 	const unsigned int nengines = num_uabi_engines(i915);
1617 	struct parallel_thread *threads;
1618 	struct intel_engine_cs *engine;
1619 	void (* const *fn)(struct kthread_work *);
1620 	int err = 0;
1621 
1622 	/*
1623 	 * Check we can submit requests to all engines concurrently. This
1624 	 * tests that we load up the system maximally.
1625 	 */
1626 
1627 	threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL);
1628 	if (!threads)
1629 		return -ENOMEM;
1630 
1631 	for (fn = func; !err && *fn; fn++) {
1632 		char name[KSYM_NAME_LEN];
1633 		struct igt_live_test t;
1634 		unsigned int idx;
1635 
1636 		snprintf(name, sizeof(name), "%ps", *fn);
1637 		err = igt_live_test_begin(&t, i915, __func__, name);
1638 		if (err)
1639 			break;
1640 
1641 		atomic_set(&i915->selftest.counter, nengines);
1642 
1643 		idx = 0;
1644 		for_each_uabi_engine(engine, i915) {
1645 			struct kthread_worker *worker;
1646 
1647 			worker = kthread_run_worker(0, "igt/parallel:%s",
1648 						       engine->name);
1649 			if (IS_ERR(worker)) {
1650 				err = PTR_ERR(worker);
1651 				break;
1652 			}
1653 
1654 			threads[idx].worker = worker;
1655 			threads[idx].result = 0;
1656 			threads[idx].engine = engine;
1657 
1658 			kthread_init_work(&threads[idx].work, *fn);
1659 			kthread_queue_work(worker, &threads[idx].work);
1660 			idx++;
1661 		}
1662 
1663 		idx = 0;
1664 		for_each_uabi_engine(engine, i915) {
1665 			int status;
1666 
1667 			if (!threads[idx].worker)
1668 				break;
1669 
1670 			kthread_flush_work(&threads[idx].work);
1671 			status = READ_ONCE(threads[idx].result);
1672 			if (status && !err)
1673 				err = status;
1674 
1675 			kthread_destroy_worker(threads[idx++].worker);
1676 		}
1677 
1678 		if (igt_live_test_end(&t))
1679 			err = -EIO;
1680 	}
1681 
1682 	kfree(threads);
1683 	return err;
1684 }
1685 
1686 static int
1687 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1688 {
1689 	struct i915_request *rq;
1690 	int ret;
1691 
1692 	/*
1693 	 * Before execlists, all contexts share the same ringbuffer. With
1694 	 * execlists, each context/engine has a separate ringbuffer and
1695 	 * for the purposes of this test, inexhaustible.
1696 	 *
1697 	 * For the global ringbuffer though, we have to be very careful
1698 	 * that we do not wrap while preventing the execution of requests
1699 	 * with an unsignaled fence.
1700 	 */
1701 	if (HAS_EXECLISTS(ctx->i915))
1702 		return INT_MAX;
1703 
1704 	rq = igt_request_alloc(ctx, engine);
1705 	if (IS_ERR(rq)) {
1706 		ret = PTR_ERR(rq);
1707 	} else {
1708 		int sz;
1709 
1710 		ret = rq->ring->size - rq->reserved_space;
1711 		i915_request_add(rq);
1712 
1713 		sz = rq->ring->emit - rq->head;
1714 		if (sz < 0)
1715 			sz += rq->ring->size;
1716 		ret /= sz;
1717 		ret /= 2; /* leave half spare, in case of emergency! */
1718 	}
1719 
1720 	return ret;
1721 }
1722 
1723 static int live_breadcrumbs_smoketest(void *arg)
1724 {
1725 	struct drm_i915_private *i915 = arg;
1726 	const unsigned int nengines = num_uabi_engines(i915);
1727 	const unsigned int ncpus = /* saturate with nengines * ncpus */
1728 		max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
1729 	unsigned long num_waits, num_fences;
1730 	struct intel_engine_cs *engine;
1731 	struct smoke_thread *threads;
1732 	struct igt_live_test live;
1733 	intel_wakeref_t wakeref;
1734 	struct smoketest *smoke;
1735 	unsigned int n, idx;
1736 	struct file *file;
1737 	int ret = 0;
1738 
1739 	/*
1740 	 * Smoketest our breadcrumb/signal handling for requests across multiple
1741 	 * threads. A very simple test to only catch the most egregious of bugs.
1742 	 * See __igt_breadcrumbs_smoketest();
1743 	 *
1744 	 * On real hardware this time.
1745 	 */
1746 
1747 	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1748 
1749 	file = mock_file(i915);
1750 	if (IS_ERR(file)) {
1751 		ret = PTR_ERR(file);
1752 		goto out_rpm;
1753 	}
1754 
1755 	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1756 	if (!smoke) {
1757 		ret = -ENOMEM;
1758 		goto out_file;
1759 	}
1760 
1761 	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1762 	if (!threads) {
1763 		ret = -ENOMEM;
1764 		goto out_smoke;
1765 	}
1766 
1767 	smoke[0].request_alloc = __live_request_alloc;
1768 	smoke[0].ncontexts = 64;
1769 	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1770 				    sizeof(*smoke[0].contexts),
1771 				    GFP_KERNEL);
1772 	if (!smoke[0].contexts) {
1773 		ret = -ENOMEM;
1774 		goto out_threads;
1775 	}
1776 
1777 	for (n = 0; n < smoke[0].ncontexts; n++) {
1778 		smoke[0].contexts[n] = live_context(i915, file);
1779 		if (IS_ERR(smoke[0].contexts[n])) {
1780 			ret = PTR_ERR(smoke[0].contexts[n]);
1781 			goto out_contexts;
1782 		}
1783 	}
1784 
1785 	ret = igt_live_test_begin(&live, i915, __func__, "");
1786 	if (ret)
1787 		goto out_contexts;
1788 
1789 	idx = 0;
1790 	for_each_uabi_engine(engine, i915) {
1791 		smoke[idx] = smoke[0];
1792 		smoke[idx].engine = engine;
1793 		smoke[idx].max_batch =
1794 			max_batches(smoke[0].contexts[0], engine);
1795 		if (smoke[idx].max_batch < 0) {
1796 			ret = smoke[idx].max_batch;
1797 			goto out_flush;
1798 		}
1799 		/* One ring interleaved between requests from all cpus */
1800 		smoke[idx].max_batch /= ncpus + 1;
1801 		pr_debug("Limiting batches to %d requests on %s\n",
1802 			 smoke[idx].max_batch, engine->name);
1803 
1804 		for (n = 0; n < ncpus; n++) {
1805 			unsigned int i = idx * ncpus + n;
1806 			struct kthread_worker *worker;
1807 
1808 			worker = kthread_run_worker(0, "igt/%d.%d", idx, n);
1809 			if (IS_ERR(worker)) {
1810 				ret = PTR_ERR(worker);
1811 				goto out_flush;
1812 			}
1813 
1814 			threads[i].worker = worker;
1815 			threads[i].t = &smoke[idx];
1816 
1817 			kthread_init_work(&threads[i].work,
1818 					  __igt_breadcrumbs_smoketest);
1819 			kthread_queue_work(worker, &threads[i].work);
1820 		}
1821 
1822 		idx++;
1823 	}
1824 
1825 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1826 
1827 out_flush:
1828 	idx = 0;
1829 	num_waits = 0;
1830 	num_fences = 0;
1831 	for_each_uabi_engine(engine, i915) {
1832 		for (n = 0; n < ncpus; n++) {
1833 			unsigned int i = idx * ncpus + n;
1834 			int err;
1835 
1836 			if (!threads[i].worker)
1837 				continue;
1838 
1839 			WRITE_ONCE(threads[i].stop, true);
1840 			kthread_flush_work(&threads[i].work);
1841 			err = READ_ONCE(threads[i].result);
1842 			if (err < 0 && !ret)
1843 				ret = err;
1844 
1845 			kthread_destroy_worker(threads[i].worker);
1846 		}
1847 
1848 		num_waits += atomic_long_read(&smoke[idx].num_waits);
1849 		num_fences += atomic_long_read(&smoke[idx].num_fences);
1850 		idx++;
1851 	}
1852 	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1853 		num_waits, num_fences, idx, ncpus);
1854 
1855 	ret = igt_live_test_end(&live) ?: ret;
1856 out_contexts:
1857 	kfree(smoke[0].contexts);
1858 out_threads:
1859 	kfree(threads);
1860 out_smoke:
1861 	kfree(smoke);
1862 out_file:
1863 	fput(file);
1864 out_rpm:
1865 	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1866 
1867 	return ret;
1868 }
1869 
1870 int i915_request_live_selftests(struct drm_i915_private *i915)
1871 {
1872 	static const struct i915_subtest tests[] = {
1873 		SUBTEST(live_nop_request),
1874 		SUBTEST(live_all_engines),
1875 		SUBTEST(live_sequential_engines),
1876 		SUBTEST(live_parallel_engines),
1877 		SUBTEST(live_empty_request),
1878 		SUBTEST(live_cancel_request),
1879 		SUBTEST(live_breadcrumbs_smoketest),
1880 	};
1881 
1882 	if (intel_gt_is_wedged(to_gt(i915)))
1883 		return 0;
1884 
1885 	return i915_live_subtests(tests, i915);
1886 }
1887 
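/*
 * Submit a request on the engine's kernel context that waits upon the last
 * request of @ce, then synchronously wait for it and for the engine to idle,
 * preserving any error already carried in @err.
 */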
1888 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1889 {
1890 	struct i915_request *rq;
1891 	struct dma_fence *fence;
1892 
1893 	rq = intel_engine_create_kernel_request(ce->engine);
1894 	if (IS_ERR(rq))
1895 		return PTR_ERR(rq);
1896 
1897 	fence = i915_active_fence_get(&ce->timeline->last_request);
1898 	if (fence) {
1899 		i915_request_await_dma_fence(rq, fence);
1900 		dma_fence_put(fence);
1901 	}
1902 
1903 	rq = i915_request_get(rq);
1904 	i915_request_add(rq);
1905 	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1906 		err = -ETIME;
1907 	i915_request_put(rq);
1908 
1909 	while (!err && !intel_engine_is_idle(ce->engine))
1910 		intel_engine_flush_submission(ce->engine);
1911 
1912 	return err;
1913 }
1914 
1915 struct perf_stats {
1916 	struct intel_engine_cs *engine;
1917 	unsigned long count;
1918 	ktime_t time;
1919 	ktime_t busy;
1920 	u64 runtime;
1921 };
1922 
1923 struct perf_series {
1924 	struct drm_i915_private *i915;
1925 	unsigned int nengines;
1926 	struct intel_context *ce[] __counted_by(nengines);
1927 };
1928 
1929 static int cmp_u32(const void *A, const void *B)
1930 {
1931 	const u32 *a = A, *b = B;
1932 
1933 	return *a - *b;
1934 }
1935 
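/*
 * Crude noise filter: sort TF_COUNT samples and return a 1:2:1 weighted sum
 * of the middle three, i.e. a smoothed median. The weights sum to 4, so the
 * result is scaled by 1 << TF_BIAS; callers undo this with '>> TF_BIAS' (or
 * via cycles_to_ns()) before reporting.
 */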
1936 static u32 trifilter(u32 *a)
1937 {
1938 	u64 sum;
1939 
1940 #define TF_COUNT 5
1941 	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1942 
1943 	sum = mul_u32_u32(a[2], 2);
1944 	sum += a[1];
1945 	sum += a[3];
1946 
1947 	GEM_BUG_ON(sum > U32_MAX);
1948 	return sum;
1949 #define TF_BIAS 2
1950 }
1951 
1952 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1953 {
1954 	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1955 
1956 	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1957 }
1958 
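/*
 * Ring emission helpers for the measurements below: store the engine's
 * RING_TIMESTAMP or an immediate dword into a GGTT offset, or spin with
 * MI_SEMAPHORE_WAIT polling a GGTT semaphore until it holds the expected
 * value.
 */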
1959 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1960 {
1961 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1962 	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1963 	*cs++ = offset;
1964 	*cs++ = 0;
1965 
1966 	return cs;
1967 }
1968 
1969 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1970 {
1971 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1972 	*cs++ = offset;
1973 	*cs++ = 0;
1974 	*cs++ = value;
1975 
1976 	return cs;
1977 }
1978 
1979 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1980 {
1981 	*cs++ = MI_SEMAPHORE_WAIT |
1982 		MI_SEMAPHORE_GLOBAL_GTT |
1983 		MI_SEMAPHORE_POLL |
1984 		mode;
1985 	*cs++ = value;
1986 	*cs++ = offset;
1987 	*cs++ = 0;
1988 
1989 	return cs;
1990 }
1991 
1992 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1993 {
1994 	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1995 }
1996 
1997 static void semaphore_set(u32 *sema, u32 value)
1998 {
1999 	WRITE_ONCE(*sema, value);
2000 	wmb(); /* flush the update to the cache, and beyond */
2001 }
2002 
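/*
 * hwsp_scratch() returns a zeroed scratch area within the engine's hardware
 * status page (HWSP) for use as semaphore and timestamp slots, while
 * hwsp_offset() translates a pointer into that page into the GGTT offset
 * used by the emitted GPU commands.
 */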
2003 static u32 *hwsp_scratch(const struct intel_context *ce)
2004 {
2005 	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
2006 }
2007 
2008 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
2009 {
2010 	return (i915_ggtt_offset(ce->engine->status_page.vma) +
2011 		offset_in_page(dw));
2012 }
2013 
2014 static int measure_semaphore_response(struct intel_context *ce)
2015 {
2016 	u32 *sema = hwsp_scratch(ce);
2017 	const u32 offset = hwsp_offset(ce, sema);
2018 	u32 elapsed[TF_COUNT], cycles;
2019 	struct i915_request *rq;
2020 	u32 *cs;
2021 	int err;
2022 	int i;
2023 
2024 	/*
2025 	 * Measure how many cycles it takes for the HW to detect the change
2026 	 * in a semaphore value.
2027 	 *
2028 	 *    A: read CS_TIMESTAMP from CPU
2029 	 *    poke semaphore
2030 	 *    B: read CS_TIMESTAMP on GPU
2031 	 *
2032 	 * Semaphore latency: B - A
2033 	 */
2034 
2035 	semaphore_set(sema, -1);
2036 
2037 	rq = i915_request_create(ce);
2038 	if (IS_ERR(rq))
2039 		return PTR_ERR(rq);
2040 
2041 	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
2042 	if (IS_ERR(cs)) {
2043 		i915_request_add(rq);
2044 		err = PTR_ERR(cs);
2045 		goto err;
2046 	}
2047 
2048 	cs = emit_store_dw(cs, offset, 0);
2049 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2050 		cs = emit_semaphore_poll_until(cs, offset, i);
2051 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2052 		cs = emit_store_dw(cs, offset, 0);
2053 	}
2054 
2055 	intel_ring_advance(rq, cs);
2056 	i915_request_add(rq);
2057 
2058 	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2059 		err = -EIO;
2060 		goto err;
2061 	}
2062 
2063 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2064 		preempt_disable();
2065 		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2066 		semaphore_set(sema, i);
2067 		preempt_enable();
2068 
2069 		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2070 			err = -EIO;
2071 			goto err;
2072 		}
2073 
2074 		elapsed[i - 1] = sema[i] - cycles;
2075 	}
2076 
2077 	cycles = trifilter(elapsed);
2078 	pr_info("%s: semaphore response %d cycles, %lluns\n",
2079 		ce->engine->name, cycles >> TF_BIAS,
2080 		cycles_to_ns(ce->engine, cycles));
2081 
2082 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2083 
2084 err:
2085 	intel_gt_set_wedged(ce->engine->gt);
2086 	return err;
2087 }
2088 
2089 static int measure_idle_dispatch(struct intel_context *ce)
2090 {
2091 	u32 *sema = hwsp_scratch(ce);
2092 	const u32 offset = hwsp_offset(ce, sema);
2093 	u32 elapsed[TF_COUNT], cycles;
2094 	u32 *cs;
2095 	int err;
2096 	int i;
2097 
2098 	/*
2099 	 * Measure how long it takes for us to submit a request while the
2100 	 * engine is idle but still resting in our context.
2101 	 *
2102 	 *    A: read CS_TIMESTAMP from CPU
2103 	 *    submit request
2104 	 *    B: read CS_TIMESTAMP on GPU
2105 	 *
2106 	 * Submission latency: B - A
2107 	 */
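	/*
	 * Each pass below waits for the GT to idle, then samples
	 * RING_TIMESTAMP from the CPU immediately before i915_request_add(),
	 * with preemption and softirqs disabled so nothing interrupts the
	 * sample + submit sequence; the request stores the GPU's timestamp
	 * into sema[i], and the reduce loop turns each slot into
	 * sema[i] - elapsed[i].
	 */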
2108 
2109 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2110 		struct i915_request *rq;
2111 
2112 		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2113 		if (err)
2114 			return err;
2115 
2116 		rq = i915_request_create(ce);
2117 		if (IS_ERR(rq)) {
2118 			err = PTR_ERR(rq);
2119 			goto err;
2120 		}
2121 
2122 		cs = intel_ring_begin(rq, 4);
2123 		if (IS_ERR(cs)) {
2124 			i915_request_add(rq);
2125 			err = PTR_ERR(cs);
2126 			goto err;
2127 		}
2128 
2129 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2130 
2131 		intel_ring_advance(rq, cs);
2132 
2133 		preempt_disable();
2134 		local_bh_disable();
2135 		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2136 		i915_request_add(rq);
2137 		local_bh_enable();
2138 		preempt_enable();
2139 	}
2140 
2141 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2142 	if (err)
2143 		goto err;
2144 
2145 	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2146 		elapsed[i] = sema[i] - elapsed[i];
2147 
2148 	cycles = trifilter(elapsed);
2149 	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2150 		ce->engine->name, cycles >> TF_BIAS,
2151 		cycles_to_ns(ce->engine, cycles));
2152 
2153 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2154 
2155 err:
2156 	intel_gt_set_wedged(ce->engine->gt);
2157 	return err;
2158 }
2159 
2160 static int measure_busy_dispatch(struct intel_context *ce)
2161 {
2162 	u32 *sema = hwsp_scratch(ce);
2163 	const u32 offset = hwsp_offset(ce, sema);
2164 	u32 elapsed[TF_COUNT + 1], cycles;
2165 	u32 *cs;
2166 	int err;
2167 	int i;
2168 
2169 	/*
2170 	 * Measure how long it takes for us to submit a request while the
2171 	 * engine is busy, polling on a semaphore in our context. With
2172 	 * direct submission, this will include the cost of a lite restore.
2173 	 *
2174 	 *    A: read CS_TIMESTAMP from CPU
2175 	 *    submit request
2176 	 *    B: read CS_TIMESTAMP on GPU
2177 	 *
2178 	 * Submission latency: B - A
2179 	 */
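	/*
	 * elapsed[] carries one extra slot (TF_COUNT + 1) because sample i is
	 * only complete once request i + 1 has been submitted against the
	 * still-busy engine: the reduce loop pairs sema[i], the timestamp
	 * request i writes once its semaphore is released, with elapsed[i],
	 * the CPU timestamp taken just before that release in the following
	 * iteration.
	 */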
2180 
2181 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2182 		struct i915_request *rq;
2183 
2184 		rq = i915_request_create(ce);
2185 		if (IS_ERR(rq)) {
2186 			err = PTR_ERR(rq);
2187 			goto err;
2188 		}
2189 
2190 		cs = intel_ring_begin(rq, 12);
2191 		if (IS_ERR(cs)) {
2192 			i915_request_add(rq);
2193 			err = PTR_ERR(cs);
2194 			goto err;
2195 		}
2196 
2197 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2198 		cs = emit_semaphore_poll_until(cs, offset, i);
2199 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2200 
2201 		intel_ring_advance(rq, cs);
2202 
2203 		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2204 			err = -EIO;
2205 			goto err;
2206 		}
2207 
2208 		preempt_disable();
2209 		local_bh_disable();
2210 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2211 		i915_request_add(rq);
2212 		local_bh_enable();
2213 		semaphore_set(sema, i - 1);
2214 		preempt_enable();
2215 	}
2216 
2217 	wait_for(READ_ONCE(sema[i - 1]), 500);
2218 	semaphore_set(sema, i - 1);
2219 
2220 	for (i = 1; i <= TF_COUNT; i++) {
2221 		GEM_BUG_ON(sema[i] == -1);
2222 		elapsed[i - 1] = sema[i] - elapsed[i];
2223 	}
2224 
2225 	cycles = trifilter(elapsed);
2226 	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2227 		ce->engine->name, cycles >> TF_BIAS,
2228 		cycles_to_ns(ce->engine, cycles));
2229 
2230 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2231 
2232 err:
2233 	intel_gt_set_wedged(ce->engine->gt);
2234 	return err;
2235 }
2236 
2237 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2238 {
2239 	const u32 offset =
2240 		i915_ggtt_offset(engine->status_page.vma) +
2241 		offset_in_page(sema);
2242 	struct i915_request *rq;
2243 	u32 *cs;
2244 
2245 	rq = i915_request_create(engine->kernel_context);
2246 	if (IS_ERR(rq))
2247 		return PTR_ERR(rq);
2248 
2249 	cs = intel_ring_begin(rq, 4);
2250 	if (IS_ERR(cs)) {
2251 		i915_request_add(rq);
2252 		return PTR_ERR(cs);
2253 	}
2254 
2255 	cs = emit_semaphore_poll(cs, mode, value, offset);
2256 
2257 	intel_ring_advance(rq, cs);
2258 	i915_request_add(rq);
2259 
2260 	return 0;
2261 }
2262 
2263 static int measure_inter_request(struct intel_context *ce)
2264 {
2265 	u32 *sema = hwsp_scratch(ce);
2266 	const u32 offset = hwsp_offset(ce, sema);
2267 	u32 elapsed[TF_COUNT + 1], cycles;
2268 	struct i915_sw_fence *submit;
2269 	int i, err;
2270 
2271 	/*
2272 	 * Measure how long it takes to advance from one request to the
2273 	 * next. Between each request we flush the GPU caches to memory,
2274 	 * update the breadcrumbs, and then invalidate those caches.
2275 	 * We queue up all the requests to be submitted in one batch so
2276 	 * that it forms one contiguous set of measurements.
2277 	 *
2278 	 *    A: read CS_TIMESTAMP on GPU
2279 	 *    advance request
2280 	 *    B: read CS_TIMESTAMP on GPU
2281 	 *
2282 	 * Request latency: B - A
2283 	 */
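	/*
	 * plug() parks the engine on the kernel context (polling for sema[0]
	 * to become non-zero) so that all of the requests below can be built
	 * and queued first; the submit fence then releases them as a single
	 * batch and semaphore_set(sema, 1) unblocks the plug. The
	 * back-to-back GPU timestamps land in sema[1..], and the reduce loop
	 * takes the difference between consecutive slots.
	 */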
2284 
2285 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2286 	if (err)
2287 		return err;
2288 
2289 	submit = heap_fence_create(GFP_KERNEL);
2290 	if (!submit) {
2291 		semaphore_set(sema, 1);
2292 		return -ENOMEM;
2293 	}
2294 
2295 	intel_engine_flush_submission(ce->engine);
2296 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2297 		struct i915_request *rq;
2298 		u32 *cs;
2299 
2300 		rq = i915_request_create(ce);
2301 		if (IS_ERR(rq)) {
2302 			err = PTR_ERR(rq);
2303 			goto err_submit;
2304 		}
2305 
2306 		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2307 						       submit,
2308 						       GFP_KERNEL);
2309 		if (err < 0) {
2310 			i915_request_add(rq);
2311 			goto err_submit;
2312 		}
2313 
2314 		cs = intel_ring_begin(rq, 4);
2315 		if (IS_ERR(cs)) {
2316 			i915_request_add(rq);
2317 			err = PTR_ERR(cs);
2318 			goto err_submit;
2319 		}
2320 
2321 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2322 
2323 		intel_ring_advance(rq, cs);
2324 		i915_request_add(rq);
2325 	}
2326 	i915_sw_fence_commit(submit);
2327 	intel_engine_flush_submission(ce->engine);
2328 	heap_fence_put(submit);
2329 
2330 	semaphore_set(sema, 1);
2331 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2332 	if (err)
2333 		goto err;
2334 
2335 	for (i = 1; i <= TF_COUNT; i++)
2336 		elapsed[i - 1] = sema[i + 1] - sema[i];
2337 
2338 	cycles = trifilter(elapsed);
2339 	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2340 		ce->engine->name, cycles >> TF_BIAS,
2341 		cycles_to_ns(ce->engine, cycles));
2342 
2343 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2344 
2345 err_submit:
2346 	i915_sw_fence_commit(submit);
2347 	heap_fence_put(submit);
2348 	semaphore_set(sema, 1);
2349 err:
2350 	intel_gt_set_wedged(ce->engine->gt);
2351 	return err;
2352 }
2353 
2354 static int measure_context_switch(struct intel_context *ce)
2355 {
2356 	u32 *sema = hwsp_scratch(ce);
2357 	const u32 offset = hwsp_offset(ce, sema);
2358 	struct i915_request *fence = NULL;
2359 	u32 elapsed[TF_COUNT + 1], cycles;
2360 	int i, j, err;
2361 	u32 *cs;
2362 
2363 	/*
2364 	 * Measure how long it takes to advance from one request in one
2365 	 * context to a request in another context. This allows us to
2366 	 * measure how long the context save/restore takes, along with all
2367 	 * the inter-context setup we require.
2368 	 *
2369 	 *    A: read CS_TIMESTAMP on GPU
2370 	 *    switch context
2371 	 *    B: read CS_TIMESTAMP on GPU
2372 	 *
2373 	 * Context switch latency: B - A
2374 	 */
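	/*
	 * The requests below alternate between ce and the engine's kernel
	 * context, chained with dma-fence awaits so they execute strictly in
	 * order; pair i writes its two timestamps into sema[2i] and
	 * sema[2i + 1], and the reduce loop measures sema[2i + 2] -
	 * sema[2i + 1], i.e. from the kernel-context request of one pair to
	 * the ce request of the next, one full context switch.
	 */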
2375 
2376 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2377 	if (err)
2378 		return err;
2379 
2380 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2381 		struct intel_context *arr[] = {
2382 			ce, ce->engine->kernel_context
2383 		};
2384 		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2385 
2386 		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2387 			struct i915_request *rq;
2388 
2389 			rq = i915_request_create(arr[j]);
2390 			if (IS_ERR(rq)) {
2391 				err = PTR_ERR(rq);
2392 				goto err_fence;
2393 			}
2394 
2395 			if (fence) {
2396 				err = i915_request_await_dma_fence(rq,
2397 								   &fence->fence);
2398 				if (err) {
2399 					i915_request_add(rq);
2400 					goto err_fence;
2401 				}
2402 			}
2403 
2404 			cs = intel_ring_begin(rq, 4);
2405 			if (IS_ERR(cs)) {
2406 				i915_request_add(rq);
2407 				err = PTR_ERR(cs);
2408 				goto err_fence;
2409 			}
2410 
2411 			cs = emit_timestamp_store(cs, ce, addr);
2412 			addr += sizeof(u32);
2413 
2414 			intel_ring_advance(rq, cs);
2415 
2416 			i915_request_put(fence);
2417 			fence = i915_request_get(rq);
2418 
2419 			i915_request_add(rq);
2420 		}
2421 	}
2422 	i915_request_put(fence);
2423 	intel_engine_flush_submission(ce->engine);
2424 
2425 	semaphore_set(sema, 1);
2426 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2427 	if (err)
2428 		goto err;
2429 
2430 	for (i = 1; i <= TF_COUNT; i++)
2431 		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2432 
2433 	cycles = trifilter(elapsed);
2434 	pr_info("%s: context switch latency %d cycles, %lluns\n",
2435 		ce->engine->name, cycles >> TF_BIAS,
2436 		cycles_to_ns(ce->engine, cycles));
2437 
2438 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2439 
2440 err_fence:
2441 	i915_request_put(fence);
2442 	semaphore_set(sema, 1);
2443 err:
2444 	intel_gt_set_wedged(ce->engine->gt);
2445 	return err;
2446 }
2447 
2448 static int measure_preemption(struct intel_context *ce)
2449 {
2450 	u32 *sema = hwsp_scratch(ce);
2451 	const u32 offset = hwsp_offset(ce, sema);
2452 	u32 elapsed[TF_COUNT], cycles;
2453 	u32 *cs;
2454 	int err;
2455 	int i;
2456 
2457 	/*
2458 	 * We measure two latencies while triggering preemption. The first
2459 	 * latency is how long it takes for us to submit a preempting request.
2460 	 * The second latency is how long it takes for us to return from the
2461 	 * preemption back to the original context.
2462 	 *
2463 	 *    A: read CS_TIMESTAMP from CPU
2464 	 *    submit preemption
2465 	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2466 	 *    context switch
2467 	 *    C: read CS_TIMESTAMP on GPU (in original context)
2468 	 *
2469 	 * Preemption dispatch latency: B - A
2470 	 * Preemption switch latency: C - B
2471 	 */
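	/*
	 * Each pass queues a ce request that parks on the semaphore, then
	 * submits a kernel-context request at I915_PRIORITY_BARRIER to
	 * preempt it. The preempting request stores its timestamp at
	 * sema[2i] and releases the semaphore; the original request then
	 * resumes and stores at sema[2i + 1]. Dispatch latency pairs
	 * sema[2i] with the CPU's RING_TIMESTAMP read just before
	 * i915_request_add(); switch latency is sema[2i + 1] - sema[2i].
	 */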
2472 
2473 	if (!intel_engine_has_preemption(ce->engine))
2474 		return 0;
2475 
2476 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2477 		u32 addr = offset + 2 * i * sizeof(u32);
2478 		struct i915_request *rq;
2479 
2480 		rq = i915_request_create(ce);
2481 		if (IS_ERR(rq)) {
2482 			err = PTR_ERR(rq);
2483 			goto err;
2484 		}
2485 
2486 		cs = intel_ring_begin(rq, 12);
2487 		if (IS_ERR(cs)) {
2488 			i915_request_add(rq);
2489 			err = PTR_ERR(cs);
2490 			goto err;
2491 		}
2492 
2493 		cs = emit_store_dw(cs, addr, -1);
2494 		cs = emit_semaphore_poll_until(cs, offset, i);
2495 		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2496 
2497 		intel_ring_advance(rq, cs);
2498 		i915_request_add(rq);
2499 
2500 		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2501 			err = -EIO;
2502 			goto err;
2503 		}
2504 
2505 		rq = i915_request_create(ce->engine->kernel_context);
2506 		if (IS_ERR(rq)) {
2507 			err = PTR_ERR(rq);
2508 			goto err;
2509 		}
2510 
2511 		cs = intel_ring_begin(rq, 8);
2512 		if (IS_ERR(cs)) {
2513 			i915_request_add(rq);
2514 			err = PTR_ERR(cs);
2515 			goto err;
2516 		}
2517 
2518 		cs = emit_timestamp_store(cs, ce, addr);
2519 		cs = emit_store_dw(cs, offset, i);
2520 
2521 		intel_ring_advance(rq, cs);
2522 		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2523 
2524 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2525 		i915_request_add(rq);
2526 	}
2527 
2528 	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2529 		err = -EIO;
2530 		goto err;
2531 	}
2532 
2533 	for (i = 1; i <= TF_COUNT; i++)
2534 		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2535 
2536 	cycles = trifilter(elapsed);
2537 	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2538 		ce->engine->name, cycles >> TF_BIAS,
2539 		cycles_to_ns(ce->engine, cycles));
2540 
2541 	for (i = 1; i <= TF_COUNT; i++)
2542 		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2543 
2544 	cycles = trifilter(elapsed);
2545 	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2546 		ce->engine->name, cycles >> TF_BIAS,
2547 		cycles_to_ns(ce->engine, cycles));
2548 
2549 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2550 
2551 err:
2552 	intel_gt_set_wedged(ce->engine->gt);
2553 	return err;
2554 }
2555 
2556 struct signal_cb {
2557 	struct dma_fence_cb base;
2558 	bool seen;
2559 };
2560 
2561 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2562 {
2563 	struct signal_cb *s = container_of(cb, typeof(*s), base);
2564 
2565 	smp_store_mb(s->seen, true); /* be safe, be strong */
2566 }
2567 
2568 static int measure_completion(struct intel_context *ce)
2569 {
2570 	u32 *sema = hwsp_scratch(ce);
2571 	const u32 offset = hwsp_offset(ce, sema);
2572 	u32 elapsed[TF_COUNT], cycles;
2573 	u32 *cs;
2574 	int err;
2575 	int i;
2576 
2577 	/*
2578 	 * Measure how long it takes for the signal (interrupt) to be
2579 	 * sent from the GPU and then processed by the CPU.
2580 	 *
2581 	 *    A: read CS_TIMESTAMP on GPU
2582 	 *    signal
2583 	 *    B: read CS_TIMESTAMP from CPU
2584 	 *
2585 	 * Completion latency: B - A
2586 	 */
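	/*
	 * Each request below parks on the semaphore, with a dma-fence
	 * callback installed before submission. The CPU releases the
	 * semaphore and spins until the callback has run, then samples
	 * RING_TIMESTAMP; the GPU wrote its own timestamp into sema[i] as it
	 * passed the semaphore, so the reduce loop reports the CPU sample
	 * minus the GPU sample.
	 */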
2587 
2588 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2589 		struct signal_cb cb = { .seen = false };
2590 		struct i915_request *rq;
2591 
2592 		rq = i915_request_create(ce);
2593 		if (IS_ERR(rq)) {
2594 			err = PTR_ERR(rq);
2595 			goto err;
2596 		}
2597 
2598 		cs = intel_ring_begin(rq, 12);
2599 		if (IS_ERR(cs)) {
2600 			i915_request_add(rq);
2601 			err = PTR_ERR(cs);
2602 			goto err;
2603 		}
2604 
2605 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2606 		cs = emit_semaphore_poll_until(cs, offset, i);
2607 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2608 
2609 		intel_ring_advance(rq, cs);
2610 
2611 		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2612 		i915_request_add(rq);
2613 
2614 		intel_engine_flush_submission(ce->engine);
2615 		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2616 			err = -EIO;
2617 			goto err;
2618 		}
2619 
2620 		preempt_disable();
2621 		semaphore_set(sema, i);
2622 		while (!READ_ONCE(cb.seen))
2623 			cpu_relax();
2624 
2625 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2626 		preempt_enable();
2627 	}
2628 
2629 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2630 	if (err)
2631 		goto err;
2632 
2633 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2634 		GEM_BUG_ON(sema[i + 1] == -1);
2635 		elapsed[i] = elapsed[i] - sema[i + 1];
2636 	}
2637 
2638 	cycles = trifilter(elapsed);
2639 	pr_info("%s: completion latency %d cycles, %lluns\n",
2640 		ce->engine->name, cycles >> TF_BIAS,
2641 		cycles_to_ns(ce->engine, cycles));
2642 
2643 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2644 
2645 err:
2646 	intel_gt_set_wedged(ce->engine->gt);
2647 	return err;
2648 }
2649 
2650 static void rps_pin(struct intel_gt *gt)
2651 {
2652 	/* Pin the frequency to max */
2653 	atomic_inc(&gt->rps.num_waiters);
2654 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2655 
2656 	mutex_lock(&gt->rps.lock);
2657 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2658 	mutex_unlock(&gt->rps.lock);
2659 }
2660 
2661 static void rps_unpin(struct intel_gt *gt)
2662 {
2663 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2664 	atomic_dec(&gt->rps.num_waiters);
2665 }
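
/*
 * Pinning the GT to its maximum frequency (and holding forcewake) for the
 * duration of perf_request_latency() removes frequency ramping as a source
 * of run-to-run variance in the latencies measured above.
 */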
2666 
2667 static int perf_request_latency(void *arg)
2668 {
2669 	struct drm_i915_private *i915 = arg;
2670 	struct intel_engine_cs *engine;
2671 	struct pm_qos_request qos;
2672 	int err = 0;
2673 
2674 	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2675 		return 0;
2676 
2677 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2678 
2679 	for_each_uabi_engine(engine, i915) {
2680 		struct intel_context *ce;
2681 
2682 		ce = intel_context_create(engine);
2683 		if (IS_ERR(ce)) {
2684 			err = PTR_ERR(ce);
2685 			goto out;
2686 		}
2687 
2688 		err = intel_context_pin(ce);
2689 		if (err) {
2690 			intel_context_put(ce);
2691 			goto out;
2692 		}
2693 
2694 		st_engine_heartbeat_disable(engine);
2695 		rps_pin(engine->gt);
2696 
2697 		if (err == 0)
2698 			err = measure_semaphore_response(ce);
2699 		if (err == 0)
2700 			err = measure_idle_dispatch(ce);
2701 		if (err == 0)
2702 			err = measure_busy_dispatch(ce);
2703 		if (err == 0)
2704 			err = measure_inter_request(ce);
2705 		if (err == 0)
2706 			err = measure_context_switch(ce);
2707 		if (err == 0)
2708 			err = measure_preemption(ce);
2709 		if (err == 0)
2710 			err = measure_completion(ce);
2711 
2712 		rps_unpin(engine->gt);
2713 		st_engine_heartbeat_enable(engine);
2714 
2715 		intel_context_unpin(ce);
2716 		intel_context_put(ce);
2717 		if (err)
2718 			goto out;
2719 	}
2720 
2721 out:
2722 	if (igt_flush_test(i915))
2723 		err = -EIO;
2724 
2725 	cpu_latency_qos_remove_request(&qos);
2726 	return err;
2727 }
2728 
2729 static int s_sync0(void *arg)
2730 {
2731 	struct perf_series *ps = arg;
2732 	IGT_TIMEOUT(end_time);
2733 	unsigned int idx = 0;
2734 	int err = 0;
2735 
2736 	GEM_BUG_ON(!ps->nengines);
2737 	do {
2738 		struct i915_request *rq;
2739 
2740 		rq = i915_request_create(ps->ce[idx]);
2741 		if (IS_ERR(rq)) {
2742 			err = PTR_ERR(rq);
2743 			break;
2744 		}
2745 
2746 		i915_request_get(rq);
2747 		i915_request_add(rq);
2748 
2749 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2750 			err = -ETIME;
2751 		i915_request_put(rq);
2752 		if (err)
2753 			break;
2754 
2755 		if (++idx == ps->nengines)
2756 			idx = 0;
2757 	} while (!__igt_timeout(end_time, NULL));
2758 
2759 	return err;
2760 }
2761 
2762 static int s_sync1(void *arg)
2763 {
2764 	struct perf_series *ps = arg;
2765 	struct i915_request *prev = NULL;
2766 	IGT_TIMEOUT(end_time);
2767 	unsigned int idx = 0;
2768 	int err = 0;
2769 
2770 	GEM_BUG_ON(!ps->nengines);
2771 	do {
2772 		struct i915_request *rq;
2773 
2774 		rq = i915_request_create(ps->ce[idx]);
2775 		if (IS_ERR(rq)) {
2776 			err = PTR_ERR(rq);
2777 			break;
2778 		}
2779 
2780 		i915_request_get(rq);
2781 		i915_request_add(rq);
2782 
2783 		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2784 			err = -ETIME;
2785 		i915_request_put(prev);
2786 		prev = rq;
2787 		if (err)
2788 			break;
2789 
2790 		if (++idx == ps->nengines)
2791 			idx = 0;
2792 	} while (!__igt_timeout(end_time, NULL));
2793 	i915_request_put(prev);
2794 
2795 	return err;
2796 }
2797 
2798 static int s_many(void *arg)
2799 {
2800 	struct perf_series *ps = arg;
2801 	IGT_TIMEOUT(end_time);
2802 	unsigned int idx = 0;
2803 
2804 	GEM_BUG_ON(!ps->nengines);
2805 	do {
2806 		struct i915_request *rq;
2807 
2808 		rq = i915_request_create(ps->ce[idx]);
2809 		if (IS_ERR(rq))
2810 			return PTR_ERR(rq);
2811 
2812 		i915_request_add(rq);
2813 
2814 		if (++idx == ps->nengines)
2815 			idx = 0;
2816 	} while (!__igt_timeout(end_time, NULL));
2817 
2818 	return 0;
2819 }
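
/*
 * The three series workloads above exercise different submission patterns:
 * s_sync0() waits for each request to complete before emitting the next,
 * s_sync1() keeps one request in flight by waiting on the previous one,
 * and s_many() submits without waiting at all, cycling through each
 * engine's context in turn.
 */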
2820 
2821 static int perf_series_engines(void *arg)
2822 {
2823 	struct drm_i915_private *i915 = arg;
2824 	static int (* const func[])(void *arg) = {
2825 		s_sync0,
2826 		s_sync1,
2827 		s_many,
2828 		NULL,
2829 	};
2830 	const unsigned int nengines = num_uabi_engines(i915);
2831 	struct intel_engine_cs *engine;
2832 	int (* const *fn)(void *arg);
2833 	struct pm_qos_request qos;
2834 	struct perf_stats *stats;
2835 	struct perf_series *ps;
2836 	unsigned int idx;
2837 	int err = 0;
2838 
2839 	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2840 	if (!stats)
2841 		return -ENOMEM;
2842 
2843 	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2844 	if (!ps) {
2845 		kfree(stats);
2846 		return -ENOMEM;
2847 	}
2848 
2849 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2850 
2851 	ps->i915 = i915;
2852 	ps->nengines = nengines;
2853 
2854 	idx = 0;
2855 	for_each_uabi_engine(engine, i915) {
2856 		struct intel_context *ce;
2857 
2858 		ce = intel_context_create(engine);
2859 		if (IS_ERR(ce)) {
2860 			err = PTR_ERR(ce);
2861 			goto out;
2862 		}
2863 
2864 		err = intel_context_pin(ce);
2865 		if (err) {
2866 			intel_context_put(ce);
2867 			goto out;
2868 		}
2869 
2870 		ps->ce[idx++] = ce;
2871 	}
2872 	GEM_BUG_ON(idx != ps->nengines);
2873 
2874 	for (fn = func; *fn && !err; fn++) {
2875 		char name[KSYM_NAME_LEN];
2876 		struct igt_live_test t;
2877 
2878 		snprintf(name, sizeof(name), "%ps", *fn);
2879 		err = igt_live_test_begin(&t, i915, __func__, name);
2880 		if (err)
2881 			break;
2882 
2883 		for (idx = 0; idx < nengines; idx++) {
2884 			struct perf_stats *p =
2885 				memset(&stats[idx], 0, sizeof(stats[idx]));
2886 			struct intel_context *ce = ps->ce[idx];
2887 
2888 			p->engine = ps->ce[idx]->engine;
2889 			intel_engine_pm_get(p->engine);
2890 
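			/*
			 * The sampled busy-time is biased by +1 so that a
			 * genuine sample of zero is not mistaken for the
			 * "stats unsupported" case (p->busy == 0); the +1 is
			 * subtracted again when the delta is taken below.
			 */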
2891 			if (intel_engine_supports_stats(p->engine))
2892 				p->busy = intel_engine_get_busy_time(p->engine,
2893 								     &p->time) + 1;
2894 			else
2895 				p->time = ktime_get();
2896 			p->runtime = -intel_context_get_total_runtime_ns(ce);
2897 		}
2898 
2899 		err = (*fn)(ps);
2900 		if (igt_live_test_end(&t))
2901 			err = -EIO;
2902 
2903 		for (idx = 0; idx < nengines; idx++) {
2904 			struct perf_stats *p = &stats[idx];
2905 			struct intel_context *ce = ps->ce[idx];
2906 			int integer, decimal;
2907 			u64 busy, dt, now;
2908 
2909 			if (p->busy)
2910 				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2911 									       &now),
2912 						    p->busy - 1);
2913 			else
2914 				now = ktime_get();
2915 			p->time = ktime_sub(now, p->time);
2916 
2917 			err = switch_to_kernel_sync(ce, err);
2918 			p->runtime += intel_context_get_total_runtime_ns(ce);
2919 			intel_engine_pm_put(p->engine);
2920 
2921 			busy = 100 * ktime_to_ns(p->busy);
2922 			dt = ktime_to_ns(p->time);
2923 			if (dt) {
2924 				integer = div64_u64(busy, dt);
2925 				busy -= integer * dt;
2926 				decimal = div64_u64(100 * busy, dt);
2927 			} else {
2928 				integer = 0;
2929 				decimal = 0;
2930 			}
2931 
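			/*
			 * Report busyness as a fixed-point percentage with
			 * two decimal places: e.g. 756789ns busy over a
			 * 1000000ns wall time yields integer = 75,
			 * decimal = 67, printed as "75.67%".
			 */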
2932 			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2933 				name, p->engine->name, ce->timeline->seqno,
2934 				integer, decimal,
2935 				div_u64(p->runtime, 1000 * 1000),
2936 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2937 		}
2938 	}
2939 
2940 out:
2941 	for (idx = 0; idx < nengines; idx++) {
2942 		if (IS_ERR_OR_NULL(ps->ce[idx]))
2943 			break;
2944 
2945 		intel_context_unpin(ps->ce[idx]);
2946 		intel_context_put(ps->ce[idx]);
2947 	}
2948 	kfree(ps);
2949 
2950 	cpu_latency_qos_remove_request(&qos);
2951 	kfree(stats);
2952 	return err;
2953 }
2954 
2955 struct p_thread {
2956 	struct perf_stats p;
2957 	struct kthread_worker *worker;
2958 	struct kthread_work work;
2959 	struct intel_engine_cs *engine;
2960 	int result;
2961 };
2962 
2963 static void p_sync0(struct kthread_work *work)
2964 {
2965 	struct p_thread *thread = container_of(work, typeof(*thread), work);
2966 	struct perf_stats *p = &thread->p;
2967 	struct intel_engine_cs *engine = p->engine;
2968 	struct intel_context *ce;
2969 	IGT_TIMEOUT(end_time);
2970 	unsigned long count;
2971 	bool busy;
2972 	int err = 0;
2973 
2974 	ce = intel_context_create(engine);
2975 	if (IS_ERR(ce)) {
2976 		thread->result = PTR_ERR(ce);
2977 		return;
2978 	}
2979 
2980 	err = intel_context_pin(ce);
2981 	if (err) {
2982 		intel_context_put(ce);
2983 		thread->result = err;
2984 		return;
2985 	}
2986 
2987 	if (intel_engine_supports_stats(engine)) {
2988 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2989 		busy = true;
2990 	} else {
2991 		p->time = ktime_get();
2992 		busy = false;
2993 	}
2994 
2995 	count = 0;
2996 	do {
2997 		struct i915_request *rq;
2998 
2999 		rq = i915_request_create(ce);
3000 		if (IS_ERR(rq)) {
3001 			err = PTR_ERR(rq);
3002 			break;
3003 		}
3004 
3005 		i915_request_get(rq);
3006 		i915_request_add(rq);
3007 
3008 		err = 0;
3009 		if (i915_request_wait(rq, 0, HZ) < 0)
3010 			err = -ETIME;
3011 		i915_request_put(rq);
3012 		if (err)
3013 			break;
3014 
3015 		count++;
3016 	} while (!__igt_timeout(end_time, NULL));
3017 
3018 	if (busy) {
3019 		ktime_t now;
3020 
3021 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3022 				    p->busy);
3023 		p->time = ktime_sub(now, p->time);
3024 	} else {
3025 		p->time = ktime_sub(ktime_get(), p->time);
3026 	}
3027 
3028 	err = switch_to_kernel_sync(ce, err);
3029 	p->runtime = intel_context_get_total_runtime_ns(ce);
3030 	p->count = count;
3031 
3032 	intel_context_unpin(ce);
3033 	intel_context_put(ce);
3034 	thread->result = err;
3035 }
3036 
3037 static void p_sync1(struct kthread_work *work)
3038 {
3039 	struct p_thread *thread = container_of(work, typeof(*thread), work);
3040 	struct perf_stats *p = &thread->p;
3041 	struct intel_engine_cs *engine = p->engine;
3042 	struct i915_request *prev = NULL;
3043 	struct intel_context *ce;
3044 	IGT_TIMEOUT(end_time);
3045 	unsigned long count;
3046 	bool busy;
3047 	int err = 0;
3048 
3049 	ce = intel_context_create(engine);
3050 	if (IS_ERR(ce)) {
3051 		thread->result = PTR_ERR(ce);
3052 		return;
3053 	}
3054 
3055 	err = intel_context_pin(ce);
3056 	if (err) {
3057 		intel_context_put(ce);
3058 		thread->result = err;
3059 		return;
3060 	}
3061 
3062 	if (intel_engine_supports_stats(engine)) {
3063 		p->busy = intel_engine_get_busy_time(engine, &p->time);
3064 		busy = true;
3065 	} else {
3066 		p->time = ktime_get();
3067 		busy = false;
3068 	}
3069 
3070 	count = 0;
3071 	do {
3072 		struct i915_request *rq;
3073 
3074 		rq = i915_request_create(ce);
3075 		if (IS_ERR(rq)) {
3076 			err = PTR_ERR(rq);
3077 			break;
3078 		}
3079 
3080 		i915_request_get(rq);
3081 		i915_request_add(rq);
3082 
3083 		err = 0;
3084 		if (prev && i915_request_wait(prev, 0, HZ) < 0)
3085 			err = -ETIME;
3086 		i915_request_put(prev);
3087 		prev = rq;
3088 		if (err)
3089 			break;
3090 
3091 		count++;
3092 	} while (!__igt_timeout(end_time, NULL));
3093 	i915_request_put(prev);
3094 
3095 	if (busy) {
3096 		ktime_t now;
3097 
3098 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3099 				    p->busy);
3100 		p->time = ktime_sub(now, p->time);
3101 	} else {
3102 		p->time = ktime_sub(ktime_get(), p->time);
3103 	}
3104 
3105 	err = switch_to_kernel_sync(ce, err);
3106 	p->runtime = intel_context_get_total_runtime_ns(ce);
3107 	p->count = count;
3108 
3109 	intel_context_unpin(ce);
3110 	intel_context_put(ce);
3111 	thread->result = err;
3112 }
3113 
3114 static void p_many(struct kthread_work *work)
3115 {
3116 	struct p_thread *thread = container_of(work, typeof(*thread), work);
3117 	struct perf_stats *p = &thread->p;
3118 	struct intel_engine_cs *engine = p->engine;
3119 	struct intel_context *ce;
3120 	IGT_TIMEOUT(end_time);
3121 	unsigned long count;
3122 	int err = 0;
3123 	bool busy;
3124 
3125 	ce = intel_context_create(engine);
3126 	if (IS_ERR(ce)) {
3127 		thread->result = PTR_ERR(ce);
3128 		return;
3129 	}
3130 
3131 	err = intel_context_pin(ce);
3132 	if (err) {
3133 		intel_context_put(ce);
3134 		thread->result = err;
3135 		return;
3136 	}
3137 
3138 	if (intel_engine_supports_stats(engine)) {
3139 		p->busy = intel_engine_get_busy_time(engine, &p->time);
3140 		busy = true;
3141 	} else {
3142 		p->time = ktime_get();
3143 		busy = false;
3144 	}
3145 
3146 	count = 0;
3147 	do {
3148 		struct i915_request *rq;
3149 
3150 		rq = i915_request_create(ce);
3151 		if (IS_ERR(rq)) {
3152 			err = PTR_ERR(rq);
3153 			break;
3154 		}
3155 
3156 		i915_request_add(rq);
3157 		count++;
3158 	} while (!__igt_timeout(end_time, NULL));
3159 
3160 	if (busy) {
3161 		ktime_t now;
3162 
3163 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3164 				    p->busy);
3165 		p->time = ktime_sub(now, p->time);
3166 	} else {
3167 		p->time = ktime_sub(ktime_get(), p->time);
3168 	}
3169 
3170 	err = switch_to_kernel_sync(ce, err);
3171 	p->runtime = intel_context_get_total_runtime_ns(ce);
3172 	p->count = count;
3173 
3174 	intel_context_unpin(ce);
3175 	intel_context_put(ce);
3176 	thread->result = err;
3177 }
3178 
3179 static int perf_parallel_engines(void *arg)
3180 {
3181 	struct drm_i915_private *i915 = arg;
3182 	static void (* const func[])(struct kthread_work *) = {
3183 		p_sync0,
3184 		p_sync1,
3185 		p_many,
3186 		NULL,
3187 	};
3188 	const unsigned int nengines = num_uabi_engines(i915);
3189 	void (* const *fn)(struct kthread_work *);
3190 	struct intel_engine_cs *engine;
3191 	struct pm_qos_request qos;
3192 	struct p_thread *engines;
3193 	int err = 0;
3194 
3195 	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3196 	if (!engines)
3197 		return -ENOMEM;
3198 
3199 	cpu_latency_qos_add_request(&qos, 0);
3200 
3201 	for (fn = func; *fn; fn++) {
3202 		char name[KSYM_NAME_LEN];
3203 		struct igt_live_test t;
3204 		unsigned int idx;
3205 
3206 		snprintf(name, sizeof(name), "%ps", *fn);
3207 		err = igt_live_test_begin(&t, i915, __func__, name);
3208 		if (err)
3209 			break;
3210 
3211 		atomic_set(&i915->selftest.counter, nengines);
3212 
3213 		idx = 0;
3214 		for_each_uabi_engine(engine, i915) {
3215 			struct kthread_worker *worker;
3216 
3217 			intel_engine_pm_get(engine);
3218 
3219 			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3220 
3221 			worker = kthread_run_worker(0, "igt:%s",
3222 						       engine->name);
3223 			if (IS_ERR(worker)) {
3224 				err = PTR_ERR(worker);
3225 				intel_engine_pm_put(engine);
3226 				break;
3227 			}
3228 			engines[idx].worker = worker;
3229 			engines[idx].result = 0;
3230 			engines[idx].p.engine = engine;
3231 			engines[idx].engine = engine;
3232 
3233 			kthread_init_work(&engines[idx].work, *fn);
3234 			kthread_queue_work(worker, &engines[idx].work);
3235 			idx++;
3236 		}
3237 
3238 		idx = 0;
3239 		for_each_uabi_engine(engine, i915) {
3240 			int status;
3241 
3242 			if (!engines[idx].worker)
3243 				break;
3244 
3245 			kthread_flush_work(&engines[idx].work);
3246 			status = READ_ONCE(engines[idx].result);
3247 			if (status && !err)
3248 				err = status;
3249 
3250 			intel_engine_pm_put(engine);
3251 
3252 			kthread_destroy_worker(engines[idx].worker);
3253 			idx++;
3254 		}
3255 
3256 		if (igt_live_test_end(&t))
3257 			err = -EIO;
3258 		if (err)
3259 			break;
3260 
3261 		idx = 0;
3262 		for_each_uabi_engine(engine, i915) {
3263 			struct perf_stats *p = &engines[idx].p;
3264 			u64 busy = 100 * ktime_to_ns(p->busy);
3265 			u64 dt = ktime_to_ns(p->time);
3266 			int integer, decimal;
3267 
3268 			if (dt) {
3269 				integer = div64_u64(busy, dt);
3270 				busy -= integer * dt;
3271 				decimal = div64_u64(100 * busy, dt);
3272 			} else {
3273 				integer = 0;
3274 				decimal = 0;
3275 			}
3276 
3277 			GEM_BUG_ON(engine != p->engine);
3278 			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3279 				name, engine->name, p->count, integer, decimal,
3280 				div_u64(p->runtime, 1000 * 1000),
3281 				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3282 			idx++;
3283 		}
3284 	}
3285 
3286 	cpu_latency_qos_remove_request(&qos);
3287 	kfree(engines);
3288 	return err;
3289 }
3290 
3291 int i915_request_perf_selftests(struct drm_i915_private *i915)
3292 {
3293 	static const struct i915_subtest tests[] = {
3294 		SUBTEST(perf_request_latency),
3295 		SUBTEST(perf_series_engines),
3296 		SUBTEST(perf_parallel_engines),
3297 	};
3298 
3299 	if (intel_gt_is_wedged(to_gt(i915)))
3300 		return 0;
3301 
3302 	return i915_subtests(tests, i915);
3303 }
3304