xref: /linux/drivers/gpu/drm/i915/gt/selftest_timeline.c (revision ec8a42e7343234802b9054874fe01810880289ce)
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2017-2018 Intel Corporation
5  */
6 
7 #include <linux/prime_numbers.h>
8 
9 #include "intel_context.h"
10 #include "intel_engine_heartbeat.h"
11 #include "intel_engine_pm.h"
12 #include "intel_gt.h"
13 #include "intel_gt_requests.h"
14 #include "intel_ring.h"
15 #include "selftest_engine_heartbeat.h"
16 
17 #include "../selftests/i915_random.h"
18 #include "../i915_selftest.h"
19 
20 #include "selftests/igt_flush_test.h"
21 #include "selftests/lib_sw_fence.h"
22 #include "selftests/mock_gem_device.h"
23 #include "selftests/mock_timeline.h"
24 
25 static struct page *hwsp_page(struct intel_timeline *tl)
26 {
27 	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;
28 
29 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
30 	return sg_page(obj->mm.pages->sgl);
31 }
32 
33 static unsigned long hwsp_cacheline(struct intel_timeline *tl)
34 {
35 	unsigned long address = (unsigned long)page_address(hwsp_page(tl));
36 
37 	return (address + tl->hwsp_offset) / CACHELINE_BYTES;
38 }
39 
/* Number of CACHELINE_BYTES-sized slots that fit in a single page */
#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)
41 
/* Bookkeeping for the mock_hwsp_freelist() selftest */
struct mock_hwsp_freelist {
	/* GT on which the test timelines are created */
	struct intel_gt *gt;
	/* Live timelines keyed by hwsp_cacheline(); detects duplicate slots */
	struct radix_tree_root cachelines;
	/* Array of 'max' timeline pointers, indexed by count % max */
	struct intel_timeline **history;
	/* Running number of recorded timelines, and capacity of 'history' */
	unsigned long count, max;
	/* Random state used for shuffling and choosing how many to release */
	struct rnd_state prng;
};
49 
/* Flags accepted by __mock_hwsp_timeline() */
enum {
	/* Randomly permute the history before releasing entries */
	SHUFFLE = BIT(0),
};
53 
54 static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
55 			       unsigned int idx,
56 			       struct intel_timeline *tl)
57 {
58 	tl = xchg(&state->history[idx], tl);
59 	if (tl) {
60 		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
61 		intel_timeline_put(tl);
62 	}
63 }
64 
65 static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
66 				unsigned int count,
67 				unsigned int flags)
68 {
69 	struct intel_timeline *tl;
70 	unsigned int idx;
71 
72 	while (count--) {
73 		unsigned long cacheline;
74 		int err;
75 
76 		tl = intel_timeline_create(state->gt);
77 		if (IS_ERR(tl))
78 			return PTR_ERR(tl);
79 
80 		cacheline = hwsp_cacheline(tl);
81 		err = radix_tree_insert(&state->cachelines, cacheline, tl);
82 		if (err) {
83 			if (err == -EEXIST) {
84 				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
85 				       cacheline);
86 			}
87 			intel_timeline_put(tl);
88 			return err;
89 		}
90 
91 		idx = state->count++ % state->max;
92 		__mock_hwsp_record(state, idx, tl);
93 	}
94 
95 	if (flags & SHUFFLE)
96 		i915_prandom_shuffle(state->history,
97 				     sizeof(*state->history),
98 				     min(state->count, state->max),
99 				     &state->prng);
100 
101 	count = i915_prandom_u32_max_state(min(state->count, state->max),
102 					   &state->prng);
103 	while (count--) {
104 		idx = --state->count % state->max;
105 		__mock_hwsp_record(state, idx, NULL);
106 	}
107 
108 	return 0;
109 }
110 
111 static int mock_hwsp_freelist(void *arg)
112 {
113 	struct mock_hwsp_freelist state;
114 	struct drm_i915_private *i915;
115 	const struct {
116 		const char *name;
117 		unsigned int flags;
118 	} phases[] = {
119 		{ "linear", 0 },
120 		{ "shuffled", SHUFFLE },
121 		{ },
122 	}, *p;
123 	unsigned int na;
124 	int err = 0;
125 
126 	i915 = mock_gem_device();
127 	if (!i915)
128 		return -ENOMEM;
129 
130 	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
131 	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
132 
133 	state.gt = &i915->gt;
134 
135 	/*
136 	 * Create a bunch of timelines and check that their HWSP do not overlap.
137 	 * Free some, and try again.
138 	 */
139 
140 	state.max = PAGE_SIZE / sizeof(*state.history);
141 	state.count = 0;
142 	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
143 	if (!state.history) {
144 		err = -ENOMEM;
145 		goto err_put;
146 	}
147 
148 	for (p = phases; p->name; p++) {
149 		pr_debug("%s(%s)\n", __func__, p->name);
150 		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
151 			err = __mock_hwsp_timeline(&state, na, p->flags);
152 			if (err)
153 				goto out;
154 		}
155 	}
156 
157 out:
158 	for (na = 0; na < state.max; na++)
159 		__mock_hwsp_record(&state, na, NULL);
160 	kfree(state.history);
161 err_put:
162 	mock_destroy_device(i915);
163 	return err;
164 }
165 
/* One step of the igt_sync() test table; a NULL name terminates the table */
struct __igt_sync {
	/* Label printed when the step fails */
	const char *name;
	/* Seqno to query (and optionally set) */
	u32 seqno;
	/* Expected result of __intel_timeline_sync_is_later() */
	bool expected;
	/* Whether to record the seqno with __intel_timeline_sync_set() */
	bool set;
};
172 
173 static int __igt_sync(struct intel_timeline *tl,
174 		      u64 ctx,
175 		      const struct __igt_sync *p,
176 		      const char *name)
177 {
178 	int ret;
179 
180 	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
181 		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
182 		       name, p->name, ctx, p->seqno, yesno(p->expected));
183 		return -EINVAL;
184 	}
185 
186 	if (p->set) {
187 		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
188 		if (ret)
189 			return ret;
190 	}
191 
192 	return 0;
193 }
194 
195 static int igt_sync(void *arg)
196 {
197 	const struct __igt_sync pass[] = {
198 		{ "unset", 0, false, false },
199 		{ "new", 0, false, true },
200 		{ "0a", 0, true, true },
201 		{ "1a", 1, false, true },
202 		{ "1b", 1, true, true },
203 		{ "0b", 0, true, false },
204 		{ "2a", 2, false, true },
205 		{ "4", 4, false, true },
206 		{ "INT_MAX", INT_MAX, false, true },
207 		{ "INT_MAX-1", INT_MAX-1, true, false },
208 		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
209 		{ "INT_MAX", INT_MAX, true, false },
210 		{ "UINT_MAX", UINT_MAX, false, true },
211 		{ "wrap", 0, false, true },
212 		{ "unwrap", UINT_MAX, true, false },
213 		{},
214 	}, *p;
215 	struct intel_timeline tl;
216 	int order, offset;
217 	int ret = -ENODEV;
218 
219 	mock_timeline_init(&tl, 0);
220 	for (p = pass; p->name; p++) {
221 		for (order = 1; order < 64; order++) {
222 			for (offset = -1; offset <= (order > 1); offset++) {
223 				u64 ctx = BIT_ULL(order) + offset;
224 
225 				ret = __igt_sync(&tl, ctx, p, "1");
226 				if (ret)
227 					goto out;
228 			}
229 		}
230 	}
231 	mock_timeline_fini(&tl);
232 
233 	mock_timeline_init(&tl, 0);
234 	for (order = 1; order < 64; order++) {
235 		for (offset = -1; offset <= (order > 1); offset++) {
236 			u64 ctx = BIT_ULL(order) + offset;
237 
238 			for (p = pass; p->name; p++) {
239 				ret = __igt_sync(&tl, ctx, p, "2");
240 				if (ret)
241 					goto out;
242 			}
243 		}
244 	}
245 
246 out:
247 	mock_timeline_fini(&tl);
248 	return ret;
249 }
250 
251 static unsigned int random_engine(struct rnd_state *rnd)
252 {
253 	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
254 }
255 
/*
 * Micro-benchmark of the timeline sync map on a mock timeline. Each phase
 * runs for roughly HZ/10 jiffies (or replays the ids of the preceding
 * phase) and reports the average time per operation:
 *  - random context id insertions, then lookups of the same ids;
 *  - in-order context id insertions, then lookups of the same ids;
 *  - repeated lookup-and-maybe-set on a small set of ids;
 *  - cyclic lookup+set patterns for a series of growing orders.
 *
 * Returns 0, or -EINVAL if an id that was just set cannot be found.
 */
static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself becomes a significant factor in the per-iteration
	 * timings. We try to compensate the results by measuring the overhead
	 * of the prng and subtract it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	/* Per-call prng cost in ns, scaled by 2^20 to retain precision */
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	/*
	 * Discount the measured prng overhead, charged at two 32b draws per
	 * iteration (presumably one u64 id costs two u32 evaluations).
	 */
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	/* end_time is reused here as a countdown of ids to replay */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();

	/* Benchmark searching for a known context id and changing the seqno */
	/* The orders visited follow a Fibonacci-like progression below 32 */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes
			 * (if any)!
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}
411 
412 int intel_timeline_mock_selftests(void)
413 {
414 	static const struct i915_subtest tests[] = {
415 		SUBTEST(mock_hwsp_freelist),
416 		SUBTEST(igt_sync),
417 		SUBTEST(bench_sync),
418 	};
419 
420 	return i915_subtests(tests, NULL);
421 }
422 
423 static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
424 {
425 	u32 *cs;
426 
427 	cs = intel_ring_begin(rq, 4);
428 	if (IS_ERR(cs))
429 		return PTR_ERR(cs);
430 
431 	if (INTEL_GEN(rq->engine->i915) >= 8) {
432 		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
433 		*cs++ = addr;
434 		*cs++ = 0;
435 		*cs++ = value;
436 	} else if (INTEL_GEN(rq->engine->i915) >= 4) {
437 		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
438 		*cs++ = 0;
439 		*cs++ = addr;
440 		*cs++ = value;
441 	} else {
442 		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
443 		*cs++ = addr;
444 		*cs++ = value;
445 		*cs++ = MI_NOOP;
446 	}
447 
448 	intel_ring_advance(rq, cs);
449 
450 	return 0;
451 }
452 
453 static struct i915_request *
454 tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
455 {
456 	struct i915_request *rq;
457 	int err;
458 
459 	err = intel_timeline_pin(tl, NULL);
460 	if (err) {
461 		rq = ERR_PTR(err);
462 		goto out;
463 	}
464 
465 	rq = intel_engine_create_kernel_request(engine);
466 	if (IS_ERR(rq))
467 		goto out_unpin;
468 
469 	i915_request_get(rq);
470 
471 	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
472 	i915_request_add(rq);
473 	if (err) {
474 		i915_request_put(rq);
475 		rq = ERR_PTR(err);
476 	}
477 
478 out_unpin:
479 	intel_timeline_unpin(tl);
480 out:
481 	if (IS_ERR(rq))
482 		pr_err("Failed to write to timeline!\n");
483 	return rq;
484 }
485 
486 static struct intel_timeline *
487 checked_intel_timeline_create(struct intel_gt *gt)
488 {
489 	struct intel_timeline *tl;
490 
491 	tl = intel_timeline_create(gt);
492 	if (IS_ERR(tl))
493 		return tl;
494 
495 	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
496 		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
497 		       *tl->hwsp_seqno, tl->seqno);
498 		intel_timeline_put(tl);
499 		return ERR_PTR(-EINVAL);
500 	}
501 
502 	return tl;
503 }
504 
505 static int live_hwsp_engine(void *arg)
506 {
507 #define NUM_TIMELINES 4096
508 	struct intel_gt *gt = arg;
509 	struct intel_timeline **timelines;
510 	struct intel_engine_cs *engine;
511 	enum intel_engine_id id;
512 	unsigned long count, n;
513 	int err = 0;
514 
515 	/*
516 	 * Create a bunch of timelines and check we can write
517 	 * independently to each of their breadcrumb slots.
518 	 */
519 
520 	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
521 				   sizeof(*timelines),
522 				   GFP_KERNEL);
523 	if (!timelines)
524 		return -ENOMEM;
525 
526 	count = 0;
527 	for_each_engine(engine, gt, id) {
528 		if (!intel_engine_can_store_dword(engine))
529 			continue;
530 
531 		intel_engine_pm_get(engine);
532 
533 		for (n = 0; n < NUM_TIMELINES; n++) {
534 			struct intel_timeline *tl;
535 			struct i915_request *rq;
536 
537 			tl = checked_intel_timeline_create(gt);
538 			if (IS_ERR(tl)) {
539 				err = PTR_ERR(tl);
540 				break;
541 			}
542 
543 			rq = tl_write(tl, engine, count);
544 			if (IS_ERR(rq)) {
545 				intel_timeline_put(tl);
546 				err = PTR_ERR(rq);
547 				break;
548 			}
549 
550 			timelines[count++] = tl;
551 			i915_request_put(rq);
552 		}
553 
554 		intel_engine_pm_put(engine);
555 		if (err)
556 			break;
557 	}
558 
559 	if (igt_flush_test(gt->i915))
560 		err = -EIO;
561 
562 	for (n = 0; n < count; n++) {
563 		struct intel_timeline *tl = timelines[n];
564 
565 		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
566 			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
567 				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
568 			GEM_TRACE_DUMP();
569 			err = -EINVAL;
570 		}
571 		intel_timeline_put(tl);
572 	}
573 
574 	kvfree(timelines);
575 	return err;
576 #undef NUM_TIMELINES
577 }
578 
579 static int live_hwsp_alternate(void *arg)
580 {
581 #define NUM_TIMELINES 4096
582 	struct intel_gt *gt = arg;
583 	struct intel_timeline **timelines;
584 	struct intel_engine_cs *engine;
585 	enum intel_engine_id id;
586 	unsigned long count, n;
587 	int err = 0;
588 
589 	/*
590 	 * Create a bunch of timelines and check we can write
591 	 * independently to each of their breadcrumb slots with adjacent
592 	 * engines.
593 	 */
594 
595 	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
596 				   sizeof(*timelines),
597 				   GFP_KERNEL);
598 	if (!timelines)
599 		return -ENOMEM;
600 
601 	count = 0;
602 	for (n = 0; n < NUM_TIMELINES; n++) {
603 		for_each_engine(engine, gt, id) {
604 			struct intel_timeline *tl;
605 			struct i915_request *rq;
606 
607 			if (!intel_engine_can_store_dword(engine))
608 				continue;
609 
610 			tl = checked_intel_timeline_create(gt);
611 			if (IS_ERR(tl)) {
612 				err = PTR_ERR(tl);
613 				goto out;
614 			}
615 
616 			intel_engine_pm_get(engine);
617 			rq = tl_write(tl, engine, count);
618 			intel_engine_pm_put(engine);
619 			if (IS_ERR(rq)) {
620 				intel_timeline_put(tl);
621 				err = PTR_ERR(rq);
622 				goto out;
623 			}
624 
625 			timelines[count++] = tl;
626 			i915_request_put(rq);
627 		}
628 	}
629 
630 out:
631 	if (igt_flush_test(gt->i915))
632 		err = -EIO;
633 
634 	for (n = 0; n < count; n++) {
635 		struct intel_timeline *tl = timelines[n];
636 
637 		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
638 			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
639 				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
640 			GEM_TRACE_DUMP();
641 			err = -EINVAL;
642 		}
643 		intel_timeline_put(tl);
644 	}
645 
646 	kvfree(timelines);
647 	return err;
648 #undef NUM_TIMELINES
649 }
650 
/*
 * Live selftest: force a seqno wrap on a private timeline in the middle of
 * a request and check that the values written before and after the wrap
 * land in two different HWSP slots, each holding its expected seqno.
 *
 * The test is skipped (returns 0) if the timeline has no initial breadcrumb
 * or no hwsp_cacheline. Returns 0 on success or a negative error code.
 */
static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
		goto out_free;

	err = intel_timeline_pin(tl, NULL);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

		/*
		 * Park the seqno just below the u32 limit; the assertions
		 * further down expect the second get_seqno to wrap.
		 */
		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			/* A created request must still be added before bailing */
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		/* Remember where the pre-wrap value is expected to appear */
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		/* Both the old and the new slot must hold their own seqno */
		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}
758 
759 static int emit_read_hwsp(struct i915_request *rq,
760 			  u32 seqno, u32 hwsp,
761 			  u32 *addr)
762 {
763 	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
764 	u32 *cs;
765 
766 	cs = intel_ring_begin(rq, 12);
767 	if (IS_ERR(cs))
768 		return PTR_ERR(cs);
769 
770 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
771 	*cs++ = *addr;
772 	*cs++ = 0;
773 	*cs++ = seqno;
774 	*addr += 4;
775 
776 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
777 	*cs++ = gpr;
778 	*cs++ = hwsp;
779 	*cs++ = 0;
780 
781 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
782 	*cs++ = gpr;
783 	*cs++ = *addr;
784 	*cs++ = 0;
785 	*addr += 4;
786 
787 	intel_ring_advance(rq, cs);
788 
789 	return 0;
790 }
791 
/* A request that samples HWSP values into a buffer for later checking */
struct hwsp_watcher {
	/* GGTT-pinned buffer receiving the (seqno, HWSP value) pairs */
	struct i915_vma *vma;
	/* Request under construction; consumed by check_watcher() */
	struct i915_request *rq;
	/* Next GGTT address to be written, advanced by emit_read_hwsp() */
	u32 addr;
	/* CPU mapping of the buffer, set up by setup_watcher() */
	u32 *map;
};
798 
799 static bool cmp_lt(u32 a, u32 b)
800 {
801 	return a < b;
802 }
803 
804 static bool cmp_gte(u32 a, u32 b)
805 {
806 	return a >= b;
807 }
808 
809 static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
810 {
811 	struct drm_i915_gem_object *obj;
812 	struct i915_vma *vma;
813 
814 	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
815 	if (IS_ERR(obj))
816 		return PTR_ERR(obj);
817 
818 	w->map = i915_gem_object_pin_map(obj, I915_MAP_WB);
819 	if (IS_ERR(w->map)) {
820 		i915_gem_object_put(obj);
821 		return PTR_ERR(w->map);
822 	}
823 
824 	vma = i915_gem_object_ggtt_pin_ww(obj, NULL, NULL, 0, 0, 0);
825 	if (IS_ERR(vma)) {
826 		i915_gem_object_put(obj);
827 		return PTR_ERR(vma);
828 	}
829 
830 	w->vma = vma;
831 	w->addr = i915_ggtt_offset(vma);
832 	return 0;
833 }
834 
835 static int create_watcher(struct hwsp_watcher *w,
836 			  struct intel_engine_cs *engine,
837 			  int ringsz)
838 {
839 	struct intel_context *ce;
840 	struct intel_timeline *tl;
841 
842 	ce = intel_context_create(engine);
843 	if (IS_ERR(ce))
844 		return PTR_ERR(ce);
845 
846 	ce->ring = __intel_context_ring_size(ringsz);
847 	w->rq = intel_context_create_request(ce);
848 	intel_context_put(ce);
849 	if (IS_ERR(w->rq))
850 		return PTR_ERR(w->rq);
851 
852 	w->addr = i915_ggtt_offset(w->vma);
853 	tl = w->rq->context->timeline;
854 
855 	/* some light mutex juggling required; think co-routines */
856 	lockdep_unpin_lock(&tl->mutex, w->rq->cookie);
857 	mutex_unlock(&tl->mutex);
858 
859 	return 0;
860 }
861 
862 static int check_watcher(struct hwsp_watcher *w, const char *name,
863 			 bool (*op)(u32 hwsp, u32 seqno))
864 {
865 	struct i915_request *rq = fetch_and_zero(&w->rq);
866 	struct intel_timeline *tl = rq->context->timeline;
867 	u32 offset, end;
868 	int err;
869 
870 	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);
871 
872 	i915_request_get(rq);
873 	mutex_lock(&tl->mutex);
874 	rq->cookie = lockdep_pin_lock(&tl->mutex);
875 	i915_request_add(rq);
876 
877 	if (i915_request_wait(rq, 0, HZ) < 0) {
878 		err = -ETIME;
879 		goto out;
880 	}
881 
882 	err = 0;
883 	offset = 0;
884 	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
885 	while (offset < end) {
886 		if (!op(w->map[offset + 1], w->map[offset])) {
887 			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
888 			       name, w->map[offset + 1], w->map[offset]);
889 			err = -EINVAL;
890 		}
891 
892 		offset += 2;
893 	}
894 
895 out:
896 	i915_request_put(rq);
897 	return err;
898 }
899 
900 static void cleanup_watcher(struct hwsp_watcher *w)
901 {
902 	if (w->rq) {
903 		struct intel_timeline *tl = w->rq->context->timeline;
904 
905 		mutex_lock(&tl->mutex);
906 		w->rq->cookie = lockdep_pin_lock(&tl->mutex);
907 
908 		i915_request_add(w->rq);
909 	}
910 
911 	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
912 }
913 
914 static bool retire_requests(struct intel_timeline *tl)
915 {
916 	struct i915_request *rq, *rn;
917 
918 	mutex_lock(&tl->mutex);
919 	list_for_each_entry_safe(rq, rn, &tl->requests, link)
920 		if (!i915_request_retire(rq))
921 			break;
922 	mutex_unlock(&tl->mutex);
923 
924 	return !i915_active_fence_isset(&tl->last_request);
925 }
926 
927 static struct i915_request *wrap_timeline(struct i915_request *rq)
928 {
929 	struct intel_context *ce = rq->context;
930 	struct intel_timeline *tl = ce->timeline;
931 	u32 seqno = rq->fence.seqno;
932 
933 	while (tl->seqno >= seqno) { /* Cause a wrap */
934 		i915_request_put(rq);
935 		rq = intel_context_create_request(ce);
936 		if (IS_ERR(rq))
937 			return rq;
938 
939 		i915_request_get(rq);
940 		i915_request_add(rq);
941 	}
942 
943 	i915_request_put(rq);
944 	rq = intel_context_create_request(ce);
945 	if (IS_ERR(rq))
946 		return rq;
947 
948 	i915_request_get(rq);
949 	i915_request_add(rq);
950 
951 	return rq;
952 }
953 
954 static int live_hwsp_read(void *arg)
955 {
956 	struct intel_gt *gt = arg;
957 	struct hwsp_watcher watcher[2] = {};
958 	struct intel_engine_cs *engine;
959 	struct intel_timeline *tl;
960 	enum intel_engine_id id;
961 	int err = 0;
962 	int i;
963 
964 	/*
965 	 * If we take a reference to the HWSP for reading on the GPU, that
966 	 * read may be arbitrarily delayed (either by foreign fence or
967 	 * priority saturation) and a wrap can happen within 30 minutes.
968 	 * When the GPU read is finally submitted it should be correct,
969 	 * even across multiple wraps.
970 	 */
971 
972 	if (INTEL_GEN(gt->i915) < 8) /* CS convenience [SRM/LRM] */
973 		return 0;
974 
975 	tl = intel_timeline_create(gt);
976 	if (IS_ERR(tl))
977 		return PTR_ERR(tl);
978 
979 	if (!tl->hwsp_cacheline)
980 		goto out_free;
981 
982 	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
983 		err = setup_watcher(&watcher[i], gt);
984 		if (err)
985 			goto out;
986 	}
987 
988 	for_each_engine(engine, gt, id) {
989 		struct intel_context *ce;
990 		unsigned long count = 0;
991 		IGT_TIMEOUT(end_time);
992 
993 		/* Create a request we can use for remote reading of the HWSP */
994 		err = create_watcher(&watcher[1], engine, SZ_512K);
995 		if (err)
996 			goto out;
997 
998 		do {
999 			struct i915_sw_fence *submit;
1000 			struct i915_request *rq;
1001 			u32 hwsp;
1002 
1003 			submit = heap_fence_create(GFP_KERNEL);
1004 			if (!submit) {
1005 				err = -ENOMEM;
1006 				goto out;
1007 			}
1008 
1009 			err = create_watcher(&watcher[0], engine, SZ_4K);
1010 			if (err)
1011 				goto out;
1012 
1013 			ce = intel_context_create(engine);
1014 			if (IS_ERR(ce)) {
1015 				err = PTR_ERR(ce);
1016 				goto out;
1017 			}
1018 
1019 			/* Skip to the end, saving 30 minutes of nops */
1020 			tl->seqno = -10u + 2 * (count & 3);
1021 			WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1022 			ce->timeline = intel_timeline_get(tl);
1023 
1024 			rq = intel_context_create_request(ce);
1025 			if (IS_ERR(rq)) {
1026 				err = PTR_ERR(rq);
1027 				intel_context_put(ce);
1028 				goto out;
1029 			}
1030 
1031 			err = i915_sw_fence_await_dma_fence(&rq->submit,
1032 							    &watcher[0].rq->fence, 0,
1033 							    GFP_KERNEL);
1034 			if (err < 0) {
1035 				i915_request_add(rq);
1036 				intel_context_put(ce);
1037 				goto out;
1038 			}
1039 
1040 			mutex_lock(&watcher[0].rq->context->timeline->mutex);
1041 			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
1042 			if (err == 0)
1043 				err = emit_read_hwsp(watcher[0].rq, /* before */
1044 						     rq->fence.seqno, hwsp,
1045 						     &watcher[0].addr);
1046 			mutex_unlock(&watcher[0].rq->context->timeline->mutex);
1047 			if (err) {
1048 				i915_request_add(rq);
1049 				intel_context_put(ce);
1050 				goto out;
1051 			}
1052 
1053 			mutex_lock(&watcher[1].rq->context->timeline->mutex);
1054 			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
1055 			if (err == 0)
1056 				err = emit_read_hwsp(watcher[1].rq, /* after */
1057 						     rq->fence.seqno, hwsp,
1058 						     &watcher[1].addr);
1059 			mutex_unlock(&watcher[1].rq->context->timeline->mutex);
1060 			if (err) {
1061 				i915_request_add(rq);
1062 				intel_context_put(ce);
1063 				goto out;
1064 			}
1065 
1066 			i915_request_get(rq);
1067 			i915_request_add(rq);
1068 
1069 			rq = wrap_timeline(rq);
1070 			intel_context_put(ce);
1071 			if (IS_ERR(rq)) {
1072 				err = PTR_ERR(rq);
1073 				goto out;
1074 			}
1075 
1076 			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
1077 							    &rq->fence, 0,
1078 							    GFP_KERNEL);
1079 			if (err < 0) {
1080 				i915_request_put(rq);
1081 				goto out;
1082 			}
1083 
1084 			err = check_watcher(&watcher[0], "before", cmp_lt);
1085 			i915_sw_fence_commit(submit);
1086 			heap_fence_put(submit);
1087 			if (err) {
1088 				i915_request_put(rq);
1089 				goto out;
1090 			}
1091 			count++;
1092 
1093 			if (8 * watcher[1].rq->ring->emit >
1094 			    3 * watcher[1].rq->ring->size) {
1095 				i915_request_put(rq);
1096 				break;
1097 			}
1098 
1099 			/* Flush the timeline before manually wrapping again */
1100 			if (i915_request_wait(rq,
1101 					      I915_WAIT_INTERRUPTIBLE,
1102 					      HZ) < 0) {
1103 				err = -ETIME;
1104 				i915_request_put(rq);
1105 				goto out;
1106 			}
1107 
1108 			retire_requests(tl);
1109 			i915_request_put(rq);
1110 		} while (!__igt_timeout(end_time, NULL));
1111 		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, 0xdeadbeef);
1112 
1113 		pr_info("%s: simulated %lu wraps\n", engine->name, count);
1114 		err = check_watcher(&watcher[1], "after", cmp_gte);
1115 		if (err)
1116 			goto out;
1117 	}
1118 
1119 out:
1120 	for (i = 0; i < ARRAY_SIZE(watcher); i++)
1121 		cleanup_watcher(&watcher[i]);
1122 
1123 	if (igt_flush_test(gt->i915))
1124 		err = -EIO;
1125 
1126 out_free:
1127 	intel_timeline_put(tl);
1128 	return err;
1129 }
1130 
1131 static int live_hwsp_rollover_kernel(void *arg)
1132 {
1133 	struct intel_gt *gt = arg;
1134 	struct intel_engine_cs *engine;
1135 	enum intel_engine_id id;
1136 	int err = 0;
1137 
1138 	/*
1139 	 * Run the host for long enough, and even the kernel context will
1140 	 * see a seqno rollover.
1141 	 */
1142 
1143 	for_each_engine(engine, gt, id) {
1144 		struct intel_context *ce = engine->kernel_context;
1145 		struct intel_timeline *tl = ce->timeline;
1146 		struct i915_request *rq[3] = {};
1147 		int i;
1148 
1149 		st_engine_heartbeat_disable(engine);
1150 		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
1151 			err = -EIO;
1152 			goto out;
1153 		}
1154 
1155 		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
1156 		tl->seqno = 0;
1157 		timeline_rollback(tl);
1158 		timeline_rollback(tl);
1159 		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1160 
1161 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1162 			struct i915_request *this;
1163 
1164 			this = i915_request_create(ce);
1165 			if (IS_ERR(this)) {
1166 				err = PTR_ERR(this);
1167 				goto out;
1168 			}
1169 
1170 			pr_debug("%s: create fence.seqnp:%d\n",
1171 				 engine->name,
1172 				 lower_32_bits(this->fence.seqno));
1173 
1174 			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1175 
1176 			rq[i] = i915_request_get(this);
1177 			i915_request_add(this);
1178 		}
1179 
1180 		/* We expected a wrap! */
1181 		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1182 
1183 		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1184 			pr_err("Wait for timeline wrap timed out!\n");
1185 			err = -EIO;
1186 			goto out;
1187 		}
1188 
1189 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1190 			if (!i915_request_completed(rq[i])) {
1191 				pr_err("Pre-wrap request not completed!\n");
1192 				err = -EINVAL;
1193 				goto out;
1194 			}
1195 		}
1196 
1197 out:
1198 		for (i = 0; i < ARRAY_SIZE(rq); i++)
1199 			i915_request_put(rq[i]);
1200 		st_engine_heartbeat_enable(engine);
1201 		if (err)
1202 			break;
1203 	}
1204 
1205 	if (igt_flush_test(gt->i915))
1206 		err = -EIO;
1207 
1208 	return err;
1209 }
1210 
1211 static int live_hwsp_rollover_user(void *arg)
1212 {
1213 	struct intel_gt *gt = arg;
1214 	struct intel_engine_cs *engine;
1215 	enum intel_engine_id id;
1216 	int err = 0;
1217 
1218 	/*
1219 	 * Simulate a long running user context, and force the seqno wrap
1220 	 * on the user's timeline.
1221 	 */
1222 
1223 	for_each_engine(engine, gt, id) {
1224 		struct i915_request *rq[3] = {};
1225 		struct intel_timeline *tl;
1226 		struct intel_context *ce;
1227 		int i;
1228 
1229 		ce = intel_context_create(engine);
1230 		if (IS_ERR(ce))
1231 			return PTR_ERR(ce);
1232 
1233 		err = intel_context_alloc_state(ce);
1234 		if (err)
1235 			goto out;
1236 
1237 		tl = ce->timeline;
1238 		if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
1239 			goto out;
1240 
1241 		timeline_rollback(tl);
1242 		timeline_rollback(tl);
1243 		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1244 
1245 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1246 			struct i915_request *this;
1247 
1248 			this = intel_context_create_request(ce);
1249 			if (IS_ERR(this)) {
1250 				err = PTR_ERR(this);
1251 				goto out;
1252 			}
1253 
1254 			pr_debug("%s: create fence.seqnp:%d\n",
1255 				 engine->name,
1256 				 lower_32_bits(this->fence.seqno));
1257 
1258 			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1259 
1260 			rq[i] = i915_request_get(this);
1261 			i915_request_add(this);
1262 		}
1263 
1264 		/* We expected a wrap! */
1265 		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1266 
1267 		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1268 			pr_err("Wait for timeline wrap timed out!\n");
1269 			err = -EIO;
1270 			goto out;
1271 		}
1272 
1273 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1274 			if (!i915_request_completed(rq[i])) {
1275 				pr_err("Pre-wrap request not completed!\n");
1276 				err = -EINVAL;
1277 				goto out;
1278 			}
1279 		}
1280 
1281 out:
1282 		for (i = 0; i < ARRAY_SIZE(rq); i++)
1283 			i915_request_put(rq[i]);
1284 		intel_context_put(ce);
1285 		if (err)
1286 			break;
1287 	}
1288 
1289 	if (igt_flush_test(gt->i915))
1290 		err = -EIO;
1291 
1292 	return err;
1293 }
1294 
1295 static int live_hwsp_recycle(void *arg)
1296 {
1297 	struct intel_gt *gt = arg;
1298 	struct intel_engine_cs *engine;
1299 	enum intel_engine_id id;
1300 	unsigned long count;
1301 	int err = 0;
1302 
1303 	/*
1304 	 * Check seqno writes into one timeline at a time. We expect to
1305 	 * recycle the breadcrumb slot between iterations and neither
1306 	 * want to confuse ourselves or the GPU.
1307 	 */
1308 
1309 	count = 0;
1310 	for_each_engine(engine, gt, id) {
1311 		IGT_TIMEOUT(end_time);
1312 
1313 		if (!intel_engine_can_store_dword(engine))
1314 			continue;
1315 
1316 		intel_engine_pm_get(engine);
1317 
1318 		do {
1319 			struct intel_timeline *tl;
1320 			struct i915_request *rq;
1321 
1322 			tl = checked_intel_timeline_create(gt);
1323 			if (IS_ERR(tl)) {
1324 				err = PTR_ERR(tl);
1325 				break;
1326 			}
1327 
1328 			rq = tl_write(tl, engine, count);
1329 			if (IS_ERR(rq)) {
1330 				intel_timeline_put(tl);
1331 				err = PTR_ERR(rq);
1332 				break;
1333 			}
1334 
1335 			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1336 				pr_err("Wait for timeline writes timed out!\n");
1337 				i915_request_put(rq);
1338 				intel_timeline_put(tl);
1339 				err = -EIO;
1340 				break;
1341 			}
1342 
1343 			if (READ_ONCE(*tl->hwsp_seqno) != count) {
1344 				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
1345 					      count, tl->fence_context,
1346 					      tl->hwsp_offset, *tl->hwsp_seqno);
1347 				GEM_TRACE_DUMP();
1348 				err = -EINVAL;
1349 			}
1350 
1351 			i915_request_put(rq);
1352 			intel_timeline_put(tl);
1353 			count++;
1354 
1355 			if (err)
1356 				break;
1357 		} while (!__igt_timeout(end_time, NULL));
1358 
1359 		intel_engine_pm_put(engine);
1360 		if (err)
1361 			break;
1362 	}
1363 
1364 	return err;
1365 }
1366 
1367 int intel_timeline_live_selftests(struct drm_i915_private *i915)
1368 {
1369 	static const struct i915_subtest tests[] = {
1370 		SUBTEST(live_hwsp_recycle),
1371 		SUBTEST(live_hwsp_engine),
1372 		SUBTEST(live_hwsp_alternate),
1373 		SUBTEST(live_hwsp_wrap),
1374 		SUBTEST(live_hwsp_read),
1375 		SUBTEST(live_hwsp_rollover_kernel),
1376 		SUBTEST(live_hwsp_rollover_user),
1377 	};
1378 
1379 	if (intel_gt_is_wedged(&i915->gt))
1380 		return 0;
1381 
1382 	return intel_gt_live_subtests(tests, &i915->gt);
1383 }
1384