// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2019 Intel Corporation
 */

#include <drm/drm_print.h>

#include "i915_selftest.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"

#include "gem/selftests/mock_context.h"
#include "selftests/igt_flush_test.h"
#include "selftests/mock_drm.h"

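/*
 * Submit @rq at barrier priority while keeping its timeline mutex held
 * (open-coding i915_request_add()), wait briefly for it to complete and
 * retire it on success, then drop our references.
 */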
static int request_sync(struct i915_request *rq)
{
	struct intel_timeline *tl = i915_request_timeline(rq);
	long timeout;
	int err = 0;

	intel_timeline_get(tl);
	i915_request_get(rq);

	/* Opencode i915_request_add() so we can keep the timeline locked. */
	__i915_request_commit(rq);
	rq->sched.attr.priority = I915_PRIORITY_BARRIER;
	__i915_request_queue_bh(rq);

	timeout = i915_request_wait(rq, 0, HZ / 10);
	if (timeout < 0)
		err = timeout;
	else
		i915_request_retire_upto(rq);

	lockdep_unpin_lock(&tl->mutex, rq->cookie);
	mutex_unlock(&tl->mutex);

	i915_request_put(rq);
	intel_timeline_put(tl);

	return err;
}

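/*
 * Wait for (and retire) every request outstanding on the context's
 * timeline, then wait for any idle-barrier callbacks still running on
 * remote CPUs before the caller inspects ce->active.
 */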
static int context_sync(struct intel_context *ce)
{
	struct intel_timeline *tl = ce->timeline;
	int err = 0;

	mutex_lock(&tl->mutex);
	do {
		struct i915_request *rq;
		long timeout;

		if (list_empty(&tl->requests))
			break;

		rq = list_last_entry(&tl->requests, typeof(*rq), link);
		i915_request_get(rq);

		timeout = i915_request_wait(rq, 0, HZ / 10);
		if (timeout < 0)
			err = timeout;
		else
			i915_request_retire_upto(rq);

		i915_request_put(rq);
	} while (!err);
	mutex_unlock(&tl->mutex);

	/* Wait for all barriers to complete (remote CPU) before we check */
	i915_active_unlock_wait(&ce->active);
	return err;
}

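/*
 * Poison the final page of the (deliberately oversized) context image and
 * check that the HW context save never writes into it.
 */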
static int __live_context_size(struct intel_engine_cs *engine)
{
	struct intel_context *ce;
	struct i915_request *rq;
	void *vaddr;
	int err;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	err = intel_context_pin(ce);
	if (err)
		goto err;

	vaddr = i915_gem_object_pin_map_unlocked(ce->state->obj,
						 intel_gt_coherent_map_type(engine->gt,
									    ce->state->obj,
									    false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		intel_context_unpin(ce);
		goto err;
	}

	/*
	 * Note that execlists also applies a redzone which it checks on
	 * context unpin when debugging. We are using the same location
	 * and same poison value so that our checks overlap. Despite the
	 * redundancy, we want to keep this little selftest so that we
	 * get coverage of any and all submission backends, and we can
	 * always extend this test to ensure we trick the HW into a
	 * compromising position with respect to the various sections that
	 * need to be written into the context state.
	 *
	 * TLDR; this overlaps with the execlists redzone.
	 */
	vaddr += engine->context_size - I915_GTT_PAGE_SIZE;
	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);

	rq = intel_context_create_request(ce);
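	/*
	 * Drop our explicit pin; a successfully created request holds its
	 * own pin on the context until it is retired.
	 */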
	intel_context_unpin(ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_unpin;
	}

	err = request_sync(rq);
	if (err)
		goto err_unpin;

	/* Force the context switch */
	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_unpin;
	}
	err = request_sync(rq);
	if (err)
		goto err_unpin;

	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) {
		pr_err("%s context overwrote trailing red-zone!\n", engine->name);
		err = -EINVAL;
	}

err_unpin:
	i915_gem_object_unpin_map(ce->state->obj);
err:
	intel_context_put(ce);
	return err;
}

static int live_context_size(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Check that our context sizes are correct by seeing if the
	 * HW tries to write past the end of one.
	 */

	for_each_engine(engine, gt, id) {
		struct file *saved;

		if (!engine->context_size)
			continue;

		intel_engine_pm_get(engine);

		/*
		 * Hide the old default state -- we lie about the context size
		 * and get confused when the default state is smaller than
		 * expected. For our do-nothing request, inheriting the
		 * active state is sufficient; we are only checking that we
		 * don't use more than we planned.
		 */
		saved = fetch_and_zero(&engine->default_state);

		/* Overlaps with the execlists redzone */
		engine->context_size += I915_GTT_PAGE_SIZE;

		err = __live_context_size(engine);

		engine->context_size -= I915_GTT_PAGE_SIZE;

		engine->default_state = saved;

		intel_engine_pm_put(engine);

		if (err)
			break;
	}

	return err;
}

static int __live_active_context(struct intel_engine_cs *engine)
{
	unsigned long saved_heartbeat;
	struct intel_context *ce;
	int pass;
	int err;

	/*
	 * We keep active contexts alive until after a subsequent context
	 * switch as the final write from the context-save will be after
	 * we retire the final request. We track when we unpin the context,
	 * under the presumption that the final pin is from the last request,
	 * and instead of immediately unpinning the context, we add a task
	 * to unpin the context from the next idle-barrier.
	 *
	 * This test makes sure that the context is kept alive until a
	 * subsequent idle-barrier (emitted when the engine wakeref hits 0
	 * with no more outstanding requests).
	 *
	 * In GuC submission mode we don't use idle barriers and we instead
	 * get a message from the GuC to signal that it is safe to unpin the
	 * context from memory.
	 */
	if (intel_engine_uses_guc(engine))
		return 0;

	if (intel_engine_pm_is_awake(engine)) {
		pr_err("%s is awake before starting %s!\n",
		       engine->name, __func__);
		return -EINVAL;
	}

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	saved_heartbeat = engine->props.heartbeat_interval_ms;
	engine->props.heartbeat_interval_ms = 0;

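	/*
	 * Each pass submits a request and drops the engine wakeref again;
	 * the context must stay tracked as active until the idle-barriers
	 * are flushed after the loop.
	 */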
	for (pass = 0; pass <= 2; pass++) {
		struct i915_request *rq;

		intel_engine_pm_get(engine);

		rq = intel_context_create_request(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out_engine;
		}

		err = request_sync(rq);
		if (err)
			goto out_engine;

		/* Context will be kept active until after an idle-barrier. */
		if (i915_active_is_idle(&ce->active)) {
			pr_err("context is not active; expected idle-barrier (%s pass %d)\n",
			       engine->name, pass);
			err = -EINVAL;
			goto out_engine;
		}

		if (!intel_engine_pm_is_awake(engine)) {
			pr_err("%s is asleep before idle-barrier\n",
			       engine->name);
			err = -EINVAL;
			goto out_engine;
		}

out_engine:
		intel_engine_pm_put(engine);
		if (err)
			goto err;
	}

	/* Now make sure our idle-barriers are flushed */
	err = intel_engine_flush_barriers(engine);
	if (err)
		goto err;

	/* Wait for the barrier and in the process wait for engine to park */
	err = context_sync(engine->kernel_context);
	if (err)
		goto err;

	if (!i915_active_is_idle(&ce->active)) {
		pr_err("context is still active!\n");
		err = -EINVAL;
	}

	intel_engine_pm_flush(engine);

	if (intel_engine_pm_is_awake(engine)) {
		struct drm_printer p = drm_dbg_printer(&engine->i915->drm,
						       DRM_UT_DRIVER, NULL);

		intel_engine_dump(engine, &p,
				  "%s is still awake:%d after idle-barriers\n",
				  engine->name,
				  atomic_read(&engine->wakeref.count));
		GEM_TRACE_DUMP();

		err = -EINVAL;
		goto err;
	}

err:
	engine->props.heartbeat_interval_ms = saved_heartbeat;
	intel_context_put(ce);
	return err;
}

static int live_active_context(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	for_each_engine(engine, gt, id) {
		err = __live_active_context(engine);
		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	return err;
}

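/*
 * Submit a request on @ce that operates on @remote's context image via
 * intel_context_prepare_remote_request() and wait for it to complete.
 */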
static int __remote_sync(struct intel_context *ce, struct intel_context *remote)
{
	struct i915_request *rq;
	int err;

	err = intel_context_pin(remote);
	if (err)
		return err;

	rq = intel_context_create_request(ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin;
	}

	err = intel_context_prepare_remote_request(remote, rq);
	if (err) {
		i915_request_add(rq);
		goto unpin;
	}

	err = request_sync(rq);

unpin:
	intel_context_unpin(remote);
	return err;
}

static int __live_remote_context(struct intel_engine_cs *engine)
{
	struct intel_context *local, *remote;
	unsigned long saved_heartbeat;
	int pass;
	int err;

	/*
	 * Check that our idle barriers do not interfere with normal
	 * activity tracking. In particular, check that operating
	 * on the context image remotely (intel_context_prepare_remote_request),
	 * which inserts foreign fences into intel_context.active, does not
	 * clobber the idle-barrier.
	 *
	 * In GuC submission mode we don't use idle barriers.
	 */
	if (intel_engine_uses_guc(engine))
		return 0;

	if (intel_engine_pm_is_awake(engine)) {
		pr_err("%s is awake before starting %s!\n",
		       engine->name, __func__);
		return -EINVAL;
	}

	remote = intel_context_create(engine);
	if (IS_ERR(remote))
		return PTR_ERR(remote);

	local = intel_context_create(engine);
	if (IS_ERR(local)) {
		err = PTR_ERR(local);
		goto err_remote;
	}

	saved_heartbeat = engine->props.heartbeat_interval_ms;
	engine->props.heartbeat_interval_ms = 0;
	intel_engine_pm_get(engine);

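	/*
	 * Operate on the remote context from both an ordinary context and
	 * the kernel context; the foreign fences inserted must not clobber
	 * the remote's idle-barrier, so it must remain tracked as active.
	 */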
	for (pass = 0; pass <= 2; pass++) {
		err = __remote_sync(local, remote);
		if (err)
			break;

		err = __remote_sync(engine->kernel_context, remote);
		if (err)
			break;

		if (i915_active_is_idle(&remote->active)) {
			pr_err("remote context is not active; expected idle-barrier (%s pass %d)\n",
			       engine->name, pass);
			err = -EINVAL;
			break;
		}
	}

	intel_engine_pm_put(engine);
	engine->props.heartbeat_interval_ms = saved_heartbeat;

	intel_context_put(local);
err_remote:
	intel_context_put(remote);
	return err;
}

static int live_remote_context(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	for_each_engine(engine, gt, id) {
		err = __live_remote_context(engine);
		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	return err;
}

int intel_context_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_context_size),
		SUBTEST(live_active_context),
		SUBTEST(live_remote_context),
	};
	struct intel_gt *gt = to_gt(i915);

	if (intel_gt_is_wedged(gt))
		return 0;

	return intel_gt_live_subtests(tests, gt);
}