xref: /linux/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c (revision f6e8dc9edf963dbc99085e54f6ced6da9daa6100)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2019 Intel Corporation
4  */
5 
6 #include <drm/drm_print.h>
7 
8 #include "i915_drv.h"
9 #include "i915_request.h"
10 
11 #include "intel_context.h"
12 #include "intel_engine_heartbeat.h"
13 #include "intel_engine_pm.h"
14 #include "intel_engine.h"
15 #include "intel_gt.h"
16 #include "intel_reset.h"
17 
18 /*
19  * While the engine is active, we send a periodic pulse along the engine
20  * to check on its health and to flush any idle-barriers. If that request
21  * is stuck, and we fail to preempt it, we declare the engine hung and
22  * issue a reset -- in the hope that restores progress.
23  */
24 
25 static bool next_heartbeat(struct intel_engine_cs *engine)
26 {
27 	struct i915_request *rq;
28 	long delay;
29 
30 	delay = READ_ONCE(engine->props.heartbeat_interval_ms);
31 
32 	rq = engine->heartbeat.systole;
33 
34 	/*
35 	 * FIXME: The final period extension is disabled if the period has been
36 	 * modified from the default. This is to prevent issues with certain
37 	 * selftests which override the value and expect specific behaviour.
38 	 * Once the selftests have been updated to either cope with variable
39 	 * heartbeat periods (or to override the pre-emption timeout as well,
40 	 * or just to add a selftest specific override of the extension), the
41 	 * generic override can be removed.
42 	 */
43 	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
44 	    delay == engine->defaults.heartbeat_interval_ms) {
45 		long longer;
46 
47 		/*
48 		 * The final try is at the highest priority possible. Up until now
49 		 * a pre-emption might not even have been attempted. So make sure
50 		 * this last attempt allows enough time for a pre-emption to occur.
51 		 */
52 		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
53 		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
54 		if (longer > delay)
55 			delay = longer;
56 	}
57 
58 	if (!delay)
59 		return false;
60 
61 	delay = msecs_to_jiffies_timeout(delay);
62 	if (delay >= HZ)
63 		delay = round_jiffies_up_relative(delay);
64 	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
65 
66 	return true;
67 }
68 
69 static struct i915_request *
70 heartbeat_create(struct intel_context *ce, gfp_t gfp)
71 {
72 	struct i915_request *rq;
73 
74 	intel_context_enter(ce);
75 	rq = __i915_request_create(ce, gfp);
76 	intel_context_exit(ce);
77 
78 	return rq;
79 }
80 
81 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
82 {
83 	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
84 	i915_request_add_active_barriers(rq);
85 	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
86 		engine->heartbeat.systole = i915_request_get(rq);
87 }
88 
89 static void heartbeat_commit(struct i915_request *rq,
90 			     const struct i915_sched_attr *attr)
91 {
92 	idle_pulse(rq->engine, rq);
93 
94 	__i915_request_commit(rq);
95 	__i915_request_queue(rq, attr);
96 }
97 
98 static void show_heartbeat(const struct i915_request *rq,
99 			   struct intel_engine_cs *engine)
100 {
101 	struct drm_printer p =
102 		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");
103 
104 	if (!rq) {
105 		intel_engine_dump(engine, &p,
106 				  "%s heartbeat not ticking\n",
107 				  engine->name);
108 	} else {
109 		intel_engine_dump(engine, &p,
110 				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
111 				  engine->name,
112 				  rq->fence.context,
113 				  rq->fence.seqno,
114 				  rq->sched.attr.priority);
115 	}
116 }
117 
118 static void
119 reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
120 {
121 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
122 		show_heartbeat(rq, engine);
123 
124 	if (intel_engine_uses_guc(engine))
125 		/*
126 		 * GuC itself is toast or GuC's hang detection
127 		 * is disabled. Either way, need to find the
128 		 * hang culprit manually.
129 		 */
130 		intel_guc_find_hung_context(engine);
131 
132 	intel_gt_handle_error(engine->gt, engine->mask,
133 			      I915_ERROR_CAPTURE,
134 			      "stopped heartbeat on %s",
135 			      engine->name);
136 }
137 
138 static void heartbeat(struct work_struct *wrk)
139 {
140 	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
141 	struct intel_engine_cs *engine =
142 		container_of(wrk, typeof(*engine), heartbeat.work.work);
143 	struct intel_context *ce = engine->kernel_context;
144 	struct i915_request *rq;
145 	unsigned long serial;
146 
147 	/* Just in case everything has gone horribly wrong, give it a kick */
148 	intel_engine_flush_submission(engine);
149 
150 	rq = engine->heartbeat.systole;
151 	if (rq && i915_request_completed(rq)) {
152 		i915_request_put(rq);
153 		engine->heartbeat.systole = NULL;
154 	}
155 
156 	if (!intel_engine_pm_get_if_awake(engine))
157 		return;
158 
159 	if (intel_gt_is_wedged(engine->gt))
160 		goto out;
161 
162 	if (i915_sched_engine_disabled(engine->sched_engine)) {
163 		reset_engine(engine, engine->heartbeat.systole);
164 		goto out;
165 	}
166 
167 	if (engine->heartbeat.systole) {
168 		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
169 
170 		/* Safeguard against too-fast worker invocations */
171 		if (!time_after(jiffies,
172 				rq->emitted_jiffies + msecs_to_jiffies(delay)))
173 			goto out;
174 
175 		if (!i915_sw_fence_signaled(&rq->submit)) {
176 			/*
177 			 * Not yet submitted, system is stalled.
178 			 *
179 			 * This more often happens for ring submission,
180 			 * where all contexts are funnelled into a common
181 			 * ringbuffer. If one context is blocked on an
182 			 * external fence, not only is it not submitted,
183 			 * but all other contexts, including the kernel
184 			 * context are stuck waiting for the signal.
185 			 */
186 		} else if (engine->sched_engine->schedule &&
187 			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
188 			/*
189 			 * Gradually raise the priority of the heartbeat to
190 			 * give high priority work [which presumably desires
191 			 * low latency and no jitter] the chance to naturally
192 			 * complete before being preempted.
193 			 */
194 			attr.priority = I915_PRIORITY_NORMAL;
195 			if (rq->sched.attr.priority >= attr.priority)
196 				attr.priority = I915_PRIORITY_HEARTBEAT;
197 			if (rq->sched.attr.priority >= attr.priority)
198 				attr.priority = I915_PRIORITY_BARRIER;
199 
200 			local_bh_disable();
201 			engine->sched_engine->schedule(rq, &attr);
202 			local_bh_enable();
203 		} else {
204 			reset_engine(engine, rq);
205 		}
206 
207 		rq->emitted_jiffies = jiffies;
208 		goto out;
209 	}
210 
211 	serial = READ_ONCE(engine->serial);
212 	if (engine->wakeref_serial == serial)
213 		goto out;
214 
215 	if (!mutex_trylock(&ce->timeline->mutex)) {
216 		/* Unable to lock the kernel timeline, is the engine stuck? */
217 		if (xchg(&engine->heartbeat.blocked, serial) == serial)
218 			intel_gt_handle_error(engine->gt, engine->mask,
219 					      I915_ERROR_CAPTURE,
220 					      "no heartbeat on %s",
221 					      engine->name);
222 		goto out;
223 	}
224 
225 	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
226 	if (IS_ERR(rq))
227 		goto unlock;
228 
229 	heartbeat_commit(rq, &attr);
230 
231 unlock:
232 	mutex_unlock(&ce->timeline->mutex);
233 out:
234 	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
235 		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
236 	intel_engine_pm_put(engine);
237 }
238 
239 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
240 {
241 	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
242 		return;
243 
244 	next_heartbeat(engine);
245 }
246 
247 void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
248 {
249 	if (cancel_delayed_work(&engine->heartbeat.work))
250 		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
251 }
252 
253 void intel_gt_unpark_heartbeats(struct intel_gt *gt)
254 {
255 	struct intel_engine_cs *engine;
256 	enum intel_engine_id id;
257 
258 	for_each_engine(engine, gt, id)
259 		if (intel_engine_pm_is_awake(engine))
260 			intel_engine_unpark_heartbeat(engine);
261 }
262 
263 void intel_gt_park_heartbeats(struct intel_gt *gt)
264 {
265 	struct intel_engine_cs *engine;
266 	enum intel_engine_id id;
267 
268 	for_each_engine(engine, gt, id)
269 		intel_engine_park_heartbeat(engine);
270 }
271 
272 void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
273 {
274 	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
275 }
276 
277 static int __intel_engine_pulse(struct intel_engine_cs *engine)
278 {
279 	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
280 	struct intel_context *ce = engine->kernel_context;
281 	struct i915_request *rq;
282 
283 	lockdep_assert_held(&ce->timeline->mutex);
284 	GEM_BUG_ON(!intel_engine_has_preemption(engine));
285 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
286 
287 	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
288 	if (IS_ERR(rq))
289 		return PTR_ERR(rq);
290 
291 	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
292 
293 	heartbeat_commit(rq, &attr);
294 	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
295 
296 	/* Ensure the forced pulse gets a full period to execute */
297 	next_heartbeat(engine);
298 
299 	return 0;
300 }
301 
302 static unsigned long set_heartbeat(struct intel_engine_cs *engine,
303 				   unsigned long delay)
304 {
305 	unsigned long old;
306 
307 	old = xchg(&engine->props.heartbeat_interval_ms, delay);
308 	if (delay)
309 		intel_engine_unpark_heartbeat(engine);
310 	else
311 		intel_engine_park_heartbeat(engine);
312 
313 	return old;
314 }
315 
316 int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
317 			       unsigned long delay)
318 {
319 	struct intel_context *ce = engine->kernel_context;
320 	int err = 0;
321 
322 	if (!delay && !intel_engine_has_preempt_reset(engine))
323 		return -ENODEV;
324 
325 	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
326 	if (delay != engine->defaults.heartbeat_interval_ms &&
327 	    delay < 2 * engine->props.preempt_timeout_ms) {
328 		if (intel_engine_uses_guc(engine))
329 			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
330 				   engine->name);
331 		else
332 			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
333 				   engine->name);
334 	}
335 
336 	intel_engine_pm_get(engine);
337 
338 	err = mutex_lock_interruptible(&ce->timeline->mutex);
339 	if (err)
340 		goto out_rpm;
341 
342 	if (delay != engine->props.heartbeat_interval_ms) {
343 		unsigned long saved = set_heartbeat(engine, delay);
344 
345 		/* recheck current execution */
346 		if (intel_engine_has_preemption(engine)) {
347 			err = __intel_engine_pulse(engine);
348 			if (err)
349 				set_heartbeat(engine, saved);
350 		}
351 	}
352 
353 	mutex_unlock(&ce->timeline->mutex);
354 
355 out_rpm:
356 	intel_engine_pm_put(engine);
357 	return err;
358 }
359 
360 int intel_engine_pulse(struct intel_engine_cs *engine)
361 {
362 	struct intel_context *ce = engine->kernel_context;
363 	int err;
364 
365 	if (!intel_engine_has_preemption(engine))
366 		return -ENODEV;
367 
368 	if (!intel_engine_pm_get_if_awake(engine))
369 		return 0;
370 
371 	err = -EINTR;
372 	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
373 		err = __intel_engine_pulse(engine);
374 		mutex_unlock(&ce->timeline->mutex);
375 	}
376 
377 	intel_engine_flush_submission(engine);
378 	intel_engine_pm_put(engine);
379 	return err;
380 }
381 
382 int intel_engine_flush_barriers(struct intel_engine_cs *engine)
383 {
384 	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
385 	struct intel_context *ce = engine->kernel_context;
386 	struct i915_request *rq;
387 	int err;
388 
389 	if (llist_empty(&engine->barrier_tasks))
390 		return 0;
391 
392 	if (!intel_engine_pm_get_if_awake(engine))
393 		return 0;
394 
395 	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
396 		err = -EINTR;
397 		goto out_rpm;
398 	}
399 
400 	rq = heartbeat_create(ce, GFP_KERNEL);
401 	if (IS_ERR(rq)) {
402 		err = PTR_ERR(rq);
403 		goto out_unlock;
404 	}
405 
406 	heartbeat_commit(rq, &attr);
407 
408 	err = 0;
409 out_unlock:
410 	mutex_unlock(&ce->timeline->mutex);
411 out_rpm:
412 	intel_engine_pm_put(engine);
413 	return err;
414 }
415 
416 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
417 #include "selftest_engine_heartbeat.c"
418 #endif
419