xref: /linux/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c (revision 88a8e278ff0b6b461bf39d4ace17384e976a3f3f)
1 /*
2  * SPDX-License-Identifier: MIT
3  *
4  * Copyright © 2019 Intel Corporation
5  */
6 
7 #include "i915_request.h"
8 
9 #include "intel_context.h"
10 #include "intel_engine_heartbeat.h"
11 #include "intel_engine_pm.h"
12 #include "intel_engine.h"
13 #include "intel_gt.h"
14 #include "intel_reset.h"
15 
16 /*
17  * While the engine is active, we send a periodic pulse along the engine
18  * to check on its health and to flush any idle-barriers. If that request
19  * is stuck, and we fail to preempt it, we declare the engine hung and
20  * issue a reset -- in the hope that restores progress.
21  */
22 
23 static bool next_heartbeat(struct intel_engine_cs *engine)
24 {
25 	long delay;
26 
27 	delay = READ_ONCE(engine->props.heartbeat_interval_ms);
28 	if (!delay)
29 		return false;
30 
31 	delay = msecs_to_jiffies_timeout(delay);
32 	if (delay >= HZ)
33 		delay = round_jiffies_up_relative(delay);
34 	schedule_delayed_work(&engine->heartbeat.work, delay);
35 
36 	return true;
37 }
38 
39 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
40 {
41 	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
42 	i915_request_add_active_barriers(rq);
43 }
44 
45 static void show_heartbeat(const struct i915_request *rq,
46 			   struct intel_engine_cs *engine)
47 {
48 	struct drm_printer p = drm_debug_printer("heartbeat");
49 
50 	intel_engine_dump(engine, &p,
51 			  "%s heartbeat {prio:%d} not ticking\n",
52 			  engine->name,
53 			  rq->sched.attr.priority);
54 }
55 
56 static void heartbeat(struct work_struct *wrk)
57 {
58 	struct i915_sched_attr attr = {
59 		.priority = I915_USER_PRIORITY(I915_PRIORITY_MIN),
60 	};
61 	struct intel_engine_cs *engine =
62 		container_of(wrk, typeof(*engine), heartbeat.work.work);
63 	struct intel_context *ce = engine->kernel_context;
64 	struct i915_request *rq;
65 
66 	rq = engine->heartbeat.systole;
67 	if (rq && i915_request_completed(rq)) {
68 		i915_request_put(rq);
69 		engine->heartbeat.systole = NULL;
70 	}
71 
72 	if (!intel_engine_pm_get_if_awake(engine))
73 		return;
74 
75 	if (intel_gt_is_wedged(engine->gt))
76 		goto out;
77 
78 	if (engine->heartbeat.systole) {
79 		if (engine->schedule &&
80 		    rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
81 			/*
82 			 * Gradually raise the priority of the heartbeat to
83 			 * give high priority work [which presumably desires
84 			 * low latency and no jitter] the chance to naturally
85 			 * complete before being preempted.
86 			 */
87 			attr.priority = I915_PRIORITY_MASK;
88 			if (rq->sched.attr.priority >= attr.priority)
89 				attr.priority |= I915_USER_PRIORITY(I915_PRIORITY_HEARTBEAT);
90 			if (rq->sched.attr.priority >= attr.priority)
91 				attr.priority = I915_PRIORITY_BARRIER;
92 
93 			local_bh_disable();
94 			engine->schedule(rq, &attr);
95 			local_bh_enable();
96 		} else {
97 			if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
98 				show_heartbeat(rq, engine);
99 
100 			intel_gt_handle_error(engine->gt, engine->mask,
101 					      I915_ERROR_CAPTURE,
102 					      "stopped heartbeat on %s",
103 					      engine->name);
104 		}
105 		goto out;
106 	}
107 
108 	if (engine->wakeref_serial == engine->serial)
109 		goto out;
110 
111 	mutex_lock(&ce->timeline->mutex);
112 
113 	intel_context_enter(ce);
114 	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
115 	intel_context_exit(ce);
116 	if (IS_ERR(rq))
117 		goto unlock;
118 
119 	idle_pulse(engine, rq);
120 	if (i915_modparams.enable_hangcheck)
121 		engine->heartbeat.systole = i915_request_get(rq);
122 
123 	__i915_request_commit(rq);
124 	__i915_request_queue(rq, &attr);
125 
126 unlock:
127 	mutex_unlock(&ce->timeline->mutex);
128 out:
129 	if (!next_heartbeat(engine))
130 		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
131 	intel_engine_pm_put(engine);
132 }
133 
134 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
135 {
136 	if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL))
137 		return;
138 
139 	next_heartbeat(engine);
140 }
141 
142 void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
143 {
144 	if (cancel_delayed_work(&engine->heartbeat.work))
145 		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
146 }
147 
148 void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
149 {
150 	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
151 }
152 
153 int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
154 			       unsigned long delay)
155 {
156 	int err;
157 
158 	/* Send one last pulse before to cleanup persistent hogs */
159 	if (!delay && IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) {
160 		err = intel_engine_pulse(engine);
161 		if (err)
162 			return err;
163 	}
164 
165 	WRITE_ONCE(engine->props.heartbeat_interval_ms, delay);
166 
167 	if (intel_engine_pm_get_if_awake(engine)) {
168 		if (delay)
169 			intel_engine_unpark_heartbeat(engine);
170 		else
171 			intel_engine_park_heartbeat(engine);
172 		intel_engine_pm_put(engine);
173 	}
174 
175 	return 0;
176 }
177 
178 int intel_engine_pulse(struct intel_engine_cs *engine)
179 {
180 	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
181 	struct intel_context *ce = engine->kernel_context;
182 	struct i915_request *rq;
183 	int err;
184 
185 	if (!intel_engine_has_preemption(engine))
186 		return -ENODEV;
187 
188 	if (!intel_engine_pm_get_if_awake(engine))
189 		return 0;
190 
191 	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
192 		err = -EINTR;
193 		goto out_rpm;
194 	}
195 
196 	intel_context_enter(ce);
197 	rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN);
198 	intel_context_exit(ce);
199 	if (IS_ERR(rq)) {
200 		err = PTR_ERR(rq);
201 		goto out_unlock;
202 	}
203 
204 	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
205 	idle_pulse(engine, rq);
206 
207 	__i915_request_commit(rq);
208 	__i915_request_queue(rq, &attr);
209 	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
210 	err = 0;
211 
212 out_unlock:
213 	mutex_unlock(&ce->timeline->mutex);
214 out_rpm:
215 	intel_engine_pm_put(engine);
216 	return err;
217 }
218 
219 int intel_engine_flush_barriers(struct intel_engine_cs *engine)
220 {
221 	struct i915_request *rq;
222 	int err = 0;
223 
224 	if (llist_empty(&engine->barrier_tasks))
225 		return 0;
226 
227 	if (!intel_engine_pm_get_if_awake(engine))
228 		return 0;
229 
230 	rq = i915_request_create(engine->kernel_context);
231 	if (IS_ERR(rq)) {
232 		err = PTR_ERR(rq);
233 		goto out_rpm;
234 	}
235 
236 	idle_pulse(engine, rq);
237 	i915_request_add(rq);
238 
239 out_rpm:
240 	intel_engine_pm_put(engine);
241 	return err;
242 }
243 
244 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
245 #include "selftest_engine_heartbeat.c"
246 #endif
247