// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */

/*
 * Re-arm the heartbeat delayed worker for the engine's current heartbeat
 * interval. Returns false, without queuing the worker, when the interval
 * is 0 (heartbeat disabled); returns true once the worker is scheduled.
 *
 * When the outstanding heartbeat (systole) has already been escalated to
 * I915_PRIORITY_BARRIER, the next period is stretched to at least twice
 * the preemption timeout so a preemption attempt has time to complete
 * before we declare the engine hung (only while the interval is still at
 * its default -- see the FIXME below).
 */
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);

	rq = engine->heartbeat.systole;

	/*
	 * FIXME: The final period extension is disabled if the period has been
	 * modified from the default. This is to prevent issues with certain
	 * selftests which override the value and expect specific behaviour.
	 * Once the selftests have been updated to either cope with variable
	 * heartbeat periods (or to override the pre-emption timeout as well,
	 * or just to add a selftest specific override of the extension), the
	 * generic override can be removed.
	 */
	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
	    delay == engine->defaults.heartbeat_interval_ms) {
		long longer;

		/*
		 * The final try is at the highest priority possible. Up until now
		 * a pre-emption might not even have been attempted. So make sure
		 * this last attempt allows enough time for a pre-emption to occur.
		 */
		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
		if (longer > delay)
			delay = longer;
	}

	if (!delay)
		return false;

	/* Round long intervals to whole-second boundaries to batch wakeups. */
	delay = msecs_to_jiffies_timeout(delay);
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

/*
 * Allocate a new request on the engine's kernel context. Caller must hold
 * ce->timeline->mutex (required by __i915_request_create). The context is
 * entered/exited around the allocation to account the in-flight request.
 * Returns the request or an ERR_PTR on allocation failure.
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

/*
 * Attach any pending idle-barriers to the pulse request and, if heartbeats
 * are enabled and none is outstanding, record it as the current systole
 * (holding a reference) so the worker can monitor its progress.
 *
 * NOTE(review): advancing wakeref_serial past engine->serial appears to
 * stop the pulse itself registering as fresh activity in the worker's
 * serial check below -- confirm against intel_engine_pm.
 */
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

/*
 * Finalise and submit a pulse request with the given scheduling attributes.
 * Caller holds the kernel context timeline mutex (same contract as
 * heartbeat_create()).
 */
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

/*
 * Dump engine state for debugging when a heartbeat has stalled. @rq is the
 * stuck systole, or NULL if no heartbeat request was tracked at all.
 */
static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p = drm_debug_printer("heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

/*
 * Declare the engine hung: dump debug state, let GuC identify the guilty
 * context when applicable, and trigger an engine reset with error capture.
 */
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

/*
 * Periodic heartbeat worker. Each invocation either retires the previous
 * pulse, escalates a still-pending pulse one priority step (NORMAL ->
 * HEARTBEAT -> BARRIER, finally resetting the engine if even BARRIER fails
 * to run), or emits a fresh minimum-priority pulse when the engine has
 * been busy since the last check. Runs from engine->heartbeat.work.
 */
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	/* Retire a completed systole and drop our reference to it. */
	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = I915_PRIORITY_NORMAL;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			/* Even a BARRIER pulse could not run: engine is hung. */
			reset_engine(engine, rq);
		}

		/* Timestamp this escalation step for the next interval check. */
		rq->emitted_jiffies = jiffies;
		goto out;
	}

	/* No change in engine activity since the last pulse? Stay idle. */
	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

/* Start the heartbeat ticking when the engine is powered up. */
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

/* Stop the heartbeat worker and release any outstanding systole. */
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

/* Restart heartbeats on every currently-awake engine of the GT. */
void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

/* Stop heartbeats on all engines of the GT. */
void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

/* One-time initialisation of the engine's heartbeat delayed worker. */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

/*
 * Emit a single maximum-priority (BARRIER) sentinel pulse on the kernel
 * context, forcing a preemption of whatever is currently running. Caller
 * holds the kernel context timeline mutex and an awake engine wakeref.
 * Returns 0 on success or a negative errno from request creation.
 */
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	return 0;
}

/*
 * Atomically swap in a new heartbeat interval, starting or stopping the
 * worker as the new value is non-zero/zero. Returns the previous interval
 * so the caller can roll back on failure.
 */
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

/*
 * Set the heartbeat interval (in ms; 0 disables the heartbeat, which is
 * only permitted when the engine supports preempt-to-reset). On a change,
 * a pulse is sent to re-check the currently running context, and the old
 * interval is restored if that pulse cannot be emitted.
 * Returns 0 on success, -ENODEV, or a mutex/pulse error.
 */
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
	if (delay != engine->defaults.heartbeat_interval_ms &&
	    delay < 2 * engine->props.preempt_timeout_ms) {
		if (intel_engine_uses_guc(engine))
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
				   engine->name);
		else
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
				   engine->name);
	}

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Send a one-off barrier pulse along the engine (e.g. to flush out the
 * currently executing context). Returns 0 if the pulse was sent or the
 * engine was already asleep (nothing to preempt), -ENODEV if the engine
 * cannot preempt, or -EINTR if interrupted while taking the timeline lock.
 */
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Emit a minimum-priority pulse purely to flush the engine's pending
 * idle-barrier tasks. No-op (returns 0) when there are no barriers or the
 * engine is asleep; otherwise returns 0 on success, -EINTR if interrupted,
 * or the error from request creation.
 */
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif