// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include <drm/drm_print.h>

#include "i915_drv.h"
#include "i915_jiffies.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */

/*
 * Arm the delayed worker for the next heartbeat tick on @engine.
 *
 * Returns false without scheduling anything if the heartbeat interval is 0
 * (heartbeats disabled); otherwise (re)queues engine->heartbeat.work on the
 * high-priority system workqueue and returns true.
 */
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);

	rq = engine->heartbeat.systole;

	/*
	 * FIXME: The final period extension is disabled if the period has been
	 * modified from the default. This is to prevent issues with certain
	 * selftests which override the value and expect specific behaviour.
	 * Once the selftests have been updated to either cope with variable
	 * heartbeat periods (or to override the pre-emption timeout as well,
	 * or just to add a selftest specific override of the extension), the
	 * generic override can be removed.
	 */
	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
	    delay == engine->defaults.heartbeat_interval_ms) {
		long longer;

		/*
		 * The final try is at the highest priority possible. Up until now
		 * a pre-emption might not even have been attempted. So make sure
		 * this last attempt allows enough time for a pre-emption to occur.
		 */
		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
		if (longer > delay)
			delay = longer;
	}

	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	/*
	 * For delays of a second or more, round up to a whole-second expiry
	 * so the timer can be coalesced with other periodic wakeups.
	 */
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

/*
 * Build a new kernel-context request for a heartbeat pulse.
 *
 * All callers in this file hold ce->timeline->mutex around this
 * (__intel_engine_pulse asserts it); enter/exit keep the context active
 * for the duration of request construction.
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

/*
 * Attach the engine's pending idle-barriers to @rq and, when heartbeats
 * are enabled, adopt @rq as the tracked "systole" request that the
 * heartbeat worker monitors for progress.
 *
 * NOTE(review): wakeref_serial is set to serial+1, i.e. the value
 * engine->serial is expected to hold once this request is accounted —
 * confirm the exact serial contract against the submission code.
 */
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

/* Finalise a pulse request: attach barriers, then commit and queue it at @attr. */
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

/* Dump the engine state to the drm debug log when the heartbeat has stalled. */
static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p =
		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

/*
 * Declare the engine hung: dump debug state (debug builds only) and raise
 * a GT error with capture, which leads to an engine or full-GT reset.
 */
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

/*
 * Heartbeat worker, run every heartbeat_interval_ms while the engine is awake.
 *
 * Each tick does one of:
 *  - retire the previous tick's pulse (the "systole") if it completed;
 *  - if the pulse is still pending, escalate its priority step by step
 *    (MIN -> NORMAL -> HEARTBEAT -> BARRIER) and, once even BARRIER
 *    priority fails to make it run, reset the engine;
 *  - otherwise emit a fresh minimum-priority pulse, provided there has
 *    been engine activity since the last one.
 */
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	/*
	 * Claim the systole: release it if it already completed, otherwise
	 * put it back so the progress checks below can inspect it. After
	 * this, heartbeat.systole != NULL implies rq == systole and is
	 * still a valid reference.
	 */
	rq = xchg(&engine->heartbeat.systole, NULL);
	if (rq) {
		if (i915_request_completed(rq))
			i915_request_put(rq);
		else
			engine->heartbeat.systole = rq;
	}

	/* Engine parked; the worker is rearmed on unpark. */
	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = I915_PRIORITY_NORMAL;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			/* Already at BARRIER priority and still stuck. */
			reset_engine(engine, rq);
		}

		/* Restart the progress timer for the rate-limit check above. */
		rq->emitted_jiffies = jiffies;
		goto out;
	}

	/*
	 * No activity since the last pulse (see idle_pulse); nothing to
	 * flush, so skip emitting a new request this tick.
	 */
	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		/*
		 * xchg returns the serial recorded by the previous blocked
		 * tick; a match means two consecutive ticks failed with no
		 * progress in between -> declare the engine hung.
		 */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	/* Atomic context (workqueue + timeline mutex held): no blocking allocs. */
	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	/*
	 * If hangcheck was disabled, or heartbeats are now off (interval 0),
	 * stop tracking the systole and drop our reference to it.
	 */
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) {
		rq = xchg(&engine->heartbeat.systole, NULL);
		if (rq)
			i915_request_put(rq);
	}
	intel_engine_pm_put(engine);
}

/* Start the heartbeat worker when the engine is unparked (powered up). */
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	/* Compile-time disabled when the Kconfig default interval is 0. */
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

/* Stop the heartbeat worker and drop the tracked systole when parking. */
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work)) {
		struct i915_request *rq;

		rq = xchg(&engine->heartbeat.systole, NULL);
		if (rq)
			i915_request_put(rq);
	}
}

/* Restart heartbeats on every currently-awake engine of @gt. */
void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

/* Stop heartbeats on every engine of @gt. */
void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

/* One-time initialisation of the per-engine heartbeat worker. */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

/*
 * Send one immediate, maximum-priority (BARRIER) pulse along @engine.
 *
 * Caller must hold the kernel timeline mutex and an awake engine wakeref,
 * and the engine must support preemption. Returns 0 on success or the
 * request-creation error.
 */
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/*
	 * NOTE(review): marks the pulse as a sentinel request — presumably
	 * to force a submission/preemption boundary; confirm the exact flag
	 * semantics against the submission backend.
	 */
	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	/* Ensure the forced pulse gets a full period to execute */
	next_heartbeat(engine);

	return 0;
}

/*
 * Install @delay as the heartbeat interval, parking or unparking the
 * worker to match (0 disables). Returns the previous interval so the
 * caller can roll back.
 */
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

/*
 * Runtime-tunable entry point for changing the heartbeat interval.
 *
 * Disabling (delay == 0) is refused with -ENODEV unless the engine can
 * reset on a failed preemption. On a change, a BARRIER pulse probes the
 * currently-running workload; if the pulse cannot be created the previous
 * interval is restored. Returns 0 on success or a negative errno.
 */
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
	if (delay != engine->defaults.heartbeat_interval_ms &&
	    delay < 2 * engine->props.preempt_timeout_ms) {
		if (intel_engine_uses_guc(engine))
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
				   engine->name);
		else
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
				   engine->name);
	}

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Send a single on-demand pulse along @engine.
 *
 * Returns -ENODEV if the engine cannot preempt, 0 if the engine is parked
 * (nothing running, so nothing to pulse), -EINTR if interrupted while
 * taking the timeline mutex, otherwise the result of the pulse.
 */
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Flush the engine's pending idle-barriers by emitting a minimum-priority
 * kernel-context request that carries them.
 *
 * Cheap no-op (returns 0) if there are no barriers or the engine is
 * parked; -EINTR if interrupted while locking; otherwise 0 or the
 * request-creation error.
 */
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	/* May sleep here, unlike the GFP_NOWAIT worker path. */
	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif