// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include <drm/drm_print.h>

#include "i915_drv.h"
#include "i915_jiffies.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */

/*
 * (Re)arm the heartbeat delayed-worker for one more period.
 *
 * Reads the current heartbeat interval; if the in-flight heartbeat
 * (engine->heartbeat.systole) has already been bumped to the final
 * I915_PRIORITY_BARRIER attempt, the next period may be stretched to
 * twice the preemption timeout so the forced preemption has time to
 * complete before we declare a hang.
 *
 * Returns false when heartbeats are disabled (interval of 0), true if
 * the worker was queued.
 */
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);

	rq = engine->heartbeat.systole;

	/*
	 * FIXME: The final period extension is disabled if the period has been
	 * modified from the default. This is to prevent issues with certain
	 * selftests which override the value and expect specific behaviour.
	 * Once the selftests have been updated to either cope with variable
	 * heartbeat periods (or to override the pre-emption timeout as well,
	 * or just to add a selftest specific override of the extension), the
	 * generic override can be removed.
	 */
	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
	    delay == engine->defaults.heartbeat_interval_ms) {
		long longer;

		/*
		 * The final try is at the highest priority possible. Up until now
		 * a pre-emption might not even have been attempted. So make sure
		 * this last attempt allows enough time for a pre-emption to occur.
		 */
		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
		if (longer > delay)
			delay = longer;
	}

	/* A zero interval means the heartbeat is disabled. */
	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

/*
 * Create a new request on the engine's kernel context timeline,
 * bracketed by context enter/exit so the context is accounted as
 * active for the duration of request construction.
 *
 * Caller must hold ce->timeline->mutex (the __i915_request_create
 * internal API is used). Returns an ERR_PTR on allocation failure.
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

/*
 * Attach any pending idle-barriers to the pulse request and bump the
 * engine's wakeref serial so the engine is not considered idle until
 * this request completes. If heartbeats are enabled and none is
 * currently tracked, take a reference and track this request as the
 * in-flight systole.
 */
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

/*
 * Finalise and submit a heartbeat/pulse request with the given
 * scheduling attributes, flushing idle barriers along the way.
 */
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

/*
 * Dump engine state to the DRM debug log when a heartbeat has stalled,
 * including the stuck request's fence id and priority if one exists.
 */
static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p =
		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

/*
 * Declare the engine hung: optionally dump debug state, let the GuC
 * identify the guilty context when GuC submission is in use (its own
 * hang detection did not fire), then trigger error capture and reset
 * via intel_gt_handle_error().
 */
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

/*
 * Periodic heartbeat worker.
 *
 * Each invocation either: retires a completed heartbeat; escalates the
 * priority of a still-pending heartbeat (MIN -> NORMAL -> HEARTBEAT ->
 * BARRIER) to probe whether preemption works, resetting the engine when
 * even the barrier priority fails to make progress; or, when the engine
 * has been busy since the last pulse, emits a fresh minimum-priority
 * heartbeat request. Finally it re-arms itself via next_heartbeat().
 */
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	/* Retire the previous heartbeat if it completed on time. */
	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	/* Engine parked: nothing to monitor until it wakes again. */
	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	/* Previous heartbeat still pending: escalate or reset. */
	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = I915_PRIORITY_NORMAL;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			/* Already at barrier priority and still stuck. */
			reset_engine(engine, rq);
		}

		/* Restart the escalation clock for the next period. */
		rq->emitted_jiffies = jiffies;
		goto out;
	}

	/* Engine idle since last pulse? Then no new heartbeat is needed. */
	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	/* Atomic context here, so no blocking allocation for the pulse. */
	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	/* Drop the tracked systole if hangcheck/heartbeats are disabled. */
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

/* Start the heartbeat ticking when the engine is unparked. */
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

/*
 * Stop the heartbeat when the engine is parked, releasing the tracked
 * systole reference if the worker was still pending.
 */
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

/* Restart heartbeats on every currently-awake engine of the GT. */
void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

/* Stop heartbeats on every engine of the GT. */
void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

/* One-time initialisation of the per-engine heartbeat worker. */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

/*
 * Send a single forced pulse at barrier priority along the engine,
 * marked as a sentinel so nothing may be submitted after it. Requires
 * the kernel-context timeline mutex held, preemption support, and an
 * awake engine. Returns 0 or the request-creation error.
 */
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	/* Ensure the forced pulse gets a full period to execute */
	next_heartbeat(engine);

	return 0;
}

/*
 * Atomically swap in a new heartbeat interval and park/unpark the
 * worker to match (0 disables). Returns the previous interval so the
 * caller can roll back on failure.
 */
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

/*
 * Set the heartbeat interval (ms) for an engine, verifying the new
 * value by issuing an immediate pulse; the old interval is restored if
 * the pulse cannot be emitted. Disabling (delay == 0) is refused unless
 * the engine supports preempt-to-reset. Returns 0 or a negative errno.
 */
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
	if (delay != engine->defaults.heartbeat_interval_ms &&
	    delay < 2 * engine->props.preempt_timeout_ms) {
		if (intel_engine_uses_guc(engine))
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
				   engine->name);
		else
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
				   engine->name);
	}

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Public entry point for sending a single barrier-priority pulse along
 * the engine. A no-op (returns 0) if the engine is parked; -ENODEV if
 * it cannot preempt; -EINTR if interrupted while taking the timeline
 * mutex.
 */
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Flush any pending idle-barrier tasks on the engine by emitting a
 * minimum-priority pulse that carries them. A no-op if there are no
 * barriers or the engine is parked. May sleep (GFP_KERNEL). Returns 0
 * or a negative errno.
 */
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif