// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */

/*
 * next_heartbeat - (re)arm the heartbeat worker for @engine
 *
 * Reads the current heartbeat interval and queues heartbeat.work on
 * system_highpri_wq after that interval. If the outstanding systole request
 * has already been raised to at least I915_PRIORITY_BARRIER and the interval
 * is still the engine default, the period is stretched to at least twice the
 * preemption timeout (after clamping).
 *
 * Returns false, without queueing anything, when the resulting interval is
 * zero; returns true once the work has been (re)scheduled.
 */
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);

	rq = engine->heartbeat.systole;

	/*
	 * FIXME: The final period extension is disabled if the period has been
	 * modified from the default. This is to prevent issues with certain
	 * selftests which override the value and expect specific behaviour.
	 * Once the selftests have been updated to either cope with variable
	 * heartbeat periods (or to override the pre-emption timeout as well,
	 * or just to add a selftest specific override of the extension), the
	 * generic override can be removed.
	 */
	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
	    delay == engine->defaults.heartbeat_interval_ms) {
		long longer;

		/*
		 * The final try is at the highest priority possible. Up until now
		 * a pre-emption might not even have been attempted. So make sure
		 * this last attempt allows enough time for a pre-emption to occur.
		 */
		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
		if (longer > delay)
			delay = longer;
	}

	/* A zero interval means the heartbeat is switched off: do not rearm. */
	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	/* Only delays of at least HZ jiffies are rounded. */
	if (delay >= HZ)
		delay = round_jiffies_up_relative(delay);
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

/*
 * heartbeat_create - build a request on @ce for use as a pulse
 *
 * Wraps __i915_request_create() in an intel_context_enter() /
 * intel_context_exit() pair and returns its result unchanged. Callers in
 * this file test the result with IS_ERR().
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

/*
 * idle_pulse - attach pulse bookkeeping to @rq before it is committed
 *
 * Records engine->serial + 1 in engine->wakeref_serial, moves the engine's
 * active barriers onto @rq and, if no systole is currently tracked and the
 * engine has a heartbeat, takes a reference on @rq and remembers it as the
 * new heartbeat.systole.
 */
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

/*
 * heartbeat_commit - finalise and queue a pulse request
 *
 * Runs idle_pulse() for the request's engine, then commits @rq and queues it
 * with the scheduling attributes in @attr.
 */
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

/*
 * show_heartbeat - dump engine state through the driver debug printer
 *
 * @rq may be NULL; in that case only the engine name is included in the
 * header line, otherwise the request's fence context, seqno and priority
 * are printed as well.
 */
static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p =
		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

/*
 * reset_engine - report a stopped heartbeat on @engine
 *
 * Optionally dumps the engine state (CONFIG_DRM_I915_DEBUG_GEM builds only),
 * asks the GuC backend to locate the hung context when the engine uses GuC
 * submission, and then hands the engine mask to intel_gt_handle_error() with
 * I915_ERROR_CAPTURE set. @rq is only used for the debug dump and may be NULL.
 */
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

/*
 * heartbeat - delayed-work handler driving the per-engine heartbeat
 *
 * On each invocation the worker:
 *  - flushes submission and retires a completed systole request;
 *  - bails out immediately if no engine-pm wakeref can be taken;
 *  - skips straight to rearming if the GT is wedged;
 *  - reports an error if the scheduler engine has been disabled;
 *  - if a systole is still outstanding and a full interval has elapsed since
 *    it was last stamped, either leaves it alone (not yet submitted), bumps
 *    its priority one step, or calls reset_engine();
 *  - otherwise, if the engine serial moved since the last pulse, tries to
 *    emit a new minimum-priority pulse on the kernel context.
 *
 * Every path that took the wakeref exits via "out", which rearms the worker
 * (or releases the systole if rearming is not wanted) and drops the wakeref.
 */
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	/*
	 * Retire a finished pulse. Note that rq keeps its value below; it is
	 * only dereferenced again inside the "systole still set" branch.
	 */
	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	/* No wakeref taken here, so return directly rather than via "out". */
	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = I915_PRIORITY_NORMAL;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			/*
			 * Submitted, and either there is no schedule() hook
			 * or the pulse is already at barrier priority.
			 */
			reset_engine(engine, rq);
		}

		/* Restart the interval measured by the time_after() check. */
		rq->emitted_jiffies = jiffies;
		goto out;
	}

	/* Serial unchanged since the last pulse was prepared: skip this beat. */
	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	/* Allocation failure is tolerated; the worker is simply rearmed. */
	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	/*
	 * With hangcheck disabled, or when next_heartbeat() declines to rearm,
	 * let go of the tracked systole request.
	 */
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

/*
 * intel_engine_unpark_heartbeat - start the heartbeat timer for @engine
 *
 * Does nothing when CONFIG_DRM_I915_HEARTBEAT_INTERVAL is zero; otherwise
 * arms the worker through next_heartbeat().
 */
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

/*
 * intel_engine_park_heartbeat - stop the heartbeat timer for @engine
 *
 * Cancels the delayed work and, only when cancel_delayed_work() returns
 * nonzero, releases the tracked systole request.
 */
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

/*
 * intel_gt_unpark_heartbeats - unpark the heartbeat of every awake engine
 *
 * Engines for which intel_engine_pm_is_awake() is false are skipped.
 */
void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

/*
 * intel_gt_park_heartbeats - park the heartbeat of every engine on @gt
 */
void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

/*
 * intel_engine_init_heartbeat - set up the heartbeat delayed work
 *
 * Binds heartbeat() as the handler of engine->heartbeat.work; the work is
 * not queued here.
 */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

/*
 * __intel_engine_pulse - emit a barrier-priority pulse on the kernel context
 *
 * The caller must hold the kernel context's timeline mutex (asserted via
 * lockdep); debug builds additionally check that the engine supports
 * preemption and is awake.
 *
 * The request is flagged I915_FENCE_FLAG_SENTINEL before being committed at
 * I915_PRIORITY_BARRIER, after which the heartbeat worker is rearmed.
 *
 * Returns 0 on success or the PTR_ERR() of the failed request creation.
 */
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	/* Ensure the forced pulse gets a full period to execute */
	next_heartbeat(engine);

	return 0;
}

/*
 * set_heartbeat - swap in a new heartbeat interval
 *
 * Stores @delay as the engine's heartbeat interval, then unparks the
 * heartbeat for a nonzero @delay or parks it for zero.
 *
 * Returns the interval that was previously set.
 */
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

/*
 * intel_engine_set_heartbeat - change the heartbeat interval of @engine
 * @engine: engine to update
 * @delay: new interval in milliseconds, 0 to disable the heartbeat
 *
 * Disabling the heartbeat is refused with -ENODEV on engines without
 * preempt-reset support. A notice is logged when a non-default interval is
 * shorter than twice the preemption timeout.
 *
 * If the interval actually changes and the engine supports preemption, a
 * pulse is sent straight away; should that fail, the previous interval is
 * restored.
 *
 * Returns 0 on success, -ENODEV as described above, the error from
 * mutex_lock_interruptible(), or the error from __intel_engine_pulse().
 */
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
	if (delay != engine->defaults.heartbeat_interval_ms &&
	    delay < 2 * engine->props.preempt_timeout_ms) {
		if (intel_engine_uses_guc(engine))
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
				   engine->name);
		else
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
				   engine->name);
	}

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			/* Roll back to the previous interval on failure. */
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

/*
 * intel_engine_pulse - force an immediate barrier-priority pulse
 *
 * Returns -ENODEV if the engine has no preemption support, 0 without doing
 * anything if the engine is not awake, -EINTR if taking the kernel timeline
 * mutex was interrupted, and otherwise the result of __intel_engine_pulse().
 * Submission is flushed before the wakeref is dropped, whether or not the
 * pulse was emitted.
 */
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

/*
 * intel_engine_flush_barriers - emit a request to carry pending barriers
 *
 * Returns 0 immediately when engine->barrier_tasks is empty or the engine is
 * not awake. Otherwise a minimum-priority request is created on the kernel
 * context (GFP_KERNEL allocation) and committed through heartbeat_commit().
 *
 * Returns 0 on success, -EINTR if the timeline mutex wait was interrupted,
 * or the PTR_ERR() of a failed request creation.
 */
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif