// SPDX-License-Identifier: MIT
/*
 * Copyright © 2019 Intel Corporation
 */

#include <drm/drm_print.h>

#include "i915_drv.h"
#include "i915_request.h"

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_engine.h"
#include "intel_gt.h"
#include "intel_reset.h"

/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */

/*
 * Schedule the next heartbeat tick on the high-priority workqueue.
 *
 * Returns false (and queues nothing) when the effective interval is 0,
 * i.e. the heartbeat is disabled; true once the delayed work is armed.
 *
 * If the currently tracked heartbeat request (systole) has already been
 * boosted to I915_PRIORITY_BARRIER and the interval is still at its
 * default, the final period is stretched to at least twice the
 * preemption timeout so that a preemption attempt has time to complete
 * before we give up and reset.
 */
static bool next_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	long delay;

	delay = READ_ONCE(engine->props.heartbeat_interval_ms);

	rq = engine->heartbeat.systole;

	/*
	 * FIXME: The final period extension is disabled if the period has been
	 * modified from the default. This is to prevent issues with certain
	 * selftests which override the value and expect specific behaviour.
	 * Once the selftests have been updated to either cope with variable
	 * heartbeat periods (or to override the pre-emption timeout as well,
	 * or just to add a selftest specific override of the extension), the
	 * generic override can be removed.
	 */
	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
	    delay == engine->defaults.heartbeat_interval_ms) {
		long longer;

		/*
		 * The final try is at the highest priority possible. Up until now
		 * a pre-emption might not even have been attempted. So make sure
		 * this last attempt allows enough time for a pre-emption to occur.
		 */
		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
		if (longer > delay)
			delay = longer;
	}

	if (!delay)
		return false;

	delay = msecs_to_jiffies_timeout(delay);
	if (delay >= HZ)
		/* Batch long timers onto a jiffy boundary to reduce wakeups */
		delay = round_jiffies_up_relative(delay);
	/* +1 jiffy guards against firing a tick early after rounding */
	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);

	return true;
}

/*
 * Create a request on the engine's kernel context, bracketed by
 * intel_context_enter/exit so the context is accounted as busy while
 * the request is being constructed. Returns an ERR_PTR on failure
 * (callers check with IS_ERR). Caller must hold ce->timeline->mutex
 * (all callers here take it before calling).
 */
static struct i915_request *
heartbeat_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_request *rq;

	intel_context_enter(ce);
	rq = __i915_request_create(ce, gfp);
	intel_context_exit(ce);

	return rq;
}

/*
 * Attach any pending idle-barriers to @rq and, if no heartbeat request
 * is currently being tracked and the engine has an active heartbeat,
 * take a reference on @rq as the new systole.
 *
 * wakeref_serial is bumped past the current engine serial so the next
 * heartbeat tick will see the engine as "idle" unless new work arrives.
 */
static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
{
	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
	i915_request_add_active_barriers(rq);
	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
		engine->heartbeat.systole = i915_request_get(rq);
}

/*
 * Finalise and submit a heartbeat request: pick up idle barriers,
 * commit the request, and queue it with the given scheduling attributes.
 */
static void heartbeat_commit(struct i915_request *rq,
			     const struct i915_sched_attr *attr)
{
	idle_pulse(rq->engine, rq);

	__i915_request_commit(rq);
	__i915_request_queue(rq, attr);
}

/*
 * Dump engine state to the DRM debug log when a heartbeat has stalled.
 * @rq may be NULL (no heartbeat request was ever submitted).
 */
static void show_heartbeat(const struct i915_request *rq,
			   struct intel_engine_cs *engine)
{
	struct drm_printer p =
		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");

	if (!rq) {
		intel_engine_dump(engine, &p,
				  "%s heartbeat not ticking\n",
				  engine->name);
	} else {
		intel_engine_dump(engine, &p,
				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
				  engine->name,
				  rq->fence.context,
				  rq->fence.seqno,
				  rq->sched.attr.priority);
	}
}

/*
 * Declare the engine hung: optionally dump debug state, let the GuC
 * identify the guilty context when GuC submission is in use, then
 * raise an engine reset with error capture.
 */
static void
reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
{
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		show_heartbeat(rq, engine);

	if (intel_engine_uses_guc(engine))
		/*
		 * GuC itself is toast or GuC's hang detection
		 * is disabled. Either way, need to find the
		 * hang culprit manually.
		 */
		intel_guc_find_hung_context(engine);

	intel_gt_handle_error(engine->gt, engine->mask,
			      I915_ERROR_CAPTURE,
			      "stopped heartbeat on %s",
			      engine->name);
}

/*
 * Periodic heartbeat worker.
 *
 * Each tick either (a) observes the previous heartbeat request still
 * outstanding and escalates its priority step by step
 * (MIN -> NORMAL -> HEARTBEAT -> BARRIER), resetting the engine once
 * even a barrier-priority pulse makes no progress; or (b) submits a
 * fresh minimum-priority heartbeat request if the engine has been busy
 * since the last tick. The engine wakeref taken via
 * intel_engine_pm_get_if_awake() is released on every exit path
 * through "out".
 */
static void heartbeat(struct work_struct *wrk)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	/* Retire a completed systole before deciding what this tick does */
	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	if (engine->heartbeat.systole) {
		/* Here rq == engine->heartbeat.systole and is not completed */
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 */
			attr.priority = I915_PRIORITY_NORMAL;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			/* Already at barrier priority and still stuck: reset */
			reset_engine(engine, rq);
		}

		/* Restart the stall clock for the next escalation step */
		rq->emitted_jiffies = jiffies;
		goto out;
	}

	/* Engine has seen no new work since the last pulse: stay quiet */
	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/* Unable to lock the kernel timeline, is the engine stuck? */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			/* Same serial seen blocked twice in a row -> error */
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	/*
	 * If hangcheck is disabled or no further tick could be armed,
	 * drop our reference on the tracked systole.
	 */
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}

/* Re-arm the heartbeat timer when the engine is powered up. */
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		return;

	next_heartbeat(engine);
}

/* Stop the heartbeat timer and drop the tracked request on power-down. */
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	if (cancel_delayed_work(&engine->heartbeat.work))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
}

/* Restart heartbeats on every currently-awake engine of @gt. */
void intel_gt_unpark_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		if (intel_engine_pm_is_awake(engine))
			intel_engine_unpark_heartbeat(engine);
}

/* Stop heartbeats on all engines of @gt. */
void intel_gt_park_heartbeats(struct intel_gt *gt)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id)
		intel_engine_park_heartbeat(engine);
}

/* One-time setup of the per-engine heartbeat delayed work. */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}

/*
 * Submit an immediate barrier-priority pulse on the kernel context.
 *
 * Caller must hold ce->timeline->mutex and an awake engine wakeref;
 * the engine must support preemption. Returns 0 on success or the
 * PTR_ERR from request creation.
 */
static int __intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;

	lockdep_assert_held(&ce->timeline->mutex);
	GEM_BUG_ON(!intel_engine_has_preemption(engine));
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);

	heartbeat_commit(rq, &attr);
	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);

	/* Ensure the forced pulse gets a full period to execute */
	next_heartbeat(engine);

	return 0;
}

/*
 * Atomically swap in a new heartbeat interval, then start or stop the
 * heartbeat worker to match (0 disables). Returns the previous value
 * so the caller can roll back.
 */
static unsigned long set_heartbeat(struct intel_engine_cs *engine,
				   unsigned long delay)
{
	unsigned long old;

	old = xchg(&engine->props.heartbeat_interval_ms, delay);
	if (delay)
		intel_engine_unpark_heartbeat(engine);
	else
		intel_engine_park_heartbeat(engine);

	return old;
}

/*
 * Set the heartbeat interval (ms) for @engine; 0 disables heartbeats,
 * which is only permitted when the engine supports preempt-to-reset.
 *
 * On change, a verification pulse is sent; if that fails the previous
 * interval is restored. Returns 0 on success, -ENODEV when disabling
 * is not supported, or a negative error from locking / the pulse.
 */
int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
			       unsigned long delay)
{
	struct intel_context *ce = engine->kernel_context;
	int err = 0;

	if (!delay && !intel_engine_has_preempt_reset(engine))
		return -ENODEV;

	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
	if (delay != engine->defaults.heartbeat_interval_ms &&
	    delay < 2 * engine->props.preempt_timeout_ms) {
		if (intel_engine_uses_guc(engine))
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
				   engine->name);
		else
			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
				   engine->name);
	}

	intel_engine_pm_get(engine);

	err = mutex_lock_interruptible(&ce->timeline->mutex);
	if (err)
		goto out_rpm;

	if (delay != engine->props.heartbeat_interval_ms) {
		unsigned long saved = set_heartbeat(engine, delay);

		/* recheck current execution */
		if (intel_engine_has_preemption(engine)) {
			err = __intel_engine_pulse(engine);
			if (err)
				set_heartbeat(engine, saved);
		}
	}

	mutex_unlock(&ce->timeline->mutex);

out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Send a one-off high-priority pulse down the engine, e.g. to flush
 * idle barriers. Returns -ENODEV without preemption support, 0 if the
 * engine is already parked (nothing to flush) or on success, -EINTR if
 * interrupted while taking the timeline mutex, or the pulse's error.
 */
int intel_engine_pulse(struct intel_engine_cs *engine)
{
	struct intel_context *ce = engine->kernel_context;
	int err;

	if (!intel_engine_has_preemption(engine))
		return -ENODEV;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	err = -EINTR;
	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = __intel_engine_pulse(engine);
		mutex_unlock(&ce->timeline->mutex);
	}

	intel_engine_flush_submission(engine);
	intel_engine_pm_put(engine);
	return err;
}

/*
 * Flush any pending idle-barrier tasks by submitting a minimum-priority
 * kernel-context request that collects them. No-op (returns 0) when the
 * barrier list is empty or the engine is parked; otherwise returns 0 on
 * success, -EINTR if interrupted, or the request-creation error.
 */
int intel_engine_flush_barriers(struct intel_engine_cs *engine)
{
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	int err;

	if (llist_empty(&engine->barrier_tasks))
		return 0;

	if (!intel_engine_pm_get_if_awake(engine))
		return 0;

	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
		err = -EINTR;
		goto out_rpm;
	}

	rq = heartbeat_create(ce, GFP_KERNEL);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unlock;
	}

	heartbeat_commit(rq, &attr);

	err = 0;
out_unlock:
	mutex_unlock(&ce->timeline->mutex);
out_rpm:
	intel_engine_pm_put(engine);
	return err;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_engine_heartbeat.c"
#endif