1 /* 2 * SPDX-License-Identifier: MIT 3 * 4 * Copyright © 2019 Intel Corporation 5 */ 6 7 #include "i915_request.h" 8 9 #include "intel_context.h" 10 #include "intel_engine_heartbeat.h" 11 #include "intel_engine_pm.h" 12 #include "intel_engine.h" 13 #include "intel_gt.h" 14 #include "intel_reset.h" 15 16 /* 17 * While the engine is active, we send a periodic pulse along the engine 18 * to check on its health and to flush any idle-barriers. If that request 19 * is stuck, and we fail to preempt it, we declare the engine hung and 20 * issue a reset -- in the hope that restores progress. 21 */ 22 23 static bool next_heartbeat(struct intel_engine_cs *engine) 24 { 25 long delay; 26 27 delay = READ_ONCE(engine->props.heartbeat_interval_ms); 28 if (!delay) 29 return false; 30 31 delay = msecs_to_jiffies_timeout(delay); 32 if (delay >= HZ) 33 delay = round_jiffies_up_relative(delay); 34 mod_delayed_work(system_wq, &engine->heartbeat.work, delay); 35 36 return true; 37 } 38 39 static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) 40 { 41 engine->wakeref_serial = READ_ONCE(engine->serial) + 1; 42 i915_request_add_active_barriers(rq); 43 } 44 45 static void show_heartbeat(const struct i915_request *rq, 46 struct intel_engine_cs *engine) 47 { 48 struct drm_printer p = drm_debug_printer("heartbeat"); 49 50 intel_engine_dump(engine, &p, 51 "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n", 52 engine->name, 53 rq->fence.context, 54 rq->fence.seqno, 55 rq->sched.attr.priority); 56 } 57 58 static void heartbeat(struct work_struct *wrk) 59 { 60 struct i915_sched_attr attr = { 61 .priority = I915_USER_PRIORITY(I915_PRIORITY_MIN), 62 }; 63 struct intel_engine_cs *engine = 64 container_of(wrk, typeof(*engine), heartbeat.work.work); 65 struct intel_context *ce = engine->kernel_context; 66 struct i915_request *rq; 67 68 /* Just in case everything has gone horribly wrong, give it a kick */ 69 intel_engine_flush_submission(engine); 70 71 rq = engine->heartbeat.systole; 72 if (rq && i915_request_completed(rq)) { 73 i915_request_put(rq); 74 engine->heartbeat.systole = NULL; 75 } 76 77 if (!intel_engine_pm_get_if_awake(engine)) 78 return; 79 80 if (intel_gt_is_wedged(engine->gt)) 81 goto out; 82 83 if (engine->heartbeat.systole) { 84 if (!i915_sw_fence_signaled(&rq->submit)) { 85 /* 86 * Not yet submitted, system is stalled. 87 * 88 * This more often happens for ring submission, 89 * where all contexts are funnelled into a common 90 * ringbuffer. If one context is blocked on an 91 * external fence, not only is it not submitted, 92 * but all other contexts, including the kernel 93 * context are stuck waiting for the signal. 94 */ 95 } else if (engine->schedule && 96 rq->sched.attr.priority < I915_PRIORITY_BARRIER) { 97 /* 98 * Gradually raise the priority of the heartbeat to 99 * give high priority work [which presumably desires 100 * low latency and no jitter] the chance to naturally 101 * complete before being preempted. 102 */ 103 attr.priority = I915_PRIORITY_MASK; 104 if (rq->sched.attr.priority >= attr.priority) 105 attr.priority |= I915_USER_PRIORITY(I915_PRIORITY_HEARTBEAT); 106 if (rq->sched.attr.priority >= attr.priority) 107 attr.priority = I915_PRIORITY_BARRIER; 108 109 local_bh_disable(); 110 engine->schedule(rq, &attr); 111 local_bh_enable(); 112 } else { 113 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 114 show_heartbeat(rq, engine); 115 116 intel_gt_handle_error(engine->gt, engine->mask, 117 I915_ERROR_CAPTURE, 118 "stopped heartbeat on %s", 119 engine->name); 120 } 121 goto out; 122 } 123 124 if (engine->wakeref_serial == engine->serial) 125 goto out; 126 127 mutex_lock(&ce->timeline->mutex); 128 129 intel_context_enter(ce); 130 rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); 131 intel_context_exit(ce); 132 if (IS_ERR(rq)) 133 goto unlock; 134 135 idle_pulse(engine, rq); 136 if (i915_modparams.enable_hangcheck) 137 engine->heartbeat.systole = i915_request_get(rq); 138 139 __i915_request_commit(rq); 140 __i915_request_queue(rq, &attr); 141 142 unlock: 143 mutex_unlock(&ce->timeline->mutex); 144 out: 145 if (!next_heartbeat(engine)) 146 i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); 147 intel_engine_pm_put(engine); 148 } 149 150 void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine) 151 { 152 if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) 153 return; 154 155 next_heartbeat(engine); 156 } 157 158 void intel_engine_park_heartbeat(struct intel_engine_cs *engine) 159 { 160 if (cancel_delayed_work(&engine->heartbeat.work)) 161 i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); 162 } 163 164 void intel_engine_init_heartbeat(struct intel_engine_cs *engine) 165 { 166 INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); 167 } 168 169 int intel_engine_set_heartbeat(struct intel_engine_cs *engine, 170 unsigned long delay) 171 { 172 int err; 173 174 /* Send one last pulse before to cleanup persistent hogs */ 175 if (!delay && IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) { 176 err = intel_engine_pulse(engine); 177 if (err) 178 return err; 179 } 180 181 WRITE_ONCE(engine->props.heartbeat_interval_ms, delay); 182 183 if (intel_engine_pm_get_if_awake(engine)) { 184 if (delay) 185 intel_engine_unpark_heartbeat(engine); 186 else 187 intel_engine_park_heartbeat(engine); 188 intel_engine_pm_put(engine); 189 } 190 191 return 0; 192 } 193 194 int intel_engine_pulse(struct intel_engine_cs *engine) 195 { 196 struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; 197 struct intel_context *ce = engine->kernel_context; 198 struct i915_request *rq; 199 int err; 200 201 if (!intel_engine_has_preemption(engine)) 202 return -ENODEV; 203 204 if (!intel_engine_pm_get_if_awake(engine)) 205 return 0; 206 207 if (mutex_lock_interruptible(&ce->timeline->mutex)) { 208 err = -EINTR; 209 goto out_rpm; 210 } 211 212 intel_context_enter(ce); 213 rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); 214 intel_context_exit(ce); 215 if (IS_ERR(rq)) { 216 err = PTR_ERR(rq); 217 goto out_unlock; 218 } 219 220 __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); 221 idle_pulse(engine, rq); 222 223 __i915_request_commit(rq); 224 __i915_request_queue(rq, &attr); 225 GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); 226 err = 0; 227 228 out_unlock: 229 mutex_unlock(&ce->timeline->mutex); 230 out_rpm: 231 intel_engine_pm_put(engine); 232 return err; 233 } 234 235 int intel_engine_flush_barriers(struct intel_engine_cs *engine) 236 { 237 struct i915_request *rq; 238 int err = 0; 239 240 if (llist_empty(&engine->barrier_tasks)) 241 return 0; 242 243 if (!intel_engine_pm_get_if_awake(engine)) 244 return 0; 245 246 rq = i915_request_create(engine->kernel_context); 247 if (IS_ERR(rq)) { 248 err = PTR_ERR(rq); 249 goto out_rpm; 250 } 251 252 idle_pulse(engine, rq); 253 i915_request_add(rq); 254 255 out_rpm: 256 intel_engine_pm_put(engine); 257 return err; 258 } 259 260 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 261 #include "selftest_engine_heartbeat.c" 262 #endif 263