// SPDX-License-Identifier: GPL-2.0
/*
 * Timer events oriented CPU idle governor
 *
 * Copyright (C) 2018 - 2021 Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

/**
 * DOC: teo-description
 *
 * The idea of this governor is based on the observation that on many systems
 * timer events are two or more orders of magnitude more frequent than any
 * other interrupts, so they are likely to be the most significant cause of CPU
 * wakeups from idle states. Moreover, information about what happened in the
 * (relatively recent) past can be used to estimate whether or not the deepest
 * idle state with target residency within the (known) time till the closest
 * timer event, referred to as the sleep length, is likely to be suitable for
 * the upcoming CPU idle period and, if not, then which of the shallower idle
 * states to choose instead of it.
 *
 * Of course, non-timer wakeup sources are more important in some use cases,
 * which can be covered by taking a few of the most recent idle time intervals
 * of the CPU into account. However, even in that context it is not necessary
 * to consider idle duration values greater than the sleep length, because the
 * closest timer will ultimately wake up the CPU anyway unless it is woken up
 * earlier.
 *
 * Thus this governor estimates whether or not the prospective idle duration of
 * a CPU is likely to be significantly shorter than the sleep length and selects
 * an idle state for it accordingly.
 *
 * The computations carried out by this governor are based on using bins whose
 * boundaries are aligned with the target residency parameter values of the CPU
 * idle states provided by the %CPUIdle driver in the ascending order. That is,
 * the first bin spans from 0 up to, but not including, the target residency of
 * the second idle state (idle state 1), the second bin spans from the target
 * residency of idle state 1 up to, but not including, the target residency of
 * idle state 2, the third bin spans from the target residency of idle state 2
 * up to, but not including, the target residency of idle state 3 and so on.
 * The last bin spans from the target residency of the deepest idle state
 * supplied by the driver to infinity.
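 *
 * For illustration, assume a hypothetical driver exposing three idle states
 * with target residencies of 0, 100 us and 2 ms (made-up numbers).  The bins
 * would then be laid out as follows:
 *
 *    bin 0: [0, 100 us)      - corresponds to idle state 0
 *    bin 1: [100 us, 2 ms)   - corresponds to idle state 1
 *    bin 2: [2 ms, infinity) - corresponds to idle state 2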
 *
 * Two metrics called "hits" and "intercepts" are associated with each bin.
 * They are updated every time before selecting an idle state for the given CPU
 * in accordance with what happened last time.
 *
 * The "hits" metric reflects the relative frequency of situations in which the
 * sleep length and the idle duration measured after CPU wakeup fall into the
 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
 * length). In turn, the "intercepts" metric reflects the relative frequency of
 * situations in which the measured idle duration is so much shorter than the
 * sleep length that the bin it falls into corresponds to an idle state
 * shallower than the one whose bin the sleep length falls into (these
 * situations are referred to as "intercepts" below).
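 *
 * For example, with the hypothetical bins above, a sleep length of 1.5 ms
 * falls into bin 1; if the measured idle duration then turns out to be 1.2 ms,
 * it falls into the same bin and the wakeup counts as a "hit" for bin 1,
 * whereas a measured idle duration of 40 us falls into bin 0 and counts as an
 * "intercept" for bin 0.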
 *
 * In order to select an idle state for a CPU, the governor takes the following
 * steps (modulo the possible latency constraint that must be taken into account
 * too):
 *
 * 1. Find the deepest CPU idle state whose target residency does not exceed
 *    the current sleep length (the candidate idle state) and compute 2 sums as
 *    follows:
 *
 *    - The sum of the "hits" and "intercepts" metrics for the candidate state
 *      and all of the deeper idle states (it represents the cases in which the
 *      CPU was idle long enough to avoid being intercepted if the sleep length
 *      had been equal to the current one).
 *
 *    - The sum of the "intercepts" metrics for all of the idle states shallower
 *      than the candidate one (it represents the cases in which the CPU was not
 *      idle long enough to avoid being intercepted if the sleep length had been
 *      equal to the current one).
 *
 * 2. If the second sum is greater than the first one, the CPU is likely to wake
 *    up early, so look for an alternative idle state to select.
 *
 *    - Traverse the idle states shallower than the candidate one in the
 *      descending order.
 *
 *    - For each of them compute the sum of the "intercepts" metrics over all
 *      of the idle states between it and the candidate one (including the
 *      former and excluding the latter).
 *
 *    - If this sum is greater than a half of the second sum computed in step 1
 *      (which means that the target residency of the state in question had not
 *      exceeded the idle duration in over a half of the relevant cases), select
 *      the given idle state instead of the candidate one.
 *
 * 3. By default, select the candidate state.
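 *
 * As a hypothetical illustration of the above, suppose that the candidate
 * state is idle state 2, the sum of the "hits" and "intercepts" metrics for
 * states 2 and deeper is 400 and the sum of the "intercepts" metrics for
 * states 0 and 1 is 600.  Since 600 > 400, a shallower state is considered:
 * if the "intercepts" metric of state 1 alone is, say, 350, which is more
 * than a half of 600, state 1 is selected (provided that it is enabled and,
 * when the tick has been stopped, deep enough); otherwise the traversal
 * continues towards state 0.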
 */

#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/tick.h>

#include "gov.h"

/*
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
 * is used for decreasing metrics on a regular basis.
 */
#define PULSE 1024
#define DECAY_SHIFT 3
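
/*
 * Note: with these values, each metric is scaled by roughly 7/8 on every
 * update, so a bin that receives the PULSE on every update saturates at about
 * PULSE << DECAY_SHIFT, while a bin that is never selected decays towards
 * zero.
 */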

/**
 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 * @intercepts: The "intercepts" metric.
 * @hits: The "hits" metric.
 */
struct teo_bin {
        unsigned int intercepts;
        unsigned int hits;
};

/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @time_span_ns: Time between idle state selection and post-wakeup update.
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 * @state_bins: Idle state data bins for this CPU.
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
 * @tick_hits: Number of "hits" after TICK_NSEC.
 */
struct teo_cpu {
        s64 time_span_ns;
        s64 sleep_length_ns;
        struct teo_bin state_bins[CPUIDLE_STATE_MAX];
        unsigned int total;
        unsigned int tick_hits;
};

static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);

/**
 * teo_update - Update CPU metrics after wakeup.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
        int i, idx_timer = 0, idx_duration = 0;
        s64 target_residency_ns;
        u64 measured_ns;

        if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
                /*
                 * One of the safety nets has triggered or the wakeup was close
                 * enough to the closest timer event expected at the idle state
                 * selection time to be discarded.
                 */
                measured_ns = U64_MAX;
        } else {
                u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;

                /*
                 * The computations below are to determine whether or not the
                 * (saved) time till the next timer event and the measured idle
                 * duration fall into the same "bin", so use last_residency_ns
                 * for that instead of time_span_ns which includes the cpuidle
                 * overhead.
                 */
                measured_ns = dev->last_residency_ns;
                /*
                 * The delay between the wakeup and the first instruction
                 * executed by the CPU is not likely to be worst-case every
                 * time, so take 1/2 of the exit latency as a very rough
                 * approximation of the average of it.
                 */
                if (measured_ns >= lat_ns)
                        measured_ns -= lat_ns / 2;
                else
                        measured_ns /= 2;
        }

        cpu_data->total = 0;

        /*
         * Decay the "hits" and "intercepts" metrics for all of the bins and
         * find the bins that the sleep length and the measured idle duration
         * fall into.
         */
        for (i = 0; i < drv->state_count; i++) {
                struct teo_bin *bin = &cpu_data->state_bins[i];

                bin->hits -= bin->hits >> DECAY_SHIFT;
                bin->intercepts -= bin->intercepts >> DECAY_SHIFT;

                cpu_data->total += bin->hits + bin->intercepts;

                target_residency_ns = drv->states[i].target_residency_ns;

                if (target_residency_ns <= cpu_data->sleep_length_ns) {
                        idx_timer = i;
                        if (target_residency_ns <= measured_ns)
                                idx_duration = i;
                }
        }

        /*
         * If the deepest state's target residency is below the tick length,
         * make a record of it to help teo_select() decide whether or not
         * to stop the tick. This effectively adds an extra hits-only bin
         * beyond the last state-related one.
         */
        if (target_residency_ns < TICK_NSEC) {
                cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;

                cpu_data->total += cpu_data->tick_hits;

                if (TICK_NSEC <= cpu_data->sleep_length_ns) {
                        idx_timer = drv->state_count;
                        if (TICK_NSEC <= measured_ns) {
                                cpu_data->tick_hits += PULSE;
                                goto end;
                        }
                }
        }

        /*
         * If the measured idle duration falls into the same bin as the sleep
         * length, this is a "hit", so update the "hits" metric for that bin.
         * Otherwise, update the "intercepts" metric for the bin fallen into by
         * the measured idle duration.
         */
        if (idx_timer == idx_duration)
                cpu_data->state_bins[idx_timer].hits += PULSE;
        else
                cpu_data->state_bins[idx_duration].intercepts += PULSE;

end:
        cpu_data->total += PULSE;
}

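/*
 * Treat an idle state as usable only if the scheduler tick is still running or
 * the state's target residency is at least one tick period, so that a shallow
 * state is not picked for what may turn out to be a very long idle interval
 * once the tick has been stopped.
 */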
static bool teo_state_ok(int i, struct cpuidle_driver *drv)
{
        return !tick_nohz_tick_stopped() ||
               drv->states[i].target_residency_ns >= TICK_NSEC;
}

/**
 * teo_find_shallower_state - Find shallower idle state matching given duration.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @state_idx: Index of the capping idle state.
 * @duration_ns: Idle duration value to match.
 * @no_poll: Don't consider polling states.
 */
static int teo_find_shallower_state(struct cpuidle_driver *drv,
                                    struct cpuidle_device *dev, int state_idx,
                                    s64 duration_ns, bool no_poll)
{
        int i;

        for (i = state_idx - 1; i >= 0; i--) {
                if (dev->states_usage[i].disable ||
                    (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
                        continue;

                state_idx = i;
                if (drv->states[i].target_residency_ns <= duration_ns)
                        break;
        }
        return state_idx;
}

/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                      bool *stop_tick)
{
        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
        s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
        ktime_t delta_tick = TICK_NSEC / 2;
        unsigned int tick_intercept_sum = 0;
        unsigned int idx_intercept_sum = 0;
        unsigned int intercept_sum = 0;
        unsigned int idx_hit_sum = 0;
        unsigned int hit_sum = 0;
        int constraint_idx = 0;
        int idx0 = 0, idx = -1;
        int prev_intercept_idx;
        s64 duration_ns;
        int i;

        if (dev->last_state_idx >= 0) {
                teo_update(drv, dev);
                dev->last_state_idx = -1;
        }

        cpu_data->time_span_ns = local_clock();
        /*
         * Set the expected sleep length to infinity in case of an early
         * return.
         */
        cpu_data->sleep_length_ns = KTIME_MAX;

        /* Check if there is any choice in the first place. */
        if (drv->state_count < 2) {
                idx = 0;
                goto out_tick;
        }

        if (!dev->states_usage[0].disable)
                idx = 0;

        /* Compute the sums of metrics for early wakeup pattern detection. */
        for (i = 1; i < drv->state_count; i++) {
                struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
                struct cpuidle_state *s = &drv->states[i];

                /*
                 * Update the sums of idle state metrics for all of the states
                 * shallower than the current one.
                 */
                intercept_sum += prev_bin->intercepts;
                hit_sum += prev_bin->hits;

                if (dev->states_usage[i].disable)
                        continue;

                if (idx < 0)
                        idx0 = i; /* first enabled state */

                idx = i;

                if (s->exit_latency_ns <= latency_req)
                        constraint_idx = i;

                /* Save the sums for the current state. */
                idx_intercept_sum = intercept_sum;
                idx_hit_sum = hit_sum;
        }

        /* Avoid unnecessary overhead. */
        if (idx < 0) {
                idx = 0; /* No states enabled, must use 0. */
                goto out_tick;
        }

        if (idx == idx0) {
                /*
                 * Only one idle state is enabled, so use it, but do not
                 * allow the tick to be stopped if it is too shallow.
                 */
                duration_ns = drv->states[idx].target_residency_ns;
                goto end;
        }

        tick_intercept_sum = intercept_sum +
                        cpu_data->state_bins[drv->state_count-1].intercepts;

        /*
         * If the sum of the intercepts metric for all of the idle states
         * shallower than the current candidate one (idx) is greater than the
         * sum of the intercepts and hits metrics for the candidate state and
         * all of the deeper states, a shallower idle state is likely to be a
         * better choice.
         */
        prev_intercept_idx = idx;
        if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
                int first_suitable_idx = idx;

                /*
                 * Look for the deepest idle state whose target residency had
                 * not exceeded the idle duration in over a half of the relevant
                 * cases in the past.
                 *
                 * Take into account the possible duration limitation present
                 * if the tick has already been stopped.
                 */
                intercept_sum = 0;

                for (i = idx - 1; i >= 0; i--) {
                        struct teo_bin *bin = &cpu_data->state_bins[i];

                        intercept_sum += bin->intercepts;

                        if (2 * intercept_sum > idx_intercept_sum) {
                                /*
                                 * Use the current state unless it is too
                                 * shallow or disabled, in which case take the
                                 * first enabled state that is deep enough.
                                 */
                                if (teo_state_ok(i, drv) &&
                                    !dev->states_usage[i].disable)
                                        idx = i;
                                else
                                        idx = first_suitable_idx;

                                break;
                        }

                        if (dev->states_usage[i].disable)
                                continue;

                        if (!teo_state_ok(i, drv)) {
                                /*
                                 * The current state is too shallow, but if an
                                 * alternative candidate state has been found,
                                 * it may still turn out to be a better choice.
                                 */
                                if (first_suitable_idx != idx)
                                        continue;

                                break;
                        }

                        first_suitable_idx = i;
                }
        }
        if (!idx && prev_intercept_idx) {
                /*
                 * We have to query the sleep length here, otherwise we don't
                 * know after wakeup if our guess was correct.
                 */
                duration_ns = tick_nohz_get_sleep_length(&delta_tick);
                cpu_data->sleep_length_ns = duration_ns;
                goto out_tick;
        }

        /*
         * If there is a latency constraint, it may be necessary to select an
         * idle state shallower than the current candidate one.
         */
        if (idx > constraint_idx)
                idx = constraint_idx;

        /*
         * Skip the timers check if state 0 is the current candidate one,
         * because an immediate non-timer wakeup is expected in that case.
         */
        if (!idx)
                goto out_tick;

        /*
         * If state 0 is a polling one, check if the target residency of
         * the current candidate state is low enough and skip the timers
         * check in that case too.
         */
        if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
            drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
                goto out_tick;

        duration_ns = tick_nohz_get_sleep_length(&delta_tick);
        cpu_data->sleep_length_ns = duration_ns;

        /*
         * If the closest expected timer is before the target residency of the
         * candidate state, a shallower one needs to be found.
         */
        if (drv->states[idx].target_residency_ns > duration_ns) {
                i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
                if (teo_state_ok(i, drv))
                        idx = i;
        }

        /*
         * If the selected state's target residency is below the tick length
         * and intercepts occurring before the tick length account for the
         * majority (more than roughly 5/8) of all recorded wakeup events, do
         * not stop the tick.
         */
        if (drv->states[idx].target_residency_ns < TICK_NSEC &&
            tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
                duration_ns = TICK_NSEC / 2;

end:
        /*
         * Allow the tick to be stopped unless the selected state is a polling
         * one or the expected idle duration is shorter than the tick period
         * length.
         */
        if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
            duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
                return idx;

        /*
         * The tick is not going to be stopped, so if the target residency of
         * the state to be returned is not within the time till the closest
         * timer including the tick, try to correct that.
         */
        if (idx > idx0 &&
            drv->states[idx].target_residency_ns > delta_tick)
                idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);

out_tick:
        *stop_tick = false;
        return idx;
}

/**
 * teo_reflect - Note that governor data for the CPU need to be updated.
 * @dev: Target CPU.
 * @state: Entered state.
 */
static void teo_reflect(struct cpuidle_device *dev, int state)
{
        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

        dev->last_state_idx = state;
        /*
         * If the wakeup was not "natural", but triggered by one of the safety
         * nets, assume that the CPU might have been idle for the entire sleep
         * length time.
         */
        if (dev->poll_time_limit ||
            (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
                dev->poll_time_limit = false;
                cpu_data->time_span_ns = cpu_data->sleep_length_ns;
        } else {
                cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
        }
}

/**
 * teo_enable_device - Initialize the governor's data for the target CPU.
 * @drv: cpuidle driver (not used).
 * @dev: Target CPU.
 */
static int teo_enable_device(struct cpuidle_driver *drv,
                             struct cpuidle_device *dev)
{
        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

        memset(cpu_data, 0, sizeof(*cpu_data));

        return 0;
}

static struct cpuidle_governor teo_governor = {
        .name =    "teo",
        .rating =  19,
        .enable =  teo_enable_device,
        .select =  teo_select,
        .reflect = teo_reflect,
};

static int __init teo_governor_init(void)
{
        return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);