1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Timer events oriented CPU idle governor
4 *
5 * Copyright (C) 2018 - 2021 Intel Corporation
6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
7 */
8
9 /**
10 * DOC: teo-description
11 *
12 * The idea of this governor is based on the observation that on many systems
13 * timer interrupts are two or more orders of magnitude more frequent than any
14 * other interrupt types, so they are likely to dominate CPU wakeup patterns.
15 * Moreover, in principle, the time when the next timer event is going to occur
16 * can be determined at the idle state selection time, although doing that may
17 * be costly, so it can be regarded as the most reliable source of information
18 * for idle state selection.
19 *
20 * Of course, non-timer wakeup sources are more important in some use cases,
21 * but even then it is generally unnecessary to consider idle duration values
 * greater than the time till the next timer event, referred to as the sleep
23 * length in what follows, because the closest timer will ultimately wake up the
24 * CPU anyway unless it is woken up earlier.
25 *
26 * However, since obtaining the sleep length may be costly, the governor first
27 * checks if it can select a shallow idle state using wakeup pattern information
28 * from recent times, in which case it can do without knowing the sleep length
29 * at all. For this purpose, it counts CPU wakeup events and looks for an idle
30 * state whose target residency has not exceeded the idle duration (measured
31 * after wakeup) in the majority of relevant recent cases. If the target
32 * residency of that state is small enough, it may be used right away and the
33 * sleep length need not be determined.
34 *
35 * The computations carried out by this governor are based on using bins whose
36 * boundaries are aligned with the target residency parameter values of the CPU
37 * idle states provided by the %CPUIdle driver in the ascending order. That is,
38 * the first bin spans from 0 up to, but not including, the target residency of
39 * the second idle state (idle state 1), the second bin spans from the target
40 * residency of idle state 1 up to, but not including, the target residency of
41 * idle state 2, the third bin spans from the target residency of idle state 2
42 * up to, but not including, the target residency of idle state 3 and so on.
43 * The last bin spans from the target residency of the deepest idle state
44 * supplied by the driver to infinity.
45 *
46 * Two metrics called "hits" and "intercepts" are associated with each bin.
47 * They are updated every time before selecting an idle state for the given CPU
48 * in accordance with what happened last time.
49 *
50 * The "hits" metric reflects the relative frequency of situations in which the
51 * sleep length and the idle duration measured after CPU wakeup are close enough
52 * (that is, the CPU appears to wake up "on time" relative to the sleep length).
53 * In turn, the "intercepts" metric reflects the relative frequency of non-timer
54 * wakeup events for which the measured idle duration is significantly different
55 * from the sleep length (these events are also referred to as "intercepts"
56 * below).
57 *
58 * The governor also counts "intercepts" with the measured idle duration below
59 * the tick period length and uses this information when deciding whether or not
60 * to stop the scheduler tick.
61 *
62 * In order to select an idle state for a CPU, the governor takes the following
63 * steps (modulo the possible latency constraint that must be taken into account
64 * too):
65 *
66 * 1. Find the deepest enabled CPU idle state (the candidate idle state) and
67 * compute 2 sums as follows:
68 *
69 * - The sum of the "hits" metric for all of the idle states shallower than
70 * the candidate one (it represents the cases in which the CPU was likely
71 * woken up by a timer).
72 *
73 * - The sum of the "intercepts" metric for all of the idle states shallower
74 * than the candidate one (it represents the cases in which the CPU was
75 * likely woken up by a non-timer wakeup source).
76 *
77 * Also find the idle state with the maximum intercepts metric (if there are
78 * multiple states with the maximum intercepts metric, choose the one with
79 * the highest index).
80 *
81 * 2. If the second sum computed in step 1 is greater than a half of the sum of
82 * both metrics for the candidate state bin and all subsequent bins (if any),
83 * a shallower idle state is likely to be more suitable, so look for it.
84 *
85 * - Traverse the enabled idle states shallower than the candidate one in the
86 * descending order, starting at the state with the maximum intercepts
87 * metric found in step 1.
88 *
89 * - For each of them compute the sum of the "intercepts" metrics over all
90 * of the idle states between it and the candidate one (including the
91 * former and excluding the latter).
92 *
93 * - If this sum is greater than a half of the second sum computed in step 1,
94 * use the given idle state as the new candidate one.
95 *
96 * 3. If the current candidate state is state 0 or its target residency is short
97 * enough, return it and prevent the scheduler tick from being stopped.
98 *
99 * 4. Obtain the sleep length value and check if it is below the target
100 * residency of the current candidate state, in which case a new shallower
101 * candidate state needs to be found, so look for it.
102 */
103
104 #include <linux/cpuidle.h>
105 #include <linux/jiffies.h>
106 #include <linux/kernel.h>
107 #include <linux/sched/clock.h>
108 #include <linux/tick.h>
109
110 #include "gov.h"
111
112 /*
113 * Idle state exit latency threshold used for deciding whether or not to check
114 * the time till the closest expected timer event.
115 */
116 #define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2)
117
118 /*
119 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
120 * is used for decreasing metrics on a regular basis.
121 */
122 #define PULSE 1024
123 #define DECAY_SHIFT 3
124
125 /**
126 * struct teo_bin - Metrics used by the TEO cpuidle governor.
127 * @intercepts: The "intercepts" metric.
128 * @hits: The "hits" metric.
129 */
130 struct teo_bin {
131 unsigned int intercepts;
132 unsigned int hits;
133 };
134
135 /**
136 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
137 * @sleep_length_ns: Time till the closest timer event (at the selection time).
138 * @state_bins: Idle state data bins for this CPU.
139 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
140 * @total_tick: Wakeups by the scheduler tick.
141 * @tick_intercepts: "Intercepts" before TICK_NSEC.
142 * @short_idles: Wakeups after short idle periods.
143 * @tick_wakeup: Set if the last wakeup was by the scheduler tick.
144 */
145 struct teo_cpu {
146 s64 sleep_length_ns;
147 struct teo_bin state_bins[CPUIDLE_STATE_MAX];
148 unsigned int total;
149 unsigned int total_tick;
150 unsigned int tick_intercepts;
151 unsigned int short_idles;
152 bool tick_wakeup;
153 };
154
155 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
156
teo_decay(unsigned int * metric)157 static void teo_decay(unsigned int *metric)
158 {
159 unsigned int delta = *metric >> DECAY_SHIFT;
160
161 if (delta)
162 *metric -= delta;
163 else
164 *metric = 0;
165 }
166
167 /**
168 * teo_update - Update CPU metrics after wakeup.
169 * @drv: cpuidle driver containing state data.
170 * @dev: Target CPU.
171 */
teo_update(struct cpuidle_driver * drv,struct cpuidle_device * dev)172 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
173 {
174 s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
175 struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
176 int i, idx_timer = 0, idx_duration = 0;
177 s64 target_residency_ns, measured_ns;
178 unsigned int total = 0;
179
180 teo_decay(&cpu_data->short_idles);
181
182 if (dev->poll_time_limit) {
183 dev->poll_time_limit = false;
184 /*
185 * Polling state timeout has triggered, so assume that this
186 * might have been a long sleep.
187 */
188 measured_ns = S64_MAX;
189 } else {
190 measured_ns = dev->last_residency_ns;
191 /*
192 * The delay between the wakeup and the first instruction
193 * executed by the CPU is not likely to be worst-case every
194 * time, so take 1/2 of the exit latency as a very rough
195 * approximation of the average of it.
196 */
197 if (measured_ns >= lat_ns) {
198 measured_ns -= lat_ns / 2;
199 if (measured_ns < RESIDENCY_THRESHOLD_NS)
200 cpu_data->short_idles += PULSE;
201 } else {
202 measured_ns /= 2;
203 cpu_data->short_idles += PULSE;
204 }
205 }
206
207 /*
208 * Decay the "hits" and "intercepts" metrics for all of the bins and
209 * find the bins that the sleep length and the measured idle duration
210 * fall into.
211 */
212 for (i = 0; i < drv->state_count; i++) {
213 struct teo_bin *bin = &cpu_data->state_bins[i];
214
215 teo_decay(&bin->hits);
216 total += bin->hits;
217 teo_decay(&bin->intercepts);
218 total += bin->intercepts;
219
220 target_residency_ns = drv->states[i].target_residency_ns;
221
222 if (target_residency_ns <= cpu_data->sleep_length_ns) {
223 idx_timer = i;
224 if (target_residency_ns <= measured_ns)
225 idx_duration = i;
226 }
227 }
228
229 cpu_data->total = total + PULSE;
230
231 teo_decay(&cpu_data->tick_intercepts);
232
233 teo_decay(&cpu_data->total_tick);
234 if (cpu_data->tick_wakeup) {
235 cpu_data->total_tick += PULSE;
236 /*
237 * If tick wakeups dominate the wakeup pattern, count this one
238 * as a hit on the deepest available idle state to increase the
239 * likelihood of stopping the tick.
240 */
241 if (3 * cpu_data->total_tick > 2 * cpu_data->total) {
242 cpu_data->state_bins[drv->state_count-1].hits += PULSE;
243 return;
244 }
245 /*
246 * If intercepts within the tick period range are not frequent
247 * enough, count this wakeup as a hit, since it is likely that
248 * the tick has woken up the CPU because an expected intercept
249 * was not there. Otherwise, one of the intercepts may have
250 * been incidentally preceded by the tick wakeup.
251 */
252 if (3 * cpu_data->tick_intercepts < 2 * total) {
253 cpu_data->state_bins[idx_timer].hits += PULSE;
254 return;
255 }
256 }
257
258 /*
259 * If the measured idle duration (adjusted for the entered state exit
260 * latency) falls into the same bin as the sleep length and the latter
261 * is less than the "raw" measured idle duration (so the wakeup appears
262 * to have occurred after the anticipated timer event), this is a "hit",
263 * so update the "hits" metric for that bin.
264 *
265 * Otherwise, update the "intercepts" metric for the bin fallen into by
266 * the measured idle duration.
267 */
268 if (idx_timer == idx_duration &&
269 cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) {
270 cpu_data->state_bins[idx_timer].hits += PULSE;
271 } else {
272 cpu_data->state_bins[idx_duration].intercepts += PULSE;
273 if (measured_ns <= TICK_NSEC)
274 cpu_data->tick_intercepts += PULSE;
275 }
276 }
277
278 /**
279 * teo_find_shallower_state - Find shallower idle state matching given duration.
280 * @drv: cpuidle driver containing state data.
281 * @dev: Target CPU.
282 * @state_idx: Index of the capping idle state.
283 * @duration_ns: Idle duration value to match.
284 */
teo_find_shallower_state(struct cpuidle_driver * drv,struct cpuidle_device * dev,int state_idx,s64 duration_ns)285 static int teo_find_shallower_state(struct cpuidle_driver *drv,
286 struct cpuidle_device *dev, int state_idx,
287 s64 duration_ns)
288 {
289 int i;
290
291 for (i = state_idx - 1; i >= 0; i--) {
292 if (dev->states_usage[i].disable)
293 continue;
294
295 state_idx = i;
296 if (drv->states[i].target_residency_ns <= duration_ns)
297 break;
298 }
299 return state_idx;
300 }
301
302 /**
303 * teo_select - Selects the next idle state to enter.
304 * @drv: cpuidle driver containing state data.
305 * @dev: Target CPU.
306 * @stop_tick: Indication on whether or not to stop the scheduler tick.
307 */
teo_select(struct cpuidle_driver * drv,struct cpuidle_device * dev,bool * stop_tick)308 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
309 bool *stop_tick)
310 {
311 struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
312 s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
313 ktime_t delta_tick = TICK_NSEC / 2;
314 unsigned int idx_intercept_sum = 0;
315 unsigned int intercept_sum = 0;
316 unsigned int intercept_max = 0;
317 unsigned int idx_hit_sum = 0;
318 unsigned int hit_sum = 0;
319 int intercept_max_idx = -1;
320 int constraint_idx = 0;
321 int idx0 = 0, idx = -1;
322 s64 duration_ns;
323 int i;
324
325 if (dev->last_state_idx >= 0) {
326 teo_update(drv, dev);
327 dev->last_state_idx = -1;
328 }
329
330 /*
331 * Set the sleep length to infinity in case the invocation of
332 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
333 * be known whether or not the subsequent wakeup is caused by a timer.
334 * It is generally fine to count the wakeup as an intercept then, except
335 * for the cases when the CPU is mostly woken up by timers and there may
336 * be opportunities to ask for a deeper idle state when no imminent
337 * timers are scheduled which may be missed.
338 */
339 cpu_data->sleep_length_ns = KTIME_MAX;
340
341 if (!dev->states_usage[0].disable)
342 idx = 0;
343
344 /*
345 * Compute the sums of metrics for early wakeup pattern detection and
346 * look for the state bin with the maximum intercepts metric below the
347 * deepest enabled one (if there are multiple states with the maximum
348 * intercepts metric, choose the one with the highest index).
349 */
350 for (i = 1; i < drv->state_count; i++) {
351 struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
352 unsigned int prev_intercepts = prev_bin->intercepts;
353 struct cpuidle_state *s = &drv->states[i];
354
355 /*
356 * Update the sums of idle state metrics for all of the states
357 * shallower than the current one.
358 */
359 hit_sum += prev_bin->hits;
360 intercept_sum += prev_intercepts;
361 /*
362 * Check if this is the bin with the maximum number of
363 * intercepts so far and in that case update the index of
364 * the state with the maximum intercepts metric.
365 */
366 if (prev_intercepts >= intercept_max) {
367 intercept_max = prev_intercepts;
368 intercept_max_idx = i - 1;
369 }
370
371 if (dev->states_usage[i].disable)
372 continue;
373
374 if (idx < 0)
375 idx0 = i; /* first enabled state */
376
377 idx = i;
378
379 if (s->exit_latency_ns <= latency_req)
380 constraint_idx = i;
381
382 /* Save the sums for the current state. */
383 idx_intercept_sum = intercept_sum;
384 idx_hit_sum = hit_sum;
385 }
386
387 /* Avoid unnecessary overhead. */
388 if (idx < 0) {
389 idx = 0; /* No states enabled, must use 0. */
390 goto out_tick;
391 }
392
393 if (idx == idx0) {
394 /*
395 * Only one idle state is enabled, so use it, but do not
396 * allow the tick to be stopped it is shallow enough.
397 */
398 duration_ns = drv->states[idx].target_residency_ns;
399 goto end;
400 }
401
402 /*
403 * If the sum of the intercepts metric for all of the idle states
404 * shallower than the current candidate one (idx) is greater than the
405 * sum of the intercepts and hits metrics for the candidate state and
406 * all of the deeper states, a shallower idle state is likely to be a
407 * better choice.
408 */
409 if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
410 int min_idx = idx0;
411
412 if (tick_nohz_tick_stopped()) {
413 /*
414 * Look for the shallowest idle state below the current
415 * candidate one whose target residency is at least
416 * equal to the tick period length.
417 */
418 while (min_idx < idx &&
419 drv->states[min_idx].target_residency_ns < TICK_NSEC)
420 min_idx++;
421
422 /*
423 * Avoid selecting a state with a lower index, but with
424 * the same target residency as the current candidate
425 * one.
426 */
427 if (drv->states[min_idx].target_residency_ns ==
428 drv->states[idx].target_residency_ns)
429 goto constraint;
430 }
431
432 /*
433 * If the minimum state index is greater than or equal to the
434 * index of the state with the maximum intercepts metric and
435 * the corresponding state is enabled, there is no need to look
436 * at the deeper states.
437 */
438 if (min_idx >= intercept_max_idx &&
439 !dev->states_usage[min_idx].disable) {
440 idx = min_idx;
441 goto constraint;
442 }
443
444 /*
445 * Look for the deepest enabled idle state, at most as deep as
446 * the one with the maximum intercepts metric, whose target
447 * residency had not been greater than the idle duration in over
448 * a half of the relevant cases in the past.
449 *
450 * Take the possible duration limitation present if the tick
451 * has been stopped already into account.
452 */
453 for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) {
454 intercept_sum += cpu_data->state_bins[i].intercepts;
455
456 if (dev->states_usage[i].disable)
457 continue;
458
459 idx = i;
460 if (2 * intercept_sum > idx_intercept_sum &&
461 i <= intercept_max_idx)
462 break;
463 }
464 }
465
466 constraint:
467 /*
468 * If there is a latency constraint, it may be necessary to select an
469 * idle state shallower than the current candidate one.
470 */
471 if (idx > constraint_idx)
472 idx = constraint_idx;
473
474 /*
475 * If either the candidate state is state 0 or its target residency is
476 * low enough, there is basically nothing more to do, but if the sleep
477 * length is not updated, the subsequent wakeup will be counted as an
478 * "intercept" which may be problematic in the cases when timer wakeups
479 * are dominant. Namely, it may effectively prevent deeper idle states
480 * from being selected at one point even if no imminent timers are
481 * scheduled.
482 *
483 * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
484 * CPU are unlikely (user space has a default 50 us slack value for
485 * hrtimers and there are relatively few timers with a lower deadline
486 * value in the kernel), and even if they did happen, the potential
487 * benefit from using a deep idle state in that case would be
488 * questionable anyway for latency reasons. Thus if the measured idle
489 * duration falls into that range in the majority of cases, assume
490 * non-timer wakeups to be dominant and skip updating the sleep length
491 * to reduce latency.
492 *
493 * Also, if the latency constraint is sufficiently low, it will force
494 * shallow idle states regardless of the wakeup type, so the sleep
495 * length need not be known in that case.
496 */
497 if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
498 (2 * cpu_data->short_idles >= cpu_data->total ||
499 latency_req < LATENCY_THRESHOLD_NS))
500 goto out_tick;
501
502 duration_ns = tick_nohz_get_sleep_length(&delta_tick);
503 cpu_data->sleep_length_ns = duration_ns;
504
505 if (!idx)
506 goto out_tick;
507
508 /*
509 * If the closest expected timer is before the target residency of the
510 * candidate state, a shallower one needs to be found.
511 */
512 if (drv->states[idx].target_residency_ns > duration_ns)
513 idx = teo_find_shallower_state(drv, dev, idx, duration_ns);
514
515 /*
516 * If the selected state's target residency is below the tick length
517 * and intercepts occurring before the tick length are the majority of
518 * total wakeup events, do not stop the tick.
519 */
520 if (drv->states[idx].target_residency_ns < TICK_NSEC &&
521 3 * cpu_data->tick_intercepts >= 2 * cpu_data->total)
522 duration_ns = TICK_NSEC / 2;
523
524 end:
525 /*
526 * Allow the tick to be stopped unless the selected state is a polling
527 * one or the expected idle duration is shorter than the tick period
528 * length.
529 */
530 if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
531 duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
532 return idx;
533
534 /*
535 * The tick is not going to be stopped, so if the target residency of
536 * the state to be returned is not within the time till the closest
537 * timer including the tick, try to correct that.
538 */
539 if (idx > idx0 &&
540 drv->states[idx].target_residency_ns > delta_tick)
541 idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
542
543 out_tick:
544 *stop_tick = false;
545 return idx;
546 }
547
548 /**
549 * teo_reflect - Note that governor data for the CPU need to be updated.
550 * @dev: Target CPU.
551 * @state: Entered state.
552 */
teo_reflect(struct cpuidle_device * dev,int state)553 static void teo_reflect(struct cpuidle_device *dev, int state)
554 {
555 struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
556
557 cpu_data->tick_wakeup = tick_nohz_idle_got_tick();
558
559 dev->last_state_idx = state;
560 }
561
562 /**
563 * teo_enable_device - Initialize the governor's data for the target CPU.
564 * @drv: cpuidle driver (not used).
565 * @dev: Target CPU.
566 */
teo_enable_device(struct cpuidle_driver * drv,struct cpuidle_device * dev)567 static int teo_enable_device(struct cpuidle_driver *drv,
568 struct cpuidle_device *dev)
569 {
570 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
571
572 memset(cpu_data, 0, sizeof(*cpu_data));
573
574 return 0;
575 }
576
577 static struct cpuidle_governor teo_governor = {
578 .name = "teo",
579 .rating = 19,
580 .enable = teo_enable_device,
581 .select = teo_select,
582 .reflect = teo_reflect,
583 };
584
teo_governor_init(void)585 static int __init teo_governor_init(void)
586 {
587 return cpuidle_register_governor(&teo_governor);
588 }
589
590 postcore_initcall(teo_governor_init);
591