// SPDX-License-Identifier: GPL-2.0
/*
 * Timer events oriented CPU idle governor
 *
 * Copyright (C) 2018 - 2021 Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

/**
 * DOC: teo-description
 *
 * The idea of this governor is based on the observation that on many systems
 * timer events are two or more orders of magnitude more frequent than any
 * other interrupts, so they are likely to be the most significant cause of CPU
 * wakeups from idle states.  Moreover, information about what happened in the
 * (relatively recent) past can be used to estimate whether or not the deepest
 * idle state with target residency within the (known) time till the closest
 * timer event, referred to as the sleep length, is likely to be suitable for
 * the upcoming CPU idle period and, if not, then which of the shallower idle
 * states to choose instead of it.
 *
 * Of course, non-timer wakeup sources are more important in some use cases
 * which can be covered by taking a few most recent idle time intervals of the
 * CPU into account.  However, even in that context it is not necessary to
 * consider idle duration values greater than the sleep length, because the
 * closest timer will ultimately wake up the CPU anyway unless it is woken up
 * earlier.
 *
 * Thus this governor estimates whether or not the prospective idle duration of
 * a CPU is likely to be significantly shorter than the sleep length and selects
 * an idle state for it accordingly.
 *
 * The computations carried out by this governor are based on using bins whose
 * boundaries are aligned with the target residency parameter values of the CPU
 * idle states provided by the %CPUIdle driver in the ascending order.  That is,
 * the first bin spans from 0 up to, but not including, the target residency of
 * the second idle state (idle state 1), the second bin spans from the target
 * residency of idle state 1 up to, but not including, the target residency of
 * idle state 2, the third bin spans from the target residency of idle state 2
 * up to, but not including, the target residency of idle state 3 and so on.
 * The last bin spans from the target residency of the deepest idle state
 * supplied by the driver to infinity.
 *
 * Two metrics called "hits" and "intercepts" are associated with each bin.
 * They are updated every time before selecting an idle state for the given CPU
 * in accordance with what happened last time.
 *
 * The "hits" metric reflects the relative frequency of situations in which the
 * sleep length and the idle duration measured after CPU wakeup fall into the
 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
 * length).  In turn, the "intercepts" metric reflects the relative frequency of
 * situations in which the measured idle duration is so much shorter than the
 * sleep length that the bin it falls into corresponds to an idle state
 * shallower than the one whose bin is fallen into by the sleep length (these
 * situations are referred to as "intercepts" below).
 *
 * In order to select an idle state for a CPU, the governor takes the following
 * steps (modulo the possible latency constraint that must be taken into account
 * too):
 *
 * 1. Find the deepest CPU idle state whose target residency does not exceed
 *    the current sleep length (the candidate idle state) and compute 2 sums as
 *    follows:
 *
 *    - The sum of the "hits" and "intercepts" metrics for the candidate state
 *      and all of the deeper idle states (it represents the cases in which the
 *      CPU was idle long enough to avoid being intercepted if the sleep length
 *      had been equal to the current one).
 *
 *    - The sum of the "intercepts" metrics for all of the idle states shallower
 *      than the candidate one (it represents the cases in which the CPU was not
 *      idle long enough to avoid being intercepted if the sleep length had been
 *      equal to the current one).
 *
 * 2. If the second sum is greater than the first one, the CPU is likely to wake
 *    up early, so look for an alternative idle state to select.
 *
 *    - Traverse the idle states shallower than the candidate one in the
 *      descending order.
 *
 *    - For each of them compute the sum of the "intercepts" metrics over all
 *      of the idle states between it and the candidate one (including the
 *      former and excluding the latter).
 *
 *    - If each of these sums that needs to be taken into account (because the
 *      check related to it has indicated that the CPU is likely to wake up
 *      early) is greater than a half of the corresponding sum computed in step
 *      1 (which means that the target residency of the state in question had
 *      not exceeded the idle duration in over a half of the relevant cases),
 *      select the given idle state instead of the candidate one.
 *
 * 3. By default, select the candidate state.
 */
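
/*
 * Worked example (illustrative numbers only): consider a driver exposing
 * three idle states with target residencies of 0, 2 us and 100 us.  The bins
 * are then [0, 2 us), [2 us, 100 us) and [100 us, infinity).  Suppose that
 * the sleep length is 500 us, so the candidate state is state 2, and that the
 * decayed metrics currently are:
 *
 *   bin 0: intercepts = 300
 *   bin 1: intercepts = 900
 *   bin 2: hits = 400, intercepts = 100
 *
 * Step 1 yields 400 + 100 = 500 for the candidate state and the deeper ones
 * and 300 + 900 = 1200 for the shallower ones.  Since 1200 > 500, step 2
 * walks down from state 1: its running intercepts sum is 900 and 2 * 900 >
 * 1200, so state 1 is selected instead of the candidate state (provided that
 * it is enabled and deep enough with respect to the tick).
 */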

#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/tick.h>

#include "gov.h"

/*
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
 * is used for decreasing metrics on a regular basis.
 */
#define PULSE		1024
#define DECAY_SHIFT	3
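
/*
 * Illustrative sketch (not part of the governor): the effect of the decay and
 * pulse operations on a single metric.  With DECAY_SHIFT equal to 3, every
 * update removes 1/8 of the metric, so a bin that gets a PULSE on each update
 * converges to roughly PULSE << DECAY_SHIFT (8192), because x - x / 8 + 1024
 * equals x at x = 8192.  The metrics thus remain bounded without any explicit
 * clamping.
 */
static inline unsigned int teo_metric_step(unsigned int metric, bool reinforce)
{
	metric -= metric >> DECAY_SHIFT;	/* regular decay */
	if (reinforce)
		metric += PULSE;		/* this bin "won" this time */
	return metric;
}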

/**
 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 * @intercepts: The "intercepts" metric.
 * @hits: The "hits" metric.
 */
struct teo_bin {
	unsigned int intercepts;
	unsigned int hits;
};

/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @time_span_ns: Time between idle state selection and post-wakeup update.
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 * @state_bins: Idle state data bins for this CPU.
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
 * @tick_hits: Number of "hits" after TICK_NSEC.
 */
struct teo_cpu {
	s64 time_span_ns;
	s64 sleep_length_ns;
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
	unsigned int total;
	unsigned int tick_hits;
};

static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);

/**
 * teo_update - Update CPU metrics after wakeup.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	int i, idx_timer = 0, idx_duration = 0;
	s64 target_residency_ns;
	u64 measured_ns;

	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
		/*
		 * One of the safety nets has triggered or the wakeup was close
		 * enough to the closest timer event expected at the idle state
		 * selection time to be discarded.
		 */
		measured_ns = U64_MAX;
	} else {
		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;

		/*
		 * The computations below are to determine whether or not the
		 * (saved) time till the next timer event and the measured idle
		 * duration fall into the same "bin", so use last_residency_ns
		 * for that instead of time_span_ns which includes the cpuidle
		 * overhead.
		 */
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
		 * executed by the CPU is not likely to be worst-case every
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
		if (measured_ns >= lat_ns)
			measured_ns -= lat_ns / 2;
		else
			measured_ns /= 2;
	}

	cpu_data->total = 0;

	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
	 * fall into.
	 */
	for (i = 0; i < drv->state_count; i++) {
		struct teo_bin *bin = &cpu_data->state_bins[i];

		bin->hits -= bin->hits >> DECAY_SHIFT;
		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;

		cpu_data->total += bin->hits + bin->intercepts;

		target_residency_ns = drv->states[i].target_residency_ns;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
			idx_timer = i;
			if (target_residency_ns <= measured_ns)
				idx_duration = i;
		}
	}

	/*
	 * If the deepest state's target residency is below the tick length,
	 * make a record of it to help teo_select() decide whether or not
	 * to stop the tick.  This effectively adds an extra hits-only bin
	 * beyond the last state-related one.
	 */
	if (target_residency_ns < TICK_NSEC) {
		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;

		cpu_data->total += cpu_data->tick_hits;

		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
			idx_timer = drv->state_count;
			if (TICK_NSEC <= measured_ns) {
				cpu_data->tick_hits += PULSE;
				goto end;
			}
		}
	}

	/*
	 * If the measured idle duration falls into the same bin as the sleep
	 * length, this is a "hit", so update the "hits" metric for that bin.
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
	if (idx_timer == idx_duration)
		cpu_data->state_bins[idx_timer].hits += PULSE;
	else
		cpu_data->state_bins[idx_duration].intercepts += PULSE;

end:
	cpu_data->total += PULSE;
}
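/**
 * teo_state_ok - Check if selecting a given idle state is currently suitable.
 * @i: Index of the idle state to check.
 * @drv: cpuidle driver containing state data.
 *
 * If the scheduler tick has been stopped, there may be no timer to wake up the
 * CPU for a long time, so only states whose target residency is at least the
 * tick period length are considered suitable in that case.
 */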
static bool teo_state_ok(int i, struct cpuidle_driver *drv)
{
	return !tick_nohz_tick_stopped() ||
		drv->states[i].target_residency_ns >= TICK_NSEC;
}

/**
 * teo_find_shallower_state - Find shallower idle state matching given duration.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @state_idx: Index of the capping idle state.
 * @duration_ns: Idle duration value to match.
 * @no_poll: Don't consider polling states.
 */
static int teo_find_shallower_state(struct cpuidle_driver *drv,
				    struct cpuidle_device *dev, int state_idx,
				    s64 duration_ns, bool no_poll)
{
	int i;

	for (i = state_idx - 1; i >= 0; i--) {
		if (dev->states_usage[i].disable ||
				(no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
			continue;

		state_idx = i;
		if (drv->states[i].target_residency_ns <= duration_ns)
			break;
	}
	return state_idx;
}

/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      bool *stop_tick)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	ktime_t delta_tick = TICK_NSEC / 2;
	unsigned int tick_intercept_sum = 0;
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
	int prev_intercept_idx;
	s64 duration_ns;
	int i;

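	/* Process the wakeup from the idle state entered last time, if any. */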
	if (dev->last_state_idx >= 0) {
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

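	/*
	 * Record the idle state selection time stamp.  It will be used by
	 * teo_reflect() to compute the measured time span unless one of the
	 * safety nets triggers.
	 */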
	cpu_data->time_span_ns = local_clock();
	/*
	 * Set the expected sleep length to infinity in case of an early
	 * return.
	 */
	cpu_data->sleep_length_ns = KTIME_MAX;

	/* Check if there is any choice in the first place. */
	if (drv->state_count < 2) {
		idx = 0;
		goto out_tick;
	}

	if (!dev->states_usage[0].disable)
		idx = 0;

	/* Compute the sums of metrics for early wakeup pattern detection. */
	for (i = 1; i < drv->state_count; i++) {
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * Update the sums of idle state metrics for all of the states
		 * shallower than the current one.
		 */
		intercept_sum += prev_bin->intercepts;
		hit_sum += prev_bin->hits;

		if (dev->states_usage[i].disable)
			continue;

		if (idx < 0)
			idx0 = i; /* first enabled state */

		idx = i;

		if (s->exit_latency_ns <= latency_req)
			constraint_idx = i;

		/* Save the sums for the current state. */
		idx_intercept_sum = intercept_sum;
		idx_hit_sum = hit_sum;
	}

	/* Avoid unnecessary overhead. */
	if (idx < 0) {
		idx = 0; /* No states enabled, must use 0. */
		goto out_tick;
	}

	if (idx == idx0) {
		/*
		 * Only one idle state is enabled, so use it, but do not allow
		 * the tick to be stopped if it is too shallow.
		 */
		duration_ns = drv->states[idx].target_residency_ns;
		goto end;
	}
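	/*
	 * Sum of the intercepts metrics over all of the state bins; it is used
	 * below to decide whether or not to stop the tick.
	 */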
	tick_intercept_sum = intercept_sum +
			cpu_data->state_bins[drv->state_count-1].intercepts;

	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
	 * all of the deeper states, a shallower idle state is likely to be a
	 * better choice.
	 */
	prev_intercept_idx = idx;
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
		int first_suitable_idx = idx;

		/*
		 * Look for the deepest idle state whose target residency had
		 * not exceeded the idle duration in over a half of the relevant
		 * cases in the past.
		 *
		 * Take the possible duration limitation present if the tick
		 * has been stopped already into account.
		 */
		intercept_sum = 0;

		for (i = idx - 1; i >= 0; i--) {
			struct teo_bin *bin = &cpu_data->state_bins[i];

			intercept_sum += bin->intercepts;

			if (2 * intercept_sum > idx_intercept_sum) {
				/*
				 * Use the current state unless it is too
				 * shallow or disabled, in which case take the
				 * first enabled state that is deep enough.
				 */
				if (teo_state_ok(i, drv) &&
				    !dev->states_usage[i].disable)
					idx = i;
				else
					idx = first_suitable_idx;

				break;
			}

			if (dev->states_usage[i].disable)
				continue;

			if (!teo_state_ok(i, drv)) {
				/*
				 * The current state is too shallow, but if an
				 * alternative candidate state has been found,
				 * it may still turn out to be a better choice.
				 */
				if (first_suitable_idx != idx)
					continue;

				break;
			}

			first_suitable_idx = i;
		}
	}
	if (!idx && prev_intercept_idx) {
		/*
		 * We have to query the sleep length here otherwise we don't
		 * know after wakeup if our guess was correct.
		 */
		duration_ns = tick_nohz_get_sleep_length(&delta_tick);
		cpu_data->sleep_length_ns = duration_ns;
		goto out_tick;
	}

	/*
	 * If there is a latency constraint, it may be necessary to select an
	 * idle state shallower than the current candidate one.
	 */
	if (idx > constraint_idx)
		idx = constraint_idx;

	/*
	 * Skip the timers check if state 0 is the current candidate one,
	 * because an immediate non-timer wakeup is expected in that case.
	 */
	if (!idx)
		goto out_tick;

	/*
	 * If state 0 is a polling one, check if the target residency of
	 * the current candidate state is low enough and skip the timers
	 * check in that case too.
	 */
	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
		goto out_tick;

	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

	/*
	 * If the closest expected timer is before the target residency of the
	 * candidate state, a shallower one needs to be found.
	 */
	if (drv->states[idx].target_residency_ns > duration_ns) {
		i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
		if (teo_state_ok(i, drv))
			idx = i;
	}

	/*
	 * If the selected state's target residency is below the tick length
	 * and intercepts occurring before the tick length are the majority of
	 * total wakeup events, do not stop the tick.
	 */
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
		duration_ns = TICK_NSEC / 2;

end:
	/*
	 * Allow the tick to be stopped unless the selected state is a polling
	 * one or the expected idle duration is shorter than the tick period
	 * length.
	 */
	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
		return idx;

	/*
	 * The tick is not going to be stopped, so if the target residency of
	 * the state to be returned is not within the time till the closest
	 * timer including the tick, try to correct that.
	 */
	if (idx > idx0 &&
	    drv->states[idx].target_residency_ns > delta_tick)
		idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);

out_tick:
	*stop_tick = false;
	return idx;
}

/**
 * teo_reflect - Note that governor data for the CPU need to be updated.
 * @dev: Target CPU.
 * @state: Entered state.
 */
static void teo_reflect(struct cpuidle_device *dev, int state)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	dev->last_state_idx = state;
	/*
	 * If the wakeup was not "natural", but triggered by one of the safety
	 * nets, assume that the CPU might have been idle for the entire sleep
	 * length time.
	 */
	if (dev->poll_time_limit ||
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
		dev->poll_time_limit = false;
		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
	} else {
		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
	}
}

/**
 * teo_enable_device - Initialize the governor's data for the target CPU.
 * @drv: cpuidle driver (not used).
 * @dev: Target CPU.
 */
static int teo_enable_device(struct cpuidle_driver *drv,
			     struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	memset(cpu_data, 0, sizeof(*cpu_data));

	return 0;
}

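/*
 * teo_enable_device() resets the per-CPU metrics, teo_select() picks the idle
 * state and teo_reflect() records the data needed to update the metrics on
 * the next selection.  The rating of 19 typically keeps this governor below
 * the menu governor (rating 20), so it normally has to be selected explicitly,
 * for example via the cpuidle.governor= command line parameter.
 */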
static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};

static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);