xref: /linux/drivers/cpuidle/governors/teo.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Timer events oriented CPU idle governor
4  *
5  * Copyright (C) 2018 - 2021 Intel Corporation
6  * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
7  */
8 
9 /**
10  * DOC: teo-description
11  *
12  * The idea of this governor is based on the observation that on many systems
13  * timer interrupts are two or more orders of magnitude more frequent than any
14  * other interrupt types, so they are likely to dominate CPU wakeup patterns.
15  * Moreover, in principle, the time when the next timer event is going to occur
16  * can be determined at the idle state selection time, although doing that may
17  * be costly, so it can be regarded as the most reliable source of information
18  * for idle state selection.
19  *
20  * Of course, non-timer wakeup sources are more important in some use cases,
21  * but even then it is generally unnecessary to consider idle duration values
 22  * greater than the time till the next timer event, referred to as the sleep
 23  * length in what follows, because the closest timer will ultimately wake up the
24  * CPU anyway unless it is woken up earlier.
25  *
26  * However, since obtaining the sleep length may be costly, the governor first
27  * checks if it can select a shallow idle state using wakeup pattern information
28  * from recent times, in which case it can do without knowing the sleep length
29  * at all.  For this purpose, it counts CPU wakeup events and looks for an idle
30  * state whose target residency has not exceeded the idle duration (measured
31  * after wakeup) in the majority of relevant recent cases.  If the target
32  * residency of that state is small enough, it may be used right away and the
33  * sleep length need not be determined.
34  *
35  * The computations carried out by this governor are based on using bins whose
36  * boundaries are aligned with the target residency parameter values of the CPU
37  * idle states provided by the %CPUIdle driver in the ascending order.  That is,
38  * the first bin spans from 0 up to, but not including, the target residency of
39  * the second idle state (idle state 1), the second bin spans from the target
40  * residency of idle state 1 up to, but not including, the target residency of
41  * idle state 2, the third bin spans from the target residency of idle state 2
42  * up to, but not including, the target residency of idle state 3 and so on.
43  * The last bin spans from the target residency of the deepest idle state
44  * supplied by the driver to infinity.
45  *
46  * Two metrics called "hits" and "intercepts" are associated with each bin.
47  * They are updated every time before selecting an idle state for the given CPU
48  * in accordance with what happened last time.
49  *
50  * The "hits" metric reflects the relative frequency of situations in which the
51  * sleep length and the idle duration measured after CPU wakeup are close enough
52  * (that is, the CPU appears to wake up "on time" relative to the sleep length).
53  * In turn, the "intercepts" metric reflects the relative frequency of non-timer
54  * wakeup events for which the measured idle duration is significantly different
55  * from the sleep length (these events are also referred to as "intercepts"
56  * below).
57  *
58  * The governor also counts "intercepts" with the measured idle duration below
59  * the tick period length and uses this information when deciding whether or not
60  * to stop the scheduler tick.
61  *
62  * In order to select an idle state for a CPU, the governor takes the following
63  * steps (modulo the possible latency constraint that must be taken into account
64  * too):
65  *
66  * 1. Find the deepest enabled CPU idle state (the candidate idle state) and
67  *    compute 2 sums as follows:
68  *
69  *    - The sum of the "hits" metric for all of the idle states shallower than
70  *      the candidate one (it represents the cases in which the CPU was likely
71  *      woken up by a timer).
72  *
73  *    - The sum of the "intercepts" metric for all of the idle states shallower
74  *      than the candidate one (it represents the cases in which the CPU was
75  *      likely woken up by a non-timer wakeup source).
76  *
77  *    Also find the idle state with the maximum intercepts metric (if there are
78  *    multiple states with the maximum intercepts metric, choose the one with
79  *    the highest index).
80  *
81  * 2. If the second sum computed in step 1 is greater than a half of the sum of
82  *    both metrics for the candidate state bin and all subsequent bins (if any),
83  *    a shallower idle state is likely to be more suitable, so look for it.
84  *
85  *    - Traverse the enabled idle states shallower than the candidate one in the
86  *      descending order, starting at the state with the maximum intercepts
87  *      metric found in step 1.
88  *
89  *    - For each of them compute the sum of the "intercepts" metrics over all
90  *      of the idle states between it and the candidate one (including the
91  *      former and excluding the latter).
92  *
93  *    - If this sum is greater than a half of the second sum computed in step 1,
94  *      use the given idle state as the new candidate one.
95  *
96  * 3. If the current candidate state is state 0 or its target residency is short
97  *    enough, return it and prevent the scheduler tick from being stopped.
98  *
99  * 4. Obtain the sleep length value and check if it is below the target
100  *    residency of the current candidate state, in which case a new shallower
101  *    candidate state needs to be found, so look for it.
102  */
103 
104 #include <linux/cpuidle.h>
105 #include <linux/jiffies.h>
106 #include <linux/kernel.h>
107 #include <linux/sched/clock.h>
108 #include <linux/tick.h>
109 
110 #include "gov.h"
111 
112 /*
113  * Idle state exit latency threshold used for deciding whether or not to check
114  * the time till the closest expected timer event.
115  */
116 #define LATENCY_THRESHOLD_NS	(RESIDENCY_THRESHOLD_NS / 2)
117 
118 /*
119  * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
120  * is used for decreasing metrics on a regular basis.
121  */
122 #define PULSE		1024
123 #define DECAY_SHIFT	3
124 
/**
 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 * @intercepts: The "intercepts" metric (bumped by PULSE, aged by teo_decay()).
 * @hits: The "hits" metric (bumped by PULSE, aged by teo_decay()).
 */
struct teo_bin {
	unsigned int intercepts;
	unsigned int hits;
};
134 
/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 * @state_bins: Idle state data bins for this CPU.
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
 * @total_tick: Decaying count of wakeups caused by the scheduler tick.
 * @tick_intercepts: Decaying count of "intercepts" shorter than TICK_NSEC.
 * @short_idles: Decaying count of wakeups after short idle periods.
 * @tick_wakeup: Set if the last wakeup was by the scheduler tick.
 */
struct teo_cpu {
	s64 sleep_length_ns;
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
	unsigned int total;
	unsigned int total_tick;
	unsigned int tick_intercepts;
	unsigned int short_idles;
	bool tick_wakeup;
};
154 
155 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
156 
157 static void teo_decay(unsigned int *metric)
158 {
159 	unsigned int delta = *metric >> DECAY_SHIFT;
160 
161 	if (delta)
162 		*metric -= delta;
163 	else
164 		*metric = 0;
165 }
166 
/**
 * teo_update - Update CPU metrics after wakeup.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 *
 * Fold the outcome of the last idle period into the per-CPU "hits" and
 * "intercepts" metrics, using the idle duration measured after the wakeup
 * and the sleep length recorded at idle state selection time.
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;
	struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
	int i, idx_timer = 0, idx_duration = 0;
	s64 target_residency_ns, measured_ns;
	unsigned int total = 0;

	/* Age the short-idle counter before potentially bumping it below. */
	teo_decay(&cpu_data->short_idles);

	if (dev->poll_time_limit) {
		dev->poll_time_limit = false;
		/*
		 * Polling state timeout has triggered, so assume that this
		 * might have been a long sleep.
		 */
		measured_ns = S64_MAX;
	} else {
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
		 * executed by the CPU is not likely to be worst-case every
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
		if (measured_ns >= lat_ns) {
			measured_ns -= lat_ns / 2;
			if (measured_ns < RESIDENCY_THRESHOLD_NS)
				cpu_data->short_idles += PULSE;
		} else {
			/*
			 * The idle period was shorter than the exit latency,
			 * so the measurement is unreliable; halve it and
			 * count this as a short idle period.
			 */
			measured_ns /= 2;
			cpu_data->short_idles += PULSE;
		}
	}

	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
	 * fall into.
	 */
	for (i = 0; i < drv->state_count; i++) {
		struct teo_bin *bin = &cpu_data->state_bins[i];

		teo_decay(&bin->hits);
		total += bin->hits;
		teo_decay(&bin->intercepts);
		total += bin->intercepts;

		target_residency_ns = drv->states[i].target_residency_ns;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
			idx_timer = i;
			if (target_residency_ns <= measured_ns)
				idx_duration = i;
		}
	}

	/* Account for the PULSE added to one of the bins below. */
	cpu_data->total = total + PULSE;

	teo_decay(&cpu_data->tick_intercepts);

	teo_decay(&cpu_data->total_tick);
	if (cpu_data->tick_wakeup) {
		cpu_data->total_tick += PULSE;
		/*
		 * If tick wakeups dominate the wakeup pattern, count this one
		 * as a hit on the deepest available idle state to increase the
		 * likelihood of stopping the tick.
		 */
		if (3 * cpu_data->total_tick > 2 * cpu_data->total) {
			cpu_data->state_bins[drv->state_count-1].hits += PULSE;
			return;
		}
		/*
		 * If intercepts within the tick period range are not frequent
		 * enough, count this wakeup as a hit, since it is likely that
		 * the tick has woken up the CPU because an expected intercept
		 * was not there.  Otherwise, one of the intercepts may have
		 * been incidentally preceded by the tick wakeup.
		 */
		if (3 * cpu_data->tick_intercepts < 2 * total) {
			cpu_data->state_bins[idx_timer].hits += PULSE;
			return;
		}
	}

	/*
	 * If the measured idle duration (adjusted for the entered state exit
	 * latency) falls into the same bin as the sleep length and the latter
	 * is less than the "raw" measured idle duration (so the wakeup appears
	 * to have occurred after the anticipated timer event), this is a "hit",
	 * so update the "hits" metric for that bin.
	 *
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
	if (idx_timer == idx_duration &&
	    cpu_data->sleep_length_ns - measured_ns < lat_ns / 2) {
		cpu_data->state_bins[idx_timer].hits += PULSE;
	} else {
		cpu_data->state_bins[idx_duration].intercepts += PULSE;
		if (measured_ns <= TICK_NSEC)
			cpu_data->tick_intercepts += PULSE;
	}
}
277 
278 /**
279  * teo_find_shallower_state - Find shallower idle state matching given duration.
280  * @drv: cpuidle driver containing state data.
281  * @dev: Target CPU.
282  * @state_idx: Index of the capping idle state.
283  * @duration_ns: Idle duration value to match.
284  */
285 static int teo_find_shallower_state(struct cpuidle_driver *drv,
286 				    struct cpuidle_device *dev, int state_idx,
287 				    s64 duration_ns)
288 {
289 	int i;
290 
291 	for (i = state_idx - 1; i >= 0; i--) {
292 		if (dev->states_usage[i].disable)
293 			continue;
294 
295 		state_idx = i;
296 		if (drv->states[i].target_residency_ns <= duration_ns)
297 			break;
298 	}
299 	return state_idx;
300 }
301 
/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 *
 * Returns the index of the idle state to enter.  @stop_tick is cleared
 * whenever the scheduler tick should be kept running.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      bool *stop_tick)
{
	struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	/* Fallback if tick_nohz_get_sleep_length() is not invoked below. */
	ktime_t delta_tick = TICK_NSEC / 2;
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int intercept_max = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int intercept_max_idx = -1;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
	s64 duration_ns;
	int i;

	/* Fold the outcome of the previous idle period into the metrics. */
	if (dev->last_state_idx >= 0) {
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

	/*
	 * Set the sleep length to infinity in case the invocation of
	 * tick_nohz_get_sleep_length() below is skipped, in which case it won't
	 * be known whether or not the subsequent wakeup is caused by a timer.
	 * It is generally fine to count the wakeup as an intercept then, except
	 * for the cases when the CPU is mostly woken up by timers and there may
	 * be opportunities to ask for a deeper idle state when no imminent
	 * timers are scheduled which may be missed.
	 */
	cpu_data->sleep_length_ns = KTIME_MAX;

	/* Check if there is any choice in the first place. */
	if (drv->state_count < 2) {
		idx = 0;
		goto out_tick;
	}

	if (!dev->states_usage[0].disable)
		idx = 0;

	/*
	 * Compute the sums of metrics for early wakeup pattern detection and
	 * look for the state bin with the maximum intercepts metric below the
	 * deepest enabled one (if there are multiple states with the maximum
	 * intercepts metric, choose the one with the highest index).
	 */
	for (i = 1; i < drv->state_count; i++) {
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
		unsigned int prev_intercepts = prev_bin->intercepts;
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * Update the sums of idle state metrics for all of the states
		 * shallower than the current one.
		 */
		hit_sum += prev_bin->hits;
		intercept_sum += prev_intercepts;
		/*
		 * Check if this is the bin with the maximum number of
		 * intercepts so far and in that case update the index of
		 * the state with the maximum intercepts metric.
		 */
		if (prev_intercepts >= intercept_max) {
			intercept_max = prev_intercepts;
			intercept_max_idx = i - 1;
		}

		if (dev->states_usage[i].disable)
			continue;

		if (idx < 0)
			idx0 = i; /* first enabled state */

		idx = i;

		if (s->exit_latency_ns <= latency_req)
			constraint_idx = i;

		/* Save the sums for the current state. */
		idx_intercept_sum = intercept_sum;
		idx_hit_sum = hit_sum;
	}

	/* Avoid unnecessary overhead. */
	if (idx < 0) {
		idx = 0; /* No states enabled, must use 0. */
		goto out_tick;
	}

	if (idx == idx0) {
		/*
		 * Only one idle state is enabled, so use it, but do not
		 * allow the tick to be stopped if it is shallow enough.
		 */
		duration_ns = drv->states[idx].target_residency_ns;
		goto end;
	}

	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
	 * all of the deeper states, a shallower idle state is likely to be a
	 * better choice.
	 */
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
		int min_idx = idx0;

		if (tick_nohz_tick_stopped()) {
			/*
			 * Look for the shallowest idle state below the current
			 * candidate one whose target residency is at least
			 * equal to the tick period length.
			 */
			while (min_idx < idx &&
			       drv->states[min_idx].target_residency_ns < TICK_NSEC)
				min_idx++;

			/*
			 * Avoid selecting a state with a lower index, but with
			 * the same target residency as the current candidate
			 * one.
			 */
			if (drv->states[min_idx].target_residency_ns ==
					drv->states[idx].target_residency_ns)
				goto constraint;
		}

		/*
		 * If the minimum state index is greater than or equal to the
		 * index of the state with the maximum intercepts metric and
		 * the corresponding state is enabled, there is no need to look
		 * at the deeper states.
		 */
		if (min_idx >= intercept_max_idx &&
		    !dev->states_usage[min_idx].disable) {
			idx = min_idx;
			goto constraint;
		}

		/*
		 * Look for the deepest enabled idle state, at most as deep as
		 * the one with the maximum intercepts metric, whose target
		 * residency had not been greater than the idle duration in over
		 * a half of the relevant cases in the past.
		 *
		 * Take the possible duration limitation present if the tick
		 * has been stopped already into account.
		 */
		for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) {
			intercept_sum += cpu_data->state_bins[i].intercepts;

			if (dev->states_usage[i].disable)
				continue;

			idx = i;
			if (2 * intercept_sum > idx_intercept_sum &&
			    i <= intercept_max_idx)
				break;
		}
	}

constraint:
	/*
	 * If there is a latency constraint, it may be necessary to select an
	 * idle state shallower than the current candidate one.
	 */
	if (idx > constraint_idx)
		idx = constraint_idx;

	/*
	 * If either the candidate state is state 0 or its target residency is
	 * low enough, there is basically nothing more to do, but if the sleep
	 * length is not updated, the subsequent wakeup will be counted as an
	 * "intercept" which may be problematic in the cases when timer wakeups
	 * are dominant.  Namely, it may effectively prevent deeper idle states
	 * from being selected at one point even if no imminent timers are
	 * scheduled.
	 *
	 * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one
	 * CPU are unlikely (user space has a default 50 us slack value for
	 * hrtimers and there are relatively few timers with a lower deadline
	 * value in the kernel), and even if they did happen, the potential
	 * benefit from using a deep idle state in that case would be
	 * questionable anyway for latency reasons.  Thus if the measured idle
	 * duration falls into that range in the majority of cases, assume
	 * non-timer wakeups to be dominant and skip updating the sleep length
	 * to reduce latency.
	 *
	 * Also, if the latency constraint is sufficiently low, it will force
	 * shallow idle states regardless of the wakeup type, so the sleep
	 * length need not be known in that case.
	 */
	if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) &&
	    (2 * cpu_data->short_idles >= cpu_data->total ||
	     latency_req < LATENCY_THRESHOLD_NS))
		goto out_tick;

	/* Record the sleep length so teo_update() can classify the wakeup. */
	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

	if (!idx)
		goto out_tick;

	/*
	 * If the closest expected timer is before the target residency of the
	 * candidate state, a shallower one needs to be found.
	 */
	if (drv->states[idx].target_residency_ns > duration_ns)
		idx = teo_find_shallower_state(drv, dev, idx, duration_ns);

	/*
	 * If the selected state's target residency is below the tick length
	 * and intercepts occurring before the tick length are the majority of
	 * total wakeup events, do not stop the tick.
	 */
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
	    3 * cpu_data->tick_intercepts >= 2 * cpu_data->total)
		duration_ns = TICK_NSEC / 2;

end:
	/*
	 * Allow the tick to be stopped unless the selected state is a polling
	 * one or the expected idle duration is shorter than the tick period
	 * length.
	 */
	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
		return idx;

	/*
	 * The tick is not going to be stopped, so if the target residency of
	 * the state to be returned is not within the time till the closest
	 * timer including the tick, try to correct that.
	 */
	if (idx > idx0 &&
	    drv->states[idx].target_residency_ns > delta_tick)
		idx = teo_find_shallower_state(drv, dev, idx, delta_tick);

out_tick:
	*stop_tick = false;
	return idx;
}
553 
554 /**
555  * teo_reflect - Note that governor data for the CPU need to be updated.
556  * @dev: Target CPU.
557  * @state: Entered state.
558  */
559 static void teo_reflect(struct cpuidle_device *dev, int state)
560 {
561 	struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus);
562 
563 	cpu_data->tick_wakeup = tick_nohz_idle_got_tick();
564 
565 	dev->last_state_idx = state;
566 }
567 
568 /**
569  * teo_enable_device - Initialize the governor's data for the target CPU.
570  * @drv: cpuidle driver (not used).
571  * @dev: Target CPU.
572  */
573 static int teo_enable_device(struct cpuidle_driver *drv,
574 			     struct cpuidle_device *dev)
575 {
576 	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
577 
578 	memset(cpu_data, 0, sizeof(*cpu_data));
579 
580 	return 0;
581 }
582 
/* Governor callbacks registered with the cpuidle core. */
static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};
590 
/* Register the TEO governor with the cpuidle framework. */
static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

/* Registered at postcore initcall time. */
postcore_initcall(teo_governor_init);
597