// SPDX-License-Identifier: GPL-2.0-only
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>

#include "gov.h"

#define BUCKETS 6
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING (50000 * NSEC_PER_USEC)

/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are two decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Latency tolerance (from the PM QoS infrastructure)
 * These two factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimate is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to it, based on
 * historic behavior. For example, if in the past the actual idle duration
 * was always 50% of the time till the next timer event, the correction
 * factor will be 0.5.
 *
 * menu uses a running average for this correction factor, but it uses a
 * set of factors rather than a single one. This stems from the realization
 * that the ratio depends on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time, the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. (As a special twist, we consider every sleep
 * longer than 50 milliseconds as perfect; there are no power gains for
 * sleeping longer than this.)
 *
 * For this reason we keep an array of BUCKETS (6) independent factors,
 * indexed by the order of magnitude of the expected duration.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where the "next timer" is a completely unusable
 * predictor: those where the wakeup interval is fixed, for example due to
 * hardware interrupt mitigation or fixed transfer rate devices such as mice.
 * For these, we use a different predictor: we track the duration of the last
 * 8 intervals and, if the standard deviation of these 8 intervals is below a
 * threshold value, we use their average as the prediction.
 *
 */
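/*
 * Illustrative example (numbers are hypothetical): with RESOLUTION == 1024
 * and DECAY == 8, a stored correction factor of RESOLUTION * DECAY == 8192
 * means "trust the next timer event fully".  If the observed idle time in a
 * given bucket settles at about half of the next timer distance, the running
 * average converges towards 4096, and a timer that is 1000 us away yields a
 * prediction of roughly:
 *
 *	predicted_us ~= next_timer_us * factor / (RESOLUTION * DECAY)
 *	             ~= 1000 * 4096 / 8192 = 500
 */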

struct menu_device {
	int             needs_update;
	int             tick_wakeup;

	u64		next_timer_ns;
	unsigned int	bucket;
	unsigned int	correction_factor[BUCKETS];
	unsigned int	intervals[INTERVALS];
	int		interval_ptr;
};

static inline int which_bucket(u64 duration_ns)
{
	int bucket = 0;

	if (duration_ns < 10ULL * NSEC_PER_USEC)
		return bucket;
	if (duration_ns < 100ULL * NSEC_PER_USEC)
		return bucket + 1;
	if (duration_ns < 1000ULL * NSEC_PER_USEC)
		return bucket + 2;
	if (duration_ns < 10000ULL * NSEC_PER_USEC)
		return bucket + 3;
	if (duration_ns < 100000ULL * NSEC_PER_USEC)
		return bucket + 4;
	return bucket + 5;
}
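/*
 * Example: the bucket boundaries above are decades of the expected idle
 * duration, so which_bucket(5 * NSEC_PER_USEC) == 0,
 * which_bucket(250 * NSEC_PER_USEC) == 2 (100 us <= 250 us < 1000 us) and
 * anything of 100 ms or more lands in the last bucket (5).
 */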

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
static unsigned int get_typical_interval(struct menu_device *data)
{
	int i, divisor;
	unsigned int min, max, thresh, avg;
	uint64_t sum, variance;

	thresh = INT_MAX; /* Discard outliers above this value */

again:

	/* First calculate the average of past intervals */
	min = UINT_MAX;
	max = 0;
	sum = 0;
	divisor = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			sum += value;
			divisor++;
			if (value > max)
				max = value;

			if (value < min)
				min = value;
		}
	}

	if (!max)
		return UINT_MAX;

	if (divisor == INTERVALS)
		avg = sum >> INTERVAL_SHIFT;
	else
		avg = div_u64(sum, divisor);

	/* Then try to determine variance */
	variance = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			int64_t diff = (int64_t)value - avg;
			variance += diff * diff;
		}
	}
	if (divisor == INTERVALS)
		variance >>= INTERVAL_SHIFT;
	else
		do_div(variance, divisor);

	/*
	 * The typical interval is obtained when standard deviation is
	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
	 * deviation is small compared to the average interval (avg >
	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
	 * UINT_MAX aka U32_MAX, so computing its square does not
	 * overflow a u64. We simply reject this candidate average if
	 * the standard deviation is greater than 715 s (which is
	 * rather unlikely).
	 *
	 * Use this result only if there is no timer to wake us up sooner.
	 */
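	/*
	 * Worked example (hypothetical numbers): for eight recorded intervals
	 * averaging 500 us with a spread of roughly +/- 20 us, the variance
	 * comes out in the 150-400 us^2 range, so either avg^2 (250000) >
	 * 36 * variance or variance <= 400 holds and the 500 us average is
	 * returned as the typical interval.
	 */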
	if (likely(variance <= U64_MAX/36)) {
		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
							|| variance <= 400) {
			return avg;
		}
	}

	/*
	 * If we have outliers to the upside in our distribution, discard
	 * those by setting the threshold to exclude these outliers, then
	 * calculate the average and standard deviation again. Once we get
	 * down to the bottom 3/4 of our samples, stop excluding samples.
	 *
	 * This can deal with workloads that have long pauses interspersed
	 * with sporadic activity with a bunch of short pauses.
	 */
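	/*
	 * Example (hypothetical numbers): with seven intervals near 500 us
	 * and one 10 ms outlier, the first pass fails the variance test, the
	 * outlier is excluded via thresh, and the second pass over the
	 * remaining seven samples (still more than 3/4 of INTERVALS) accepts
	 * their ~500 us average.  At most two such rounds of exclusion can
	 * happen before UINT_MAX is returned.
	 */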
	if ((divisor * 4) <= INTERVALS * 3)
		return UINT_MAX;

	thresh = max - 1;
	goto again;
}

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 * @stop_tick: indication on whether or not to stop the tick
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		       bool *stop_tick)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	u64 predicted_ns;
	ktime_t delta, delta_tick;
	int i, idx;

	if (data->needs_update) {
		menu_update(drv, dev);
		data->needs_update = 0;
	}

	/* Find the shortest expected idle interval. */
	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
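	/*
	 * Note: if no repeating pattern is detected, get_typical_interval()
	 * returns UINT_MAX, so predicted_ns is about UINT_MAX us (roughly 71
	 * minutes) here, which is far above RESIDENCY_THRESHOLD_NS, and the
	 * timer-based estimate below ends up being used.
	 */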
	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
		unsigned int timer_us;

		/* Determine the time till the closest timer. */
		delta = tick_nohz_get_sleep_length(&delta_tick);
		if (unlikely(delta < 0)) {
			delta = 0;
			delta_tick = 0;
		}

		data->next_timer_ns = delta;
		data->bucket = which_bucket(data->next_timer_ns);

		/* Round to the nearest microsecond by adding half of the divisor. */
		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
					data->next_timer_ns *
						data->correction_factor[data->bucket],
				   RESOLUTION * DECAY * NSEC_PER_USEC);
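		/*
		 * Illustrative example (hypothetical values): with
		 * data->next_timer_ns == 1,000,000 (1 ms) and a correction
		 * factor of 4096 (half of unity),
		 * timer_us == (4,096,000 + 4,096,000,000) / 8,192,000 == 500.
		 */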
		/* Use the lowest expected idle interval to pick the idle state. */
		predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
	} else {
		/*
		 * Because the next timer event is not going to be determined
		 * in this case, assume that without the tick the closest timer
		 * will be in the distant future and that the closest tick will
		 * occur after 1/2 of the tick period.
		 */
		data->next_timer_ns = KTIME_MAX;
		delta_tick = TICK_NSEC / 2;
		data->bucket = which_bucket(KTIME_MAX);
	}

	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
	    ((data->next_timer_ns < drv->states[1].target_residency_ns ||
	      latency_req < drv->states[1].exit_latency_ns) &&
	     !dev->states_usage[0].disable)) {
		/*
		 * In this case state[0] will be used no matter what, so return
		 * it right away and keep the tick running if state[0] is a
		 * polling one.
		 */
		*stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
		return 0;
	}
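	/*
	 * Example (hypothetical numbers): if the next timer is 5 us away while
	 * states[1] needs 20 us of residency, or if the PM QoS latency limit
	 * is below states[1]'s exit latency, no state other than state[0] can
	 * pay off, hence the early return above.
	 */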

	if (tick_nohz_tick_stopped()) {
		/*
		 * If the tick is already stopped, the cost of possible short
		 * idle duration misprediction is much higher, because the CPU
		 * may be stuck in a shallow idle state for a long time as a
		 * result of it.  In that case say we might mispredict and use
		 * the known time till the closest timer event for the idle
		 * state selection.
		 */
		if (predicted_ns < TICK_NSEC)
			predicted_ns = data->next_timer_ns;
	} else if (latency_req > predicted_ns) {
		latency_req = predicted_ns;
	}

	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
	idx = -1;
	for (i = 0; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		if (dev->states_usage[i].disable)
			continue;

		if (idx == -1)
			idx = i; /* first enabled state */

		if (s->target_residency_ns > predicted_ns) {
			/*
			 * Use a physical idle state, not busy polling, unless
			 * a timer is going to trigger soon enough.
			 */
			if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
			    s->exit_latency_ns <= latency_req &&
			    s->target_residency_ns <= data->next_timer_ns) {
				predicted_ns = s->target_residency_ns;
				idx = i;
				break;
			}
			if (predicted_ns < TICK_NSEC)
				break;

			if (!tick_nohz_tick_stopped()) {
				/*
				 * If the state selected so far is shallow,
				 * waking up early won't hurt, so retain the
				 * tick in that case and let the governor run
				 * again in the next iteration of the loop.
				 */
				predicted_ns = drv->states[idx].target_residency_ns;
				break;
			}

			/*
			 * If the state selected so far is shallow and this
			 * state's target residency matches the time till the
			 * closest timer event, select this one to avoid getting
			 * stuck in the shallow one for too long.
			 */
			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
			    s->target_residency_ns <= delta_tick)
				idx = i;

			return idx;
		}
		if (s->exit_latency_ns > latency_req)
			break;

		idx = i;
	}
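	/*
	 * Example (hypothetical numbers): with predicted_ns == 120 us and
	 * latency_req == 50 us, the loop above keeps advancing idx through
	 * enabled states whose target residency is at most 120 us and whose
	 * exit latency is at most 50 us, so the deepest state meeting both
	 * bounds ends up selected.
	 */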

	if (idx == -1)
		idx = 0; /* No states enabled. Must use 0. */

	/*
	 * Don't stop the tick if the selected state is a polling one or if the
	 * expected idle duration is shorter than the tick period length.
	 */
	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
		*stop_tick = false;

		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
			/*
			 * The tick is not going to be stopped and the target
			 * residency of the state to be returned is not within
			 * the time until the next timer event including the
			 * tick, so try to correct that.
			 */
			for (i = idx - 1; i >= 0; i--) {
				if (dev->states_usage[i].disable)
					continue;

				idx = i;
				if (drv->states[i].target_residency_ns <= delta_tick)
					break;
			}
		}
	}

	return idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 *       the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);

	dev->last_state_idx = index;
	data->needs_update = 1;
	data->tick_wakeup = tick_nohz_idle_got_tick();
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int last_idx = dev->last_state_idx;
	struct cpuidle_state *target = &drv->states[last_idx];
	u64 measured_ns;
	unsigned int new_factor;

	/*
	 * Try to figure out how much time passed between entry to low
	 * power state and occurrence of the wakeup event.
	 *
	 * If the entered idle state didn't support residency measurements,
	 * we use the measured time anyway if it is short, and if it is long,
	 * we truncate it to the whole expected idle duration.
	 *
	 * Any measured amount of time will include the exit latency.
	 * Since we are interested in when the wakeup began, not when it
	 * was completed, we must subtract the exit latency. However, if
	 * the measured amount of time is less than the exit latency,
	 * assume the state was never reached and the exit latency is 0.
	 */

	if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
		/*
		 * The nohz code said that there wouldn't be any events within
		 * the tick boundary (if the tick was stopped), but the idle
		 * duration predictor had a differing opinion.  Since the CPU
		 * was woken up by a tick (that wasn't stopped after all), the
		 * predictor was not quite right, so assume that the CPU could
		 * have been idle long (but not forever) to help the idle
		 * duration predictor do a better job next time.
		 */
		measured_ns = 9 * MAX_INTERESTING / 10;
	} else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
		   dev->poll_time_limit) {
		/*
		 * The CPU exited the "polling" state due to a time limit, so
		 * the idle duration prediction leading to the selection of that
		 * state was inaccurate.  If a better prediction had been made,
		 * the CPU might have been woken up from idle by the next timer.
		 * Assume that to be the case.
		 */
		measured_ns = data->next_timer_ns;
	} else {
		/* measured value */
		measured_ns = dev->last_residency_ns;

		/* Deduct exit latency */
		if (measured_ns > 2 * target->exit_latency_ns)
			measured_ns -= target->exit_latency_ns;
		else
			measured_ns /= 2;
	}
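	/*
	 * Example (hypothetical numbers): with an exit latency of 10 us, a
	 * measured residency of 100 us becomes 90 us, while a measured 15 us
	 * (less than twice the exit latency) is simply halved to 7.5 us on
	 * the assumption that the state was not fully entered.
	 */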

	/* Make sure our coefficients do not exceed unity */
	if (measured_ns > data->next_timer_ns)
		measured_ns = data->next_timer_ns;

	/* Update our correction ratio */
	new_factor = data->correction_factor[data->bucket];
	new_factor -= new_factor / DECAY;

	if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
		new_factor += div64_u64(RESOLUTION * measured_ns,
					data->next_timer_ns);
	else
		/*
		 * we were idle so long that we count it as a perfect
		 * prediction
		 */
		new_factor += RESOLUTION;

	/*
	 * We don't want 0 as factor; we always want at least
	 * a tiny bit of estimated time. Fortunately, due to rounding,
	 * new_factor will stay nonzero regardless of measured_ns values
	 * and the compiler can eliminate this test as long as DECAY > 1.
	 */
	if (DECAY == 1 && unlikely(new_factor == 0))
		new_factor = 1;

	data->correction_factor[data->bucket] = new_factor;
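	/*
	 * Illustrative example (hypothetical numbers): with DECAY == 8 and
	 * RESOLUTION == 1024, starting from the unity factor 8192, a wakeup
	 * at half of the next timer distance yields
	 * new_factor == 8192 - 1024 + 512 == 7680; if that ratio persists,
	 * the factor converges towards 4096 (i.e. a 0.5 correction).
	 */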

	/* update the repeating-pattern data */
	data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
	if (data->interval_ptr >= INTERVALS)
		data->interval_ptr = 0;
}

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
				struct cpuidle_device *dev)
{
	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
	int i;

	memset(data, 0, sizeof(struct menu_device));

	/*
	 * If the correction factor is 0 (e.g. first-time init or CPU hotplug),
	 * we actually want to start out with a unity factor.
	 */
	for (i = 0; i < BUCKETS; i++)
		data->correction_factor[i] = RESOLUTION * DECAY;
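	/*
	 * Note: RESOLUTION * DECAY (8192) is the fixed point of the update in
	 * menu_update() for perfect predictions, i.e. it solves
	 * f == f - f / DECAY + RESOLUTION, so every bucket starts out
	 * predicting exactly the next timer distance.
	 */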

	return 0;
}

static struct cpuidle_governor menu_governor = {
	.name =		"menu",
	.rating =	20,
	.enable =	menu_enable_device,
	.select =	menu_select,
	.reflect =	menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
	return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);