// SPDX-License-Identifier: GPL-2.0-only
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>

#include "gov.h"

#define BUCKETS 6
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING (50000 * NSEC_PER_USEC)

/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 2 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Latency tolerance (from the PM QoS infrastructure)
 * These two factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * that is based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor; however, it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration: if we expect 500 milliseconds of idle time, the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time.
 * (As a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this.)
 *
 * For this reason we keep an array of 6 independent factors, which gets
 * indexed based on the magnitude of the expected duration.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: we track the duration of the last 8
 * intervals and if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as the prediction.
 */
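
/*
 * Illustrative sketch, not used by the governor itself: the correction
 * factor described above is a fixed-point ratio scaled by RESOLUTION * DECAY,
 * so applying it to a raw "next timer" estimate is a multiply followed by a
 * rounded divide.  For example, a factor of 4096 (i.e. 0.5) turns a 1 ms
 * next-timer estimate into a ~500 us prediction.  The helper name below is
 * this example's own; menu_select() open-codes the same arithmetic.
 */
static inline u64 __maybe_unused example_apply_correction(u64 next_timer_ns,
							   unsigned int factor)
{
	/* Returns the corrected prediction in microseconds, rounded to nearest. */
	return div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
		       next_timer_ns * factor,
		       RESOLUTION * DECAY * NSEC_PER_USEC);
}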

struct menu_device {
	int		needs_update;
	int		tick_wakeup;

	u64		next_timer_ns;
	unsigned int	bucket;
	unsigned int	correction_factor[BUCKETS];
	unsigned int	intervals[INTERVALS];
	int		interval_ptr;
};

static inline int which_bucket(u64 duration_ns)
{
	int bucket = 0;

	if (duration_ns < 10ULL * NSEC_PER_USEC)
		return bucket;
	if (duration_ns < 100ULL * NSEC_PER_USEC)
		return bucket + 1;
	if (duration_ns < 1000ULL * NSEC_PER_USEC)
		return bucket + 2;
	if (duration_ns < 10000ULL * NSEC_PER_USEC)
		return bucket + 3;
	if (duration_ns < 100000ULL * NSEC_PER_USEC)
		return bucket + 4;
	return bucket + 5;
}

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
static unsigned int get_typical_interval(struct menu_device *data)
{
	int i, divisor;
	unsigned int min, max, thresh, avg;
	uint64_t sum, variance;

	thresh = INT_MAX; /* Discard outliers above this value */

again:

	/* First calculate the average of past intervals */
	min = UINT_MAX;
	max = 0;
	sum = 0;
	divisor = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			sum += value;
			divisor++;
			if (value > max)
				max = value;

			if (value < min)
				min = value;
		}
	}

	if (!max)
		return UINT_MAX;

	if (divisor == INTERVALS)
		avg = sum >> INTERVAL_SHIFT;
	else
		avg = div_u64(sum, divisor);

	/* Then try to determine variance */
	variance = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			int64_t diff = (int64_t)value - avg;
			variance += diff * diff;
		}
	}
	if (divisor == INTERVALS)
		variance >>= INTERVAL_SHIFT;
	else
		do_div(variance, divisor);

	/*
	 * The typical interval is obtained when standard deviation is
	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
	 * deviation is small compared to the average interval (avg >
	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
	 * UINT_MAX aka U32_MAX, so computing its square does not
	 * overflow a u64. We simply reject this candidate average if
	 * the standard deviation is greater than 715 s (which is
	 * rather unlikely).
	 *
	 * Use this result only if there is no timer to wake us up sooner.
	 */
	if (likely(variance <= U64_MAX/36)) {
		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
		    || variance <= 400) {
			return avg;
		}
	}

	/*
	 * If we have outliers to the upside in our distribution, discard
	 * those by setting the threshold to exclude these outliers, then
	 * calculate the average and standard deviation again. Once we get
	 * down to the bottom 3/4 of our samples, stop excluding samples.
	 *
	 * This can deal with workloads that have long pauses interspersed
	 * with sporadic activity with a bunch of short pauses.
	 */
	if ((divisor * 4) <= INTERVALS * 3)
		return UINT_MAX;

	thresh = max - 1;
	goto again;
}
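
/*
 * Illustrative sketch of the acceptance test used by get_typical_interval()
 * above: a candidate average is trusted when the spread of the samples is
 * small in absolute terms (stddev <= 20 us, i.e. variance <= 400 us^2), or
 * small relative to the average (avg > 6 * stddev, i.e. avg^2 > 36 * variance)
 * while at least 3/4 of the samples were kept.  The helper name and its
 * signature are this example's own; avg is assumed to fit in 32 bits, as
 * above, so its square cannot overflow a u64.
 */
static inline bool __maybe_unused example_interval_is_typical(u64 avg,
							      u64 variance,
							      int divisor)
{
	/* A standard deviation above ~715 s is rejected outright. */
	if (variance > U64_MAX / 36)
		return false;

	return variance <= 400 ||
	       (avg * avg > 36 * variance && divisor * 4 >= INTERVALS * 3);
}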

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 * @stop_tick: indication on whether or not to stop the tick
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		       bool *stop_tick)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	u64 predicted_ns;
	ktime_t delta, delta_tick;
	int i, idx;

	if (data->needs_update) {
		menu_update(drv, dev);
		data->needs_update = 0;
	}

	/* Find the shortest expected idle interval. */
	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
		unsigned int timer_us;

		/* Determine the time till the closest timer. */
		delta = tick_nohz_get_sleep_length(&delta_tick);
		if (unlikely(delta < 0)) {
			delta = 0;
			delta_tick = 0;
		}

		data->next_timer_ns = delta;
		data->bucket = which_bucket(data->next_timer_ns);

		/* Apply the correction factor and round to the nearest microsecond. */
		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
					data->next_timer_ns *
						data->correction_factor[data->bucket],
				   RESOLUTION * DECAY * NSEC_PER_USEC);
		/* Use the lowest expected idle interval to pick the idle state. */
		predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
	} else {
		/*
		 * Because the next timer event is not going to be determined
		 * in this case, assume that without the tick the closest timer
		 * will be in the distant future and that the closest tick will
		 * occur after 1/2 of the tick period.
		 */
		data->next_timer_ns = KTIME_MAX;
		delta_tick = TICK_NSEC / 2;
		data->bucket = which_bucket(KTIME_MAX);
	}

	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
	    ((data->next_timer_ns < drv->states[1].target_residency_ns ||
	      latency_req < drv->states[1].exit_latency_ns) &&
	     !dev->states_usage[0].disable)) {
		/*
		 * In this case state[0] will be used no matter what, so return
		 * it right away and keep the tick running if state[0] is a
		 * polling one.
		 */
		*stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
		return 0;
	}

	if (tick_nohz_tick_stopped()) {
		/*
		 * If the tick is already stopped, the cost of possible short
		 * idle duration misprediction is much higher, because the CPU
		 * may be stuck in a shallow idle state for a long time as a
		 * result of it. In that case say we might mispredict and use
		 * the known time till the closest timer event for the idle
		 * state selection.
		 */
		if (predicted_ns < TICK_NSEC)
			predicted_ns = data->next_timer_ns;
	} else if (latency_req > predicted_ns) {
		latency_req = predicted_ns;
	}

	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
	idx = -1;
	for (i = 0; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		if (dev->states_usage[i].disable)
			continue;

		if (idx == -1)
			idx = i; /* first enabled state */

		if (s->target_residency_ns > predicted_ns) {
			/*
			 * Use a physical idle state, not busy polling, unless
			 * a timer is going to trigger soon enough.
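			 * That is, if the state picked so far is a polling
			 * one, prefer this deeper physical state whenever its
			 * exit latency fits the latency constraint and the
			 * next timer is known to be far enough away for its
			 * target residency to be met.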
			 */
			if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
			    s->exit_latency_ns <= latency_req &&
			    s->target_residency_ns <= data->next_timer_ns) {
				predicted_ns = s->target_residency_ns;
				idx = i;
				break;
			}
			if (predicted_ns < TICK_NSEC)
				break;

			if (!tick_nohz_tick_stopped()) {
				/*
				 * If the state selected so far is shallow,
				 * waking up early won't hurt, so retain the
				 * tick in that case and let the governor run
				 * again in the next iteration of the loop.
				 */
				predicted_ns = drv->states[idx].target_residency_ns;
				break;
			}

			/*
			 * If the state selected so far is shallow and this
			 * state's target residency matches the time till the
			 * closest timer event, select this one to avoid getting
			 * stuck in the shallow one for too long.
			 */
			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
			    s->target_residency_ns <= delta_tick)
				idx = i;

			return idx;
		}
		if (s->exit_latency_ns > latency_req)
			break;

		idx = i;
	}

	if (idx == -1)
		idx = 0; /* No states enabled. Must use 0. */

	/*
	 * Don't stop the tick if the selected state is a polling one or if the
	 * expected idle duration is shorter than the tick period length.
	 */
	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
		*stop_tick = false;

		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
			/*
			 * The tick is not going to be stopped and the target
			 * residency of the state to be returned is not within
			 * the time until the next timer event including the
			 * tick, so try to correct that.
			 */
			for (i = idx - 1; i >= 0; i--) {
				if (dev->states_usage[i].disable)
					continue;

				idx = i;
				if (drv->states[i].target_residency_ns <= delta_tick)
					break;
			}
		}
	}

	return idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 * the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);

	dev->last_state_idx = index;
	data->needs_update = 1;
	data->tick_wakeup = tick_nohz_idle_got_tick();
}
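
/*
 * Illustrative sketch of the exit-latency deduction performed by
 * menu_update() below: the measured idle time includes the exit latency,
 * which is subtracted when the measurement is clearly longer than it;
 * otherwise the state is assumed not to have been fully entered and the
 * measurement is simply halved.  The helper name is this example's own.
 */
static inline u64 __maybe_unused example_deduct_exit_latency(u64 measured_ns,
							      u64 exit_latency_ns)
{
	if (measured_ns > 2 * exit_latency_ns)
		return measured_ns - exit_latency_ns;

	return measured_ns / 2;
}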

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int last_idx = dev->last_state_idx;
	struct cpuidle_state *target = &drv->states[last_idx];
	u64 measured_ns;
	unsigned int new_factor;

	/*
	 * Try to figure out how much time passed between entry to low
	 * power state and occurrence of the wakeup event.
	 *
	 * If the entered idle state didn't support residency measurements,
	 * we use them anyway if they are short, and if long,
	 * truncate to the whole expected time.
	 *
	 * Any measured amount of time will include the exit latency.
	 * Since we are interested in when the wakeup began, not when it
	 * was completed, we must subtract the exit latency. However, if
	 * the measured amount of time is less than the exit latency,
	 * assume the state was never reached and the exit latency is 0.
	 */

	if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
		/*
		 * The nohz code said that there wouldn't be any events within
		 * the tick boundary (if the tick was stopped), but the idle
		 * duration predictor had a differing opinion. Since the CPU
		 * was woken up by a tick (that wasn't stopped after all), the
		 * predictor was not quite right, so assume that the CPU could
		 * have been idle long (but not forever) to help the idle
		 * duration predictor do a better job next time.
		 */
		measured_ns = 9 * MAX_INTERESTING / 10;
	} else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
		   dev->poll_time_limit) {
		/*
		 * The CPU exited the "polling" state due to a time limit, so
		 * the idle duration prediction leading to the selection of that
		 * state was inaccurate. If a better prediction had been made,
		 * the CPU might have been woken up from idle by the next timer.
		 * Assume that to be the case.
		 */
		measured_ns = data->next_timer_ns;
	} else {
		/* measured value */
		measured_ns = dev->last_residency_ns;

		/* Deduct exit latency */
		if (measured_ns > 2 * target->exit_latency_ns)
			measured_ns -= target->exit_latency_ns;
		else
			measured_ns /= 2;
	}

	/* Make sure our coefficients do not exceed unity */
	if (measured_ns > data->next_timer_ns)
		measured_ns = data->next_timer_ns;

	/* Update our correction ratio */
	new_factor = data->correction_factor[data->bucket];
	new_factor -= new_factor / DECAY;

	if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
		new_factor += div64_u64(RESOLUTION * measured_ns,
					data->next_timer_ns);
	else
		/*
		 * we were idle so long that we count it as a perfect
		 * prediction
		 */
		new_factor += RESOLUTION;

	/*
	 * We don't want 0 as factor; we always want at least
	 * a tiny bit of estimated time. Fortunately, due to rounding,
	 * new_factor will stay nonzero regardless of the measured_ns value
	 * and the compiler can eliminate this test as long as DECAY > 1.
	 */
	if (DECAY == 1 && unlikely(new_factor == 0))
		new_factor = 1;

	data->correction_factor[data->bucket] = new_factor;

	/* update the repeating-pattern data */
	data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
	if (data->interval_ptr >= INTERVALS)
		data->interval_ptr = 0;
}
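
/*
 * Illustrative sketch of the decaying-average update carried out by
 * menu_update() above, under the simplifying assumption that the measured
 * time is below MAX_INTERESTING and the next-timer estimate is nonzero.
 * With DECAY == 8 and RESOLUTION == 1024, repeatedly observing
 * measured == next_timer / 2 makes the factor converge to 4096, i.e. 0.5
 * in RESOLUTION * DECAY fixed point.  The helper name is this example's own.
 */
static inline unsigned int __maybe_unused
example_decay_correction(unsigned int factor, u64 measured_ns, u64 next_timer_ns)
{
	/* Age the previous value, then add the newly observed ratio. */
	factor -= factor / DECAY;
	factor += div64_u64(RESOLUTION * measured_ns, next_timer_ns);

	return factor;
}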

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
			      struct cpuidle_device *dev)
{
	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
	int i;

	memset(data, 0, sizeof(struct menu_device));

	/*
	 * If the correction factor is 0 (e.g. first time init or cpu hotplug
	 * etc), we actually want to start out with a unity factor.
	 */
	for (i = 0; i < BUCKETS; i++)
		data->correction_factor[i] = RESOLUTION * DECAY;

	return 0;
}

static struct cpuidle_governor menu_governor = {
	.name =		"menu",
	.rating =	20,
	.enable =	menu_enable_device,
	.select =	menu_select,
	.reflect =	menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
	return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);