1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Timer events oriented CPU idle governor 4 * 5 * Copyright (C) 2018 - 2021 Intel Corporation 6 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> 7 */ 8 9 /** 10 * DOC: teo-description 11 * 12 * The idea of this governor is based on the observation that on many systems 13 * timer events are two or more orders of magnitude more frequent than any 14 * other interrupts, so they are likely to be the most significant cause of CPU 15 * wakeups from idle states. Moreover, information about what happened in the 16 * (relatively recent) past can be used to estimate whether or not the deepest 17 * idle state with target residency within the (known) time till the closest 18 * timer event, referred to as the sleep length, is likely to be suitable for 19 * the upcoming CPU idle period and, if not, then which of the shallower idle 20 * states to choose instead of it. 21 * 22 * Of course, non-timer wakeup sources are more important in some use cases 23 * which can be covered by taking a few most recent idle time intervals of the 24 * CPU into account. However, even in that context it is not necessary to 25 * consider idle duration values greater than the sleep length, because the 26 * closest timer will ultimately wake up the CPU anyway unless it is woken up 27 * earlier. 28 * 29 * Thus this governor estimates whether or not the prospective idle duration of 30 * a CPU is likely to be significantly shorter than the sleep length and selects 31 * an idle state for it accordingly. 32 * 33 * The computations carried out by this governor are based on using bins whose 34 * boundaries are aligned with the target residency parameter values of the CPU 35 * idle states provided by the %CPUIdle driver in the ascending order. That is, 36 * the first bin spans from 0 up to, but not including, the target residency of 37 * the second idle state (idle state 1), the second bin spans from the target 38 * residency of idle state 1 up to, but not including, the target residency of 39 * idle state 2, the third bin spans from the target residency of idle state 2 40 * up to, but not including, the target residency of idle state 3 and so on. 41 * The last bin spans from the target residency of the deepest idle state 42 * supplied by the driver to infinity. 43 * 44 * Two metrics called "hits" and "intercepts" are associated with each bin. 45 * They are updated every time before selecting an idle state for the given CPU 46 * in accordance with what happened last time. 47 * 48 * The "hits" metric reflects the relative frequency of situations in which the 49 * sleep length and the idle duration measured after CPU wakeup fall into the 50 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep 51 * length). In turn, the "intercepts" metric reflects the relative frequency of 52 * situations in which the measured idle duration is so much shorter than the 53 * sleep length that the bin it falls into corresponds to an idle state 54 * shallower than the one whose bin is fallen into by the sleep length (these 55 * situations are referred to as "intercepts" below). 56 * 57 * In order to select an idle state for a CPU, the governor takes the following 58 * steps (modulo the possible latency constraint that must be taken into account 59 * too): 60 * 61 * 1. Find the deepest CPU idle state whose target residency does not exceed 62 * the current sleep length (the candidate idle state) and compute 2 sums as 63 * follows: 64 * 65 * - The sum of the "hits" and "intercepts" metrics for the candidate state 66 * and all of the deeper idle states (it represents the cases in which the 67 * CPU was idle long enough to avoid being intercepted if the sleep length 68 * had been equal to the current one). 69 * 70 * - The sum of the "intercepts" metrics for all of the idle states shallower 71 * than the candidate one (it represents the cases in which the CPU was not 72 * idle long enough to avoid being intercepted if the sleep length had been 73 * equal to the current one). 74 * 75 * 2. If the second sum is greater than the first one the CPU is likely to wake 76 * up early, so look for an alternative idle state to select. 77 * 78 * - Traverse the idle states shallower than the candidate one in the 79 * descending order. 80 * 81 * - For each of them compute the sum of the "intercepts" metrics over all 82 * of the idle states between it and the candidate one (including the 83 * former and excluding the latter). 84 * 85 * - If each of these sums that needs to be taken into account (because the 86 * check related to it has indicated that the CPU is likely to wake up 87 * early) is greater than a half of the corresponding sum computed in step 88 * 1 (which means that the target residency of the state in question had 89 * not exceeded the idle duration in over a half of the relevant cases), 90 * select the given idle state instead of the candidate one. 91 * 92 * 3. By default, select the candidate state. 93 */ 94 95 #include <linux/cpuidle.h> 96 #include <linux/jiffies.h> 97 #include <linux/kernel.h> 98 #include <linux/sched/clock.h> 99 #include <linux/tick.h> 100 101 #include "gov.h" 102 103 /* 104 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value 105 * is used for decreasing metrics on a regular basis. 106 */ 107 #define PULSE 1024 108 #define DECAY_SHIFT 3 109 110 /** 111 * struct teo_bin - Metrics used by the TEO cpuidle governor. 112 * @intercepts: The "intercepts" metric. 113 * @hits: The "hits" metric. 114 */ 115 struct teo_bin { 116 unsigned int intercepts; 117 unsigned int hits; 118 }; 119 120 /** 121 * struct teo_cpu - CPU data used by the TEO cpuidle governor. 122 * @time_span_ns: Time between idle state selection and post-wakeup update. 123 * @sleep_length_ns: Time till the closest timer event (at the selection time). 124 * @state_bins: Idle state data bins for this CPU. 125 * @total: Grand total of the "intercepts" and "hits" metrics for all bins. 126 * @tick_hits: Number of "hits" after TICK_NSEC. 127 */ 128 struct teo_cpu { 129 s64 time_span_ns; 130 s64 sleep_length_ns; 131 struct teo_bin state_bins[CPUIDLE_STATE_MAX]; 132 unsigned int total; 133 unsigned int tick_hits; 134 }; 135 136 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); 137 138 /** 139 * teo_update - Update CPU metrics after wakeup. 140 * @drv: cpuidle driver containing state data. 141 * @dev: Target CPU. 142 */ 143 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) 144 { 145 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 146 int i, idx_timer = 0, idx_duration = 0; 147 s64 target_residency_ns; 148 u64 measured_ns; 149 150 if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { 151 /* 152 * One of the safety nets has triggered or the wakeup was close 153 * enough to the closest timer event expected at the idle state 154 * selection time to be discarded. 155 */ 156 measured_ns = U64_MAX; 157 } else { 158 u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; 159 160 /* 161 * The computations below are to determine whether or not the 162 * (saved) time till the next timer event and the measured idle 163 * duration fall into the same "bin", so use last_residency_ns 164 * for that instead of time_span_ns which includes the cpuidle 165 * overhead. 166 */ 167 measured_ns = dev->last_residency_ns; 168 /* 169 * The delay between the wakeup and the first instruction 170 * executed by the CPU is not likely to be worst-case every 171 * time, so take 1/2 of the exit latency as a very rough 172 * approximation of the average of it. 173 */ 174 if (measured_ns >= lat_ns) 175 measured_ns -= lat_ns / 2; 176 else 177 measured_ns /= 2; 178 } 179 180 cpu_data->total = 0; 181 182 /* 183 * Decay the "hits" and "intercepts" metrics for all of the bins and 184 * find the bins that the sleep length and the measured idle duration 185 * fall into. 186 */ 187 for (i = 0; i < drv->state_count; i++) { 188 struct teo_bin *bin = &cpu_data->state_bins[i]; 189 190 bin->hits -= bin->hits >> DECAY_SHIFT; 191 bin->intercepts -= bin->intercepts >> DECAY_SHIFT; 192 193 cpu_data->total += bin->hits + bin->intercepts; 194 195 target_residency_ns = drv->states[i].target_residency_ns; 196 197 if (target_residency_ns <= cpu_data->sleep_length_ns) { 198 idx_timer = i; 199 if (target_residency_ns <= measured_ns) 200 idx_duration = i; 201 } 202 } 203 204 /* 205 * If the deepest state's target residency is below the tick length, 206 * make a record of it to help teo_select() decide whether or not 207 * to stop the tick. This effectively adds an extra hits-only bin 208 * beyond the last state-related one. 209 */ 210 if (target_residency_ns < TICK_NSEC) { 211 cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; 212 213 cpu_data->total += cpu_data->tick_hits; 214 215 if (TICK_NSEC <= cpu_data->sleep_length_ns) { 216 idx_timer = drv->state_count; 217 if (TICK_NSEC <= measured_ns) { 218 cpu_data->tick_hits += PULSE; 219 goto end; 220 } 221 } 222 } 223 224 /* 225 * If the measured idle duration falls into the same bin as the sleep 226 * length, this is a "hit", so update the "hits" metric for that bin. 227 * Otherwise, update the "intercepts" metric for the bin fallen into by 228 * the measured idle duration. 229 */ 230 if (idx_timer == idx_duration) 231 cpu_data->state_bins[idx_timer].hits += PULSE; 232 else 233 cpu_data->state_bins[idx_duration].intercepts += PULSE; 234 235 end: 236 cpu_data->total += PULSE; 237 } 238 239 static bool teo_state_ok(int i, struct cpuidle_driver *drv) 240 { 241 return !tick_nohz_tick_stopped() || 242 drv->states[i].target_residency_ns >= TICK_NSEC; 243 } 244 245 /** 246 * teo_find_shallower_state - Find shallower idle state matching given duration. 247 * @drv: cpuidle driver containing state data. 248 * @dev: Target CPU. 249 * @state_idx: Index of the capping idle state. 250 * @duration_ns: Idle duration value to match. 251 * @no_poll: Don't consider polling states. 252 */ 253 static int teo_find_shallower_state(struct cpuidle_driver *drv, 254 struct cpuidle_device *dev, int state_idx, 255 s64 duration_ns, bool no_poll) 256 { 257 int i; 258 259 for (i = state_idx - 1; i >= 0; i--) { 260 if (dev->states_usage[i].disable || 261 (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) 262 continue; 263 264 state_idx = i; 265 if (drv->states[i].target_residency_ns <= duration_ns) 266 break; 267 } 268 return state_idx; 269 } 270 271 /** 272 * teo_select - Selects the next idle state to enter. 273 * @drv: cpuidle driver containing state data. 274 * @dev: Target CPU. 275 * @stop_tick: Indication on whether or not to stop the scheduler tick. 276 */ 277 static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, 278 bool *stop_tick) 279 { 280 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 281 s64 latency_req = cpuidle_governor_latency_req(dev->cpu); 282 ktime_t delta_tick = TICK_NSEC / 2; 283 unsigned int tick_intercept_sum = 0; 284 unsigned int idx_intercept_sum = 0; 285 unsigned int intercept_sum = 0; 286 unsigned int idx_hit_sum = 0; 287 unsigned int hit_sum = 0; 288 int constraint_idx = 0; 289 int idx0 = 0, idx = -1; 290 int prev_intercept_idx; 291 s64 duration_ns; 292 int i; 293 294 if (dev->last_state_idx >= 0) { 295 teo_update(drv, dev); 296 dev->last_state_idx = -1; 297 } 298 299 cpu_data->time_span_ns = local_clock(); 300 /* 301 * Set the expected sleep length to infinity in case of an early 302 * return. 303 */ 304 cpu_data->sleep_length_ns = KTIME_MAX; 305 306 /* Check if there is any choice in the first place. */ 307 if (drv->state_count < 2) { 308 idx = 0; 309 goto out_tick; 310 } 311 312 if (!dev->states_usage[0].disable) 313 idx = 0; 314 315 /* Compute the sums of metrics for early wakeup pattern detection. */ 316 for (i = 1; i < drv->state_count; i++) { 317 struct teo_bin *prev_bin = &cpu_data->state_bins[i-1]; 318 struct cpuidle_state *s = &drv->states[i]; 319 320 /* 321 * Update the sums of idle state mertics for all of the states 322 * shallower than the current one. 323 */ 324 intercept_sum += prev_bin->intercepts; 325 hit_sum += prev_bin->hits; 326 327 if (dev->states_usage[i].disable) 328 continue; 329 330 if (idx < 0) 331 idx0 = i; /* first enabled state */ 332 333 idx = i; 334 335 if (s->exit_latency_ns <= latency_req) 336 constraint_idx = i; 337 338 /* Save the sums for the current state. */ 339 idx_intercept_sum = intercept_sum; 340 idx_hit_sum = hit_sum; 341 } 342 343 /* Avoid unnecessary overhead. */ 344 if (idx < 0) { 345 idx = 0; /* No states enabled, must use 0. */ 346 goto out_tick; 347 } 348 349 if (idx == idx0) { 350 /* 351 * Only one idle state is enabled, so use it, but do not 352 * allow the tick to be stopped it is shallow enough. 353 */ 354 duration_ns = drv->states[idx].target_residency_ns; 355 goto end; 356 } 357 358 tick_intercept_sum = intercept_sum + 359 cpu_data->state_bins[drv->state_count-1].intercepts; 360 361 /* 362 * If the sum of the intercepts metric for all of the idle states 363 * shallower than the current candidate one (idx) is greater than the 364 * sum of the intercepts and hits metrics for the candidate state and 365 * all of the deeper states a shallower idle state is likely to be a 366 * better choice. 367 */ 368 prev_intercept_idx = idx; 369 if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { 370 int first_suitable_idx = idx; 371 372 /* 373 * Look for the deepest idle state whose target residency had 374 * not exceeded the idle duration in over a half of the relevant 375 * cases in the past. 376 * 377 * Take the possible duration limitation present if the tick 378 * has been stopped already into account. 379 */ 380 intercept_sum = 0; 381 382 for (i = idx - 1; i >= 0; i--) { 383 struct teo_bin *bin = &cpu_data->state_bins[i]; 384 385 intercept_sum += bin->intercepts; 386 387 if (2 * intercept_sum > idx_intercept_sum) { 388 /* 389 * Use the current state unless it is too 390 * shallow or disabled, in which case take the 391 * first enabled state that is deep enough. 392 */ 393 if (teo_state_ok(i, drv) && 394 !dev->states_usage[i].disable) 395 idx = i; 396 else 397 idx = first_suitable_idx; 398 399 break; 400 } 401 402 if (dev->states_usage[i].disable) 403 continue; 404 405 if (!teo_state_ok(i, drv)) { 406 /* 407 * The current state is too shallow, but if an 408 * alternative candidate state has been found, 409 * it may still turn out to be a better choice. 410 */ 411 if (first_suitable_idx != idx) 412 continue; 413 414 break; 415 } 416 417 first_suitable_idx = i; 418 } 419 } 420 if (!idx && prev_intercept_idx) { 421 /* 422 * We have to query the sleep length here otherwise we don't 423 * know after wakeup if our guess was correct. 424 */ 425 duration_ns = tick_nohz_get_sleep_length(&delta_tick); 426 cpu_data->sleep_length_ns = duration_ns; 427 goto out_tick; 428 } 429 430 /* 431 * If there is a latency constraint, it may be necessary to select an 432 * idle state shallower than the current candidate one. 433 */ 434 if (idx > constraint_idx) 435 idx = constraint_idx; 436 437 /* 438 * Skip the timers check if state 0 is the current candidate one, 439 * because an immediate non-timer wakeup is expected in that case. 440 */ 441 if (!idx) 442 goto out_tick; 443 444 /* 445 * If state 0 is a polling one, check if the target residency of 446 * the current candidate state is low enough and skip the timers 447 * check in that case too. 448 */ 449 if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && 450 drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) 451 goto out_tick; 452 453 duration_ns = tick_nohz_get_sleep_length(&delta_tick); 454 cpu_data->sleep_length_ns = duration_ns; 455 456 /* 457 * If the closest expected timer is before the target residency of the 458 * candidate state, a shallower one needs to be found. 459 */ 460 if (drv->states[idx].target_residency_ns > duration_ns) { 461 i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); 462 if (teo_state_ok(i, drv)) 463 idx = i; 464 } 465 466 /* 467 * If the selected state's target residency is below the tick length 468 * and intercepts occurring before the tick length are the majority of 469 * total wakeup events, do not stop the tick. 470 */ 471 if (drv->states[idx].target_residency_ns < TICK_NSEC && 472 tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) 473 duration_ns = TICK_NSEC / 2; 474 475 end: 476 /* 477 * Allow the tick to be stopped unless the selected state is a polling 478 * one or the expected idle duration is shorter than the tick period 479 * length. 480 */ 481 if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && 482 duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped()) 483 return idx; 484 485 /* 486 * The tick is not going to be stopped, so if the target residency of 487 * the state to be returned is not within the time till the closest 488 * timer including the tick, try to correct that. 489 */ 490 if (idx > idx0 && 491 drv->states[idx].target_residency_ns > delta_tick) 492 idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); 493 494 out_tick: 495 *stop_tick = false; 496 return idx; 497 } 498 499 /** 500 * teo_reflect - Note that governor data for the CPU need to be updated. 501 * @dev: Target CPU. 502 * @state: Entered state. 503 */ 504 static void teo_reflect(struct cpuidle_device *dev, int state) 505 { 506 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 507 508 dev->last_state_idx = state; 509 /* 510 * If the wakeup was not "natural", but triggered by one of the safety 511 * nets, assume that the CPU might have been idle for the entire sleep 512 * length time. 513 */ 514 if (dev->poll_time_limit || 515 (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { 516 dev->poll_time_limit = false; 517 cpu_data->time_span_ns = cpu_data->sleep_length_ns; 518 } else { 519 cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; 520 } 521 } 522 523 /** 524 * teo_enable_device - Initialize the governor's data for the target CPU. 525 * @drv: cpuidle driver (not used). 526 * @dev: Target CPU. 527 */ 528 static int teo_enable_device(struct cpuidle_driver *drv, 529 struct cpuidle_device *dev) 530 { 531 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); 532 533 memset(cpu_data, 0, sizeof(*cpu_data)); 534 535 return 0; 536 } 537 538 static struct cpuidle_governor teo_governor = { 539 .name = "teo", 540 .rating = 19, 541 .enable = teo_enable_device, 542 .select = teo_select, 543 .reflect = teo_reflect, 544 }; 545 546 static int __init teo_governor_init(void) 547 { 548 return cpuidle_register_governor(&teo_governor); 549 } 550 551 postcore_initcall(teo_governor_init); 552