// SPDX-License-Identifier: GPL-2.0
/*
 * Timer events oriented CPU idle governor
 *
 * Copyright (C) 2018 - 2021 Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

/**
 * DOC: teo-description
 *
 * The idea of this governor is based on the observation that on many systems
 * timer interrupts are two or more orders of magnitude more frequent than any
 * other interrupt types, so they are likely to dominate CPU wakeup patterns.
 * Moreover, in principle, the time when the next timer event is going to occur
 * can be determined at the idle state selection time, although doing that may
 * be costly, so it can be regarded as the most reliable source of information
 * for idle state selection.
 *
 * Of course, non-timer wakeup sources are more important in some use cases,
 * but even then it is generally unnecessary to consider idle duration values
 * greater than the time till the next timer event, referred to as the sleep
 * length in what follows, because the closest timer will ultimately wake up the
 * CPU anyway unless it is woken up earlier.
 *
 * However, since obtaining the sleep length may be costly, the governor first
 * checks if it can select a shallow idle state using wakeup pattern information
 * from recent times, in which case it can do without knowing the sleep length
 * at all.  For this purpose, it counts CPU wakeup events and looks for an idle
 * state whose target residency has not exceeded the idle duration (measured
 * after wakeup) in the majority of relevant recent cases.  If the target
 * residency of that state is small enough, it may be used right away and the
 * sleep length need not be determined.
 *
 * The computations carried out by this governor are based on using bins whose
 * boundaries are aligned with the target residency parameter values of the CPU
 * idle states provided by the %CPUIdle driver in the ascending order.  That is,
 * the first bin spans from 0 up to, but not including, the target residency of
 * the second idle state (idle state 1), the second bin spans from the target
 * residency of idle state 1 up to, but not including, the target residency of
 * idle state 2, the third bin spans from the target residency of idle state 2
 * up to, but not including, the target residency of idle state 3 and so on.
 * The last bin spans from the target residency of the deepest idle state
 * supplied by the driver to the scheduler tick period length or to infinity if
 * the tick period length is less than the target residency of that state.  In
 * the latter case, the governor also counts events with the measured idle
 * duration between the tick period length and the target residency of the
 * deepest idle state.
 *
 * Two metrics called "hits" and "intercepts" are associated with each bin.
 * They are updated every time before selecting an idle state for the given CPU
 * in accordance with what happened last time.
 *
 * The "hits" metric reflects the relative frequency of situations in which the
 * sleep length and the idle duration measured after CPU wakeup fall into the
 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
 * length).
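 *
 * As an illustrative example (the idle state parameters here are hypothetical
 * rather than taken from any particular driver), consider three idle states
 * with target residencies of 1 us, 100 us and 1000 us.  If the next timer
 * event is 600 us away, the sleep length falls into the bin of idle state 1
 * (from 100 us up to, but not including, 1000 us), and a wakeup with a
 * measured idle duration of 300 us falls into the same bin, so that wakeup
 * counts as a "hit".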
In turn, the "intercepts" metric reflects the relative frequency of 58 * non-timer wakeup events for which the measured idle duration falls into a bin 59 * that corresponds to an idle state shallower than the one whose bin is fallen 60 * into by the sleep length (these events are also referred to as "intercepts" 61 * below). 62 * 63 * In order to select an idle state for a CPU, the governor takes the following 64 * steps (modulo the possible latency constraint that must be taken into account 65 * too): 66 * 67 * 1. Find the deepest enabled CPU idle state (the candidate idle state) and 68 * compute 2 sums as follows: 69 * 70 * - The sum of the "hits" metric for all of the idle states shallower than 71 * the candidate one (it represents the cases in which the CPU was likely 72 * woken up by a timer). 73 * 74 * - The sum of the "intercepts" metric for all of the idle states shallower 75 * than the candidate one (it represents the cases in which the CPU was 76 * likely woken up by a non-timer wakeup source). 77 * 78 * 2. If the second sum computed in step 1 is greater than a half of the sum of 79 * both metrics for the candidate state bin and all subsequent bins(if any), 80 * a shallower idle state is likely to be more suitable, so look for it. 81 * 82 * - Traverse the enabled idle states shallower than the candidate one in the 83 * descending order. 84 * 85 * - For each of them compute the sum of the "intercepts" metrics over all 86 * of the idle states between it and the candidate one (including the 87 * former and excluding the latter). 88 * 89 * - If this sum is greater than a half of the second sum computed in step 1, 90 * use the given idle state as the new candidate one. 91 * 92 * 3. If the current candidate state is state 0 or its target residency is short 93 * enough, return it and prevent the scheduler tick from being stopped. 94 * 95 * 4. Obtain the sleep length value and check if it is below the target 96 * residency of the current candidate state, in which case a new shallower 97 * candidate state needs to be found, so look for it. 98 */ 99 100 #include <linux/cpuidle.h> 101 #include <linux/jiffies.h> 102 #include <linux/kernel.h> 103 #include <linux/sched/clock.h> 104 #include <linux/tick.h> 105 106 #include "gov.h" 107 108 /* 109 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value 110 * is used for decreasing metrics on a regular basis. 111 */ 112 #define PULSE 1024 113 #define DECAY_SHIFT 3 114 115 /** 116 * struct teo_bin - Metrics used by the TEO cpuidle governor. 117 * @intercepts: The "intercepts" metric. 118 * @hits: The "hits" metric. 119 */ 120 struct teo_bin { 121 unsigned int intercepts; 122 unsigned int hits; 123 }; 124 125 /** 126 * struct teo_cpu - CPU data used by the TEO cpuidle governor. 127 * @time_span_ns: Time between idle state selection and post-wakeup update. 128 * @sleep_length_ns: Time till the closest timer event (at the selection time). 129 * @state_bins: Idle state data bins for this CPU. 130 * @total: Grand total of the "intercepts" and "hits" metrics for all bins. 131 * @tick_hits: Number of "hits" after TICK_NSEC. 132 */ 133 struct teo_cpu { 134 s64 time_span_ns; 135 s64 sleep_length_ns; 136 struct teo_bin state_bins[CPUIDLE_STATE_MAX]; 137 unsigned int total; 138 unsigned int tick_hits; 139 }; 140 141 static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); 142 143 /** 144 * teo_update - Update CPU metrics after wakeup. 145 * @drv: cpuidle driver containing state data. 146 * @dev: Target CPU. 
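 *
 * Decay all of the "hits" and "intercepts" metrics, find the bins that the
 * sleep length (saved at selection time) and the measured idle duration fall
 * into and, depending on whether or not those two bins match, add PULSE to
 * either the "hits" or the "intercepts" metric of the bin matching the
 * measured idle duration.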
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	int i, idx_timer = 0, idx_duration = 0;
	s64 target_residency_ns;
	u64 measured_ns;

	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
		/*
		 * One of the safety nets has triggered or the wakeup was close
		 * enough to the closest timer event expected at the idle state
		 * selection time to be discarded.
		 */
		measured_ns = U64_MAX;
	} else {
		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;

		/*
		 * The computations below are to determine whether or not the
		 * (saved) time till the next timer event and the measured idle
		 * duration fall into the same "bin", so use last_residency_ns
		 * for that instead of time_span_ns which includes the cpuidle
		 * overhead.
		 */
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
		 * executed by the CPU is not likely to be worst-case every
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
		if (measured_ns >= lat_ns)
			measured_ns -= lat_ns / 2;
		else
			measured_ns /= 2;
	}

	cpu_data->total = 0;

	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
	 * fall into.
	 */
	for (i = 0; i < drv->state_count; i++) {
		struct teo_bin *bin = &cpu_data->state_bins[i];

		bin->hits -= bin->hits >> DECAY_SHIFT;
		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;

		cpu_data->total += bin->hits + bin->intercepts;

		target_residency_ns = drv->states[i].target_residency_ns;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
			idx_timer = i;
			if (target_residency_ns <= measured_ns)
				idx_duration = i;
		}
	}

	/*
	 * If the deepest state's target residency is below the tick length,
	 * make a record of it to help teo_select() decide whether or not
	 * to stop the tick.  This effectively adds an extra hits-only bin
	 * beyond the last state-related one.
	 */
	if (target_residency_ns < TICK_NSEC) {
		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;

		cpu_data->total += cpu_data->tick_hits;

		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
			idx_timer = drv->state_count;
			if (TICK_NSEC <= measured_ns) {
				cpu_data->tick_hits += PULSE;
				goto end;
			}
		}
	}

	/*
	 * If the measured idle duration falls into the same bin as the sleep
	 * length, this is a "hit", so update the "hits" metric for that bin.
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
	if (idx_timer == idx_duration)
		cpu_data->state_bins[idx_timer].hits += PULSE;
	else
		cpu_data->state_bins[idx_duration].intercepts += PULSE;

end:
	cpu_data->total += PULSE;
}

static bool teo_state_ok(int i, struct cpuidle_driver *drv)
{
	return !tick_nohz_tick_stopped() ||
	       drv->states[i].target_residency_ns >= TICK_NSEC;
}

/**
 * teo_find_shallower_state - Find shallower idle state matching given duration.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @state_idx: Index of the capping idle state.
 * @duration_ns: Idle duration value to match.
 * @no_poll: Don't consider polling states.
 *
 * Return: The index of the deepest enabled idle state shallower than
 * @state_idx whose target residency does not exceed @duration_ns (skipping
 * polling states if @no_poll is set), the shallowest such enabled state if
 * none of them matches @duration_ns, or @state_idx itself if no shallower
 * idle state is enabled.
 */
static int teo_find_shallower_state(struct cpuidle_driver *drv,
				    struct cpuidle_device *dev, int state_idx,
				    s64 duration_ns, bool no_poll)
{
	int i;

	for (i = state_idx - 1; i >= 0; i--) {
		if (dev->states_usage[i].disable ||
		    (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
			continue;

		state_idx = i;
		if (drv->states[i].target_residency_ns <= duration_ns)
			break;
	}
	return state_idx;
}

/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      bool *stop_tick)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	ktime_t delta_tick = TICK_NSEC / 2;
	unsigned int tick_intercept_sum = 0;
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
	int prev_intercept_idx;
	s64 duration_ns;
	int i;

	if (dev->last_state_idx >= 0) {
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

	cpu_data->time_span_ns = local_clock();
	/*
	 * Set the expected sleep length to infinity in case of an early
	 * return.
	 */
	cpu_data->sleep_length_ns = KTIME_MAX;

	/* Check if there is any choice in the first place. */
	if (drv->state_count < 2) {
		idx = 0;
		goto out_tick;
	}

	if (!dev->states_usage[0].disable)
		idx = 0;

	/* Compute the sums of metrics for early wakeup pattern detection. */
	for (i = 1; i < drv->state_count; i++) {
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * Update the sums of idle state metrics for all of the states
		 * shallower than the current one.
		 */
		intercept_sum += prev_bin->intercepts;
		hit_sum += prev_bin->hits;

		if (dev->states_usage[i].disable)
			continue;

		if (idx < 0)
			idx0 = i; /* first enabled state */

		idx = i;

		if (s->exit_latency_ns <= latency_req)
			constraint_idx = i;

		/* Save the sums for the current state. */
		idx_intercept_sum = intercept_sum;
		idx_hit_sum = hit_sum;
	}

	/* Avoid unnecessary overhead. */
	if (idx < 0) {
		idx = 0; /* No states enabled, must use 0. */
		goto out_tick;
	}

	if (idx == idx0) {
		/*
		 * Only one idle state is enabled, so use it, but do not
		 * allow the tick to be stopped if it is shallow enough.
		 */
		duration_ns = drv->states[idx].target_residency_ns;
		goto end;
	}

	tick_intercept_sum = intercept_sum +
			cpu_data->state_bins[drv->state_count-1].intercepts;

	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
	 * all of the deeper states, a shallower idle state is likely to be a
	 * better choice.
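	 *
	 * (Since cpu_data->total is the grand total of both metrics for all of
	 * the bins, including the extra tick one, cpu_data->total minus
	 * idx_hit_sum and idx_intercept_sum is the sum of both metrics for the
	 * candidate state bin and the bins beyond it, so the check below is a
	 * rearranged form of that comparison.)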
	 */
	prev_intercept_idx = idx;
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
		int first_suitable_idx = idx;

		/*
		 * Look for the deepest idle state whose target residency had
		 * not exceeded the idle duration in over a half of the relevant
		 * cases in the past.
		 *
		 * Take into account the possible duration limitation present
		 * if the tick has been stopped already.
		 */
		intercept_sum = 0;

		for (i = idx - 1; i >= 0; i--) {
			struct teo_bin *bin = &cpu_data->state_bins[i];

			intercept_sum += bin->intercepts;

			if (2 * intercept_sum > idx_intercept_sum) {
				/*
				 * Use the current state unless it is too
				 * shallow or disabled, in which case take the
				 * first enabled state that is deep enough.
				 */
				if (teo_state_ok(i, drv) &&
				    !dev->states_usage[i].disable)
					idx = i;
				else
					idx = first_suitable_idx;

				break;
			}

			if (dev->states_usage[i].disable)
				continue;

			if (!teo_state_ok(i, drv)) {
				/*
				 * The current state is too shallow, but if an
				 * alternative candidate state has been found,
				 * it may still turn out to be a better choice.
				 */
				if (first_suitable_idx != idx)
					continue;

				break;
			}

			first_suitable_idx = i;
		}
	}
	if (!idx && prev_intercept_idx) {
		/*
		 * We have to query the sleep length here, otherwise we don't
		 * know after wakeup whether our guess was correct.
		 */
		duration_ns = tick_nohz_get_sleep_length(&delta_tick);
		cpu_data->sleep_length_ns = duration_ns;
		goto out_tick;
	}

	/*
	 * If there is a latency constraint, it may be necessary to select an
	 * idle state shallower than the current candidate one.
	 */
	if (idx > constraint_idx)
		idx = constraint_idx;

	/*
	 * Skip the timers check if state 0 is the current candidate one,
	 * because an immediate non-timer wakeup is expected in that case.
	 */
	if (!idx)
		goto out_tick;

	/*
	 * If state 0 is a polling one, check if the target residency of
	 * the current candidate state is low enough and skip the timers
	 * check in that case too.
	 */
	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
		goto out_tick;

	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

	/*
	 * If the closest expected timer is before the target residency of the
	 * candidate state, a shallower one needs to be found.
	 */
	if (drv->states[idx].target_residency_ns > duration_ns) {
		i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
		if (teo_state_ok(i, drv))
			idx = i;
	}

	/*
	 * If the selected state's target residency is below the tick length
	 * and intercepts occurring before the tick length are the majority of
	 * total wakeup events, do not stop the tick.
	 */
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
		duration_ns = TICK_NSEC / 2;

end:
	/*
	 * Allow the tick to be stopped unless the selected state is a polling
	 * one or the expected idle duration is shorter than the tick period
	 * length.
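	 *
	 * If the tick has been stopped already, there is no benefit in
	 * preventing it from being stopped now and no tick-based correction
	 * of the state is needed, so return the candidate state right away
	 * in that case too.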
	 */
	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
		return idx;

	/*
	 * The tick is not going to be stopped, so if the target residency of
	 * the state to be returned is not within the time till the closest
	 * timer including the tick, try to correct that.
	 */
	if (idx > idx0 &&
	    drv->states[idx].target_residency_ns > delta_tick)
		idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);

out_tick:
	*stop_tick = false;
	return idx;
}

/**
 * teo_reflect - Note that governor data for the CPU need to be updated.
 * @dev: Target CPU.
 * @state: Entered state.
 */
static void teo_reflect(struct cpuidle_device *dev, int state)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	dev->last_state_idx = state;
	/*
	 * If the wakeup was not "natural", but triggered by one of the safety
	 * nets, assume that the CPU might have been idle for the entire sleep
	 * length time.
	 */
	if (dev->poll_time_limit ||
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
		dev->poll_time_limit = false;
		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
	} else {
		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
	}
}

/**
 * teo_enable_device - Initialize the governor's data for the target CPU.
 * @drv: cpuidle driver (not used).
 * @dev: Target CPU.
 */
static int teo_enable_device(struct cpuidle_driver *drv,
			     struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	memset(cpu_data, 0, sizeof(*cpu_data));

	return 0;
}

static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};

static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);