1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Infrastructure for migratable timers 4 * 5 * Copyright(C) 2022 linutronix GmbH 6 */ 7 #include <linux/cpuhotplug.h> 8 #include <linux/slab.h> 9 #include <linux/smp.h> 10 #include <linux/spinlock.h> 11 #include <linux/timerqueue.h> 12 #include <trace/events/ipi.h> 13 #include <linux/sched/isolation.h> 14 15 #include "timer_migration.h" 16 #include "tick-internal.h" 17 18 #define CREATE_TRACE_POINTS 19 #include <trace/events/timer_migration.h> 20 21 /* 22 * The timer migration mechanism is built on a hierarchy of groups. The 23 * lowest level group contains CPUs, the next level groups of CPU groups 24 * and so forth. The CPU groups are kept per node so for the normal case 25 * lock contention won't happen across nodes. Depending on the number of 26 * CPUs per node even the next level might be kept as groups of CPU groups 27 * per node and only the levels above cross the node topology. 28 * 29 * Example topology for a two node system with 24 CPUs each. 30 * 31 * LVL 2 [GRP2:0] 32 * GRP1:0 = GRP1:M 33 * 34 * LVL 1 [GRP1:0] [GRP1:1] 35 * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5 36 * 37 * LVL 0 [GRP0:0] [GRP0:1] [GRP0:2] [GRP0:3] [GRP0:4] [GRP0:5] 38 * CPUS 0-7 8-15 16-23 24-31 32-39 40-47 39 * 40 * The groups hold a timer queue of events sorted by expiry time. These 41 * queues are updated when CPUs go in idle. When they come out of idle 42 * ignore flag of events is set. 43 * 44 * Each group has a designated migrator CPU/group as long as a CPU/group is 45 * active in the group. This designated role is necessary to avoid that all 46 * active CPUs in a group try to migrate expired timers from other CPUs, 47 * which would result in massive lock bouncing. 48 * 49 * When a CPU is awake, it checks in it's own timer tick the group 50 * hierarchy up to the point where it is assigned the migrator role or if 51 * no CPU is active, it also checks the groups where no migrator is set 52 * (TMIGR_NONE). 53 * 54 * If it finds expired timers in one of the group queues it pulls them over 55 * from the idle CPU and runs the timer function. After that it updates the 56 * group and the parent groups if required. 57 * 58 * CPUs which go idle arm their CPU local timer hardware for the next local 59 * (pinned) timer event. If the next migratable timer expires after the 60 * next local timer or the CPU has no migratable timer pending then the 61 * CPU does not queue an event in the LVL0 group. If the next migratable 62 * timer expires before the next local timer then the CPU queues that timer 63 * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0 64 * group. 65 * 66 * When CPU comes out of idle and when a group has at least a single active 67 * child, the ignore flag of the tmigr_event is set. This indicates, that 68 * the event is ignored even if it is still enqueued in the parent groups 69 * timer queue. It will be removed when touching the timer queue the next 70 * time. This spares locking in active path as the lock protects (after 71 * setup) only event information. For more information about locking, 72 * please read the section "Locking rules". 73 * 74 * If the CPU is the migrator of the group then it delegates that role to 75 * the next active CPU in the group or sets migrator to TMIGR_NONE when 76 * there is no active CPU in the group. This delegation needs to be 77 * propagated up the hierarchy so hand over from other leaves can happen at 78 * all hierarchy levels w/o doing a search. 79 * 80 * When the last CPU in the system goes idle, then it drops all migrator 81 * duties up to the top level of the hierarchy (LVL2 in the example). It 82 * then has to make sure, that it arms it's own local hardware timer for 83 * the earliest event in the system. 84 * 85 * 86 * Lifetime rules: 87 * --------------- 88 * 89 * The groups are built up at init time or when CPUs come online. They are 90 * not destroyed when a group becomes empty due to offlining. The group 91 * just won't participate in the hierarchy management anymore. Destroying 92 * groups would result in interesting race conditions which would just make 93 * the whole mechanism slow and complex. 94 * 95 * 96 * Locking rules: 97 * -------------- 98 * 99 * For setting up new groups and handling events it's required to lock both 100 * child and parent group. The lock ordering is always bottom up. This also 101 * includes the per CPU locks in struct tmigr_cpu. For updating the migrator and 102 * active CPU/group information atomic_try_cmpxchg() is used instead and only 103 * the per CPU tmigr_cpu->lock is held. 104 * 105 * During the setup of groups, hier->level_list is required. It is protected by 106 * @tmigr_mutex. 107 * 108 * When @timer_base->lock as well as tmigr related locks are required, the lock 109 * ordering is: first @timer_base->lock, afterwards tmigr related locks. 110 * 111 * 112 * Protection of the tmigr group state information: 113 * ------------------------------------------------ 114 * 115 * The state information with the list of active children and migrator needs to 116 * be protected by a sequence counter. It prevents a race when updates in child 117 * groups are propagated in changed order. The state update is performed 118 * lockless and group wise. The following scenario describes what happens 119 * without updating the sequence counter: 120 * 121 * Therefore, let's take three groups and four CPUs (CPU2 and CPU3 as well 122 * as GRP0:1 will not change during the scenario): 123 * 124 * LVL 1 [GRP1:0] 125 * migrator = GRP0:1 126 * active = GRP0:0, GRP0:1 127 * / \ 128 * LVL 0 [GRP0:0] [GRP0:1] 129 * migrator = CPU0 migrator = CPU2 130 * active = CPU0 active = CPU2 131 * / \ / \ 132 * CPUs 0 1 2 3 133 * active idle active idle 134 * 135 * 136 * 1. CPU0 goes idle. As the update is performed group wise, in the first step 137 * only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to 138 * walk the hierarchy. 139 * 140 * LVL 1 [GRP1:0] 141 * migrator = GRP0:1 142 * active = GRP0:0, GRP0:1 143 * / \ 144 * LVL 0 [GRP0:0] [GRP0:1] 145 * --> migrator = TMIGR_NONE migrator = CPU2 146 * --> active = active = CPU2 147 * / \ / \ 148 * CPUs 0 1 2 3 149 * --> idle idle active idle 150 * 151 * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of 152 * idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also 153 * has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the 154 * hierarchy to perform the needed update from their point of view. The 155 * currently visible state looks the following: 156 * 157 * LVL 1 [GRP1:0] 158 * migrator = GRP0:1 159 * active = GRP0:0, GRP0:1 160 * / \ 161 * LVL 0 [GRP0:0] [GRP0:1] 162 * --> migrator = CPU1 migrator = CPU2 163 * --> active = CPU1 active = CPU2 164 * / \ / \ 165 * CPUs 0 1 2 3 166 * idle --> active active idle 167 * 168 * 3. Here is the race condition: CPU1 managed to propagate its changes (from 169 * step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. The 170 * active members of GRP1:0 remain unchanged after the update since it is 171 * still valid from CPU1 current point of view: 172 * 173 * LVL 1 [GRP1:0] 174 * --> migrator = GRP0:1 175 * --> active = GRP0:0, GRP0:1 176 * / \ 177 * LVL 0 [GRP0:0] [GRP0:1] 178 * migrator = CPU1 migrator = CPU2 179 * active = CPU1 active = CPU2 180 * / \ / \ 181 * CPUs 0 1 2 3 182 * idle active active idle 183 * 184 * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0. 185 * 186 * LVL 1 [GRP1:0] 187 * --> migrator = GRP0:1 188 * --> active = GRP0:1 189 * / \ 190 * LVL 0 [GRP0:0] [GRP0:1] 191 * migrator = CPU1 migrator = CPU2 192 * active = CPU1 active = CPU2 193 * / \ / \ 194 * CPUs 0 1 2 3 195 * idle active active idle 196 * 197 * 198 * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is 199 * active and is correctly listed as active in GRP0:0. However GRP1:0 does not 200 * have GRP0:0 listed as active, which is wrong. The sequence counter has been 201 * added to avoid inconsistent states during updates. The state is updated 202 * atomically only if all members, including the sequence counter, match the 203 * expected value (compare-and-exchange). 204 * 205 * Looking back at the previous example with the addition of the sequence 206 * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed 207 * the sequence number during the update in step 3 so the expected old value (as 208 * seen by CPU0 before starting the walk) does not match. 209 * 210 * Prevent race between new event and last CPU going inactive 211 * ---------------------------------------------------------- 212 * 213 * When the last CPU is going idle and there is a concurrent update of a new 214 * first global timer of an idle CPU, the group and child states have to be read 215 * while holding the lock in tmigr_update_events(). The following scenario shows 216 * what happens, when this is not done. 217 * 218 * 1. Only CPU2 is active: 219 * 220 * LVL 1 [GRP1:0] 221 * migrator = GRP0:1 222 * active = GRP0:1 223 * next_expiry = KTIME_MAX 224 * / \ 225 * LVL 0 [GRP0:0] [GRP0:1] 226 * migrator = TMIGR_NONE migrator = CPU2 227 * active = active = CPU2 228 * next_expiry = KTIME_MAX next_expiry = KTIME_MAX 229 * / \ / \ 230 * CPUs 0 1 2 3 231 * idle idle active idle 232 * 233 * 2. Now CPU 2 goes idle (and has no global timer, that has to be handled) and 234 * propagates that to GRP0:1: 235 * 236 * LVL 1 [GRP1:0] 237 * migrator = GRP0:1 238 * active = GRP0:1 239 * next_expiry = KTIME_MAX 240 * / \ 241 * LVL 0 [GRP0:0] [GRP0:1] 242 * migrator = TMIGR_NONE --> migrator = TMIGR_NONE 243 * active = --> active = 244 * next_expiry = KTIME_MAX next_expiry = KTIME_MAX 245 * / \ / \ 246 * CPUs 0 1 2 3 247 * idle idle --> idle idle 248 * 249 * 3. Now the idle state is propagated up to GRP1:0. As this is now the last 250 * child going idle in top level group, the expiry of the next group event 251 * has to be handed back to make sure no event is lost. As there is no event 252 * enqueued, KTIME_MAX is handed back to CPU2. 253 * 254 * LVL 1 [GRP1:0] 255 * --> migrator = TMIGR_NONE 256 * --> active = 257 * next_expiry = KTIME_MAX 258 * / \ 259 * LVL 0 [GRP0:0] [GRP0:1] 260 * migrator = TMIGR_NONE migrator = TMIGR_NONE 261 * active = active = 262 * next_expiry = KTIME_MAX next_expiry = KTIME_MAX 263 * / \ / \ 264 * CPUs 0 1 2 3 265 * idle idle --> idle idle 266 * 267 * 4. CPU 0 has a new timer queued from idle and it expires at TIMER0. CPU0 268 * propagates that to GRP0:0: 269 * 270 * LVL 1 [GRP1:0] 271 * migrator = TMIGR_NONE 272 * active = 273 * next_expiry = KTIME_MAX 274 * / \ 275 * LVL 0 [GRP0:0] [GRP0:1] 276 * migrator = TMIGR_NONE migrator = TMIGR_NONE 277 * active = active = 278 * --> next_expiry = TIMER0 next_expiry = KTIME_MAX 279 * / \ / \ 280 * CPUs 0 1 2 3 281 * idle idle idle idle 282 * 283 * 5. GRP0:0 is not active, so the new timer has to be propagated to 284 * GRP1:0. Therefore the GRP1:0 state has to be read. When the stalled value 285 * (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is 286 * handed back to CPU0, as it seems that there is still an active child in 287 * top level group. 288 * 289 * LVL 1 [GRP1:0] 290 * migrator = TMIGR_NONE 291 * active = 292 * --> next_expiry = TIMER0 293 * / \ 294 * LVL 0 [GRP0:0] [GRP0:1] 295 * migrator = TMIGR_NONE migrator = TMIGR_NONE 296 * active = active = 297 * next_expiry = TIMER0 next_expiry = KTIME_MAX 298 * / \ / \ 299 * CPUs 0 1 2 3 300 * idle idle idle idle 301 * 302 * This is prevented by reading the state when holding the lock (when a new 303 * timer has to be propagated from idle path):: 304 * 305 * CPU2 (tmigr_inactive_up()) CPU0 (tmigr_new_timer_up()) 306 * -------------------------- --------------------------- 307 * // step 3: 308 * cmpxchg(&GRP1:0->state); 309 * tmigr_update_events() { 310 * spin_lock(&GRP1:0->lock); 311 * // ... update events ... 312 * // hand back first expiry when GRP1:0 is idle 313 * spin_unlock(&GRP1:0->lock); 314 * // ^^^ release state modification 315 * } 316 * tmigr_update_events() { 317 * spin_lock(&GRP1:0->lock) 318 * // ^^^ acquire state modification 319 * group_state = atomic_read(&GRP1:0->state) 320 * // .... update events ... 321 * // hand back first expiry when GRP1:0 is idle 322 * spin_unlock(&GRP1:0->lock) <3> 323 * // ^^^ makes state visible for other 324 * // callers of tmigr_new_timer_up() 325 * } 326 * 327 * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported 328 * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent 329 * update of the group state from active path is no problem, as the upcoming CPU 330 * will take care of the group events. 331 * 332 * Required event and timerqueue update after a remote expiry: 333 * ----------------------------------------------------------- 334 * 335 * After expiring timers of a remote CPU, a walk through the hierarchy and 336 * update of events and timerqueues is required. It is obviously needed if there 337 * is a 'new' global timer but also if there is no new global timer but the 338 * remote CPU is still idle. 339 * 340 * 1. CPU0 and CPU1 are idle and have both a global timer expiring at the same 341 * time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is 342 * also idle and has no global timer pending. CPU2 is the only active CPU and 343 * thus also the migrator: 344 * 345 * LVL 1 [GRP1:0] 346 * migrator = GRP0:1 347 * active = GRP0:1 348 * --> timerqueue = evt-GRP0:0 349 * / \ 350 * LVL 0 [GRP0:0] [GRP0:1] 351 * migrator = TMIGR_NONE migrator = CPU2 352 * active = active = CPU2 353 * groupevt.ignore = false groupevt.ignore = true 354 * groupevt.cpu = CPU0 groupevt.cpu = 355 * timerqueue = evt-CPU0, timerqueue = 356 * evt-CPU1 357 * / \ / \ 358 * CPUs 0 1 2 3 359 * idle idle active idle 360 * 361 * 2. CPU2 starts to expire remote timers. It starts with LVL0 group 362 * GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with 363 * the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. It 364 * looks at tmigr_event::cpu struct member and expires the pending timer(s) 365 * of CPU0. 366 * 367 * LVL 1 [GRP1:0] 368 * migrator = GRP0:1 369 * active = GRP0:1 370 * --> timerqueue = 371 * / \ 372 * LVL 0 [GRP0:0] [GRP0:1] 373 * migrator = TMIGR_NONE migrator = CPU2 374 * active = active = CPU2 375 * groupevt.ignore = false groupevt.ignore = true 376 * --> groupevt.cpu = CPU0 groupevt.cpu = 377 * timerqueue = evt-CPU0, timerqueue = 378 * evt-CPU1 379 * / \ / \ 380 * CPUs 0 1 2 3 381 * idle idle active idle 382 * 383 * 3. Some work has to be done after expiring the timers of CPU0. If we stop 384 * here, then CPU1's pending global timer(s) will not expire in time and the 385 * timerqueue of GRP0:0 has still an event for CPU0 enqueued which has just 386 * been processed. So it is required to walk the hierarchy from CPU0's point 387 * of view and update it accordingly. CPU0's event will be removed from the 388 * timerqueue because it has no pending timer. If CPU0 would have a timer 389 * pending then it has to expire after CPU1's first timer because all timers 390 * from this period were just expired. Either way CPU1's event will be first 391 * in GRP0:0's timerqueue and therefore set in the CPU field of the group 392 * event which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still not 393 * active: 394 * 395 * LVL 1 [GRP1:0] 396 * migrator = GRP0:1 397 * active = GRP0:1 398 * --> timerqueue = evt-GRP0:0 399 * / \ 400 * LVL 0 [GRP0:0] [GRP0:1] 401 * migrator = TMIGR_NONE migrator = CPU2 402 * active = active = CPU2 403 * groupevt.ignore = false groupevt.ignore = true 404 * --> groupevt.cpu = CPU1 groupevt.cpu = 405 * --> timerqueue = evt-CPU1 timerqueue = 406 * / \ / \ 407 * CPUs 0 1 2 3 408 * idle idle active idle 409 * 410 * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the 411 * timer(s) of CPU1. 412 * 413 * The hierarchy walk in step 3 can be skipped if the migrator notices that a 414 * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care 415 * of the group as migrator and any needed updates within the hierarchy. 416 */ 417 418 static DEFINE_MUTEX(tmigr_mutex); 419 420 static LIST_HEAD(tmigr_hierarchy_list); 421 422 static unsigned int tmigr_hierarchy_levels __read_mostly; 423 static unsigned int tmigr_crossnode_level __read_mostly; 424 425 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); 426 427 /* 428 * CPUs available for timer migration. 429 * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. 430 * Additionally tmigr_available_mutex serializes set/clear operations with each other. 431 */ 432 static cpumask_var_t tmigr_available_cpumask; 433 static DEFINE_MUTEX(tmigr_available_mutex); 434 435 /* Enabled during late initcall */ 436 static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); 437 438 #define TMIGR_NONE 0xFF 439 #define BIT_CNT 8 440 441 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) 442 { 443 return !(tmc->tmgroup && tmc->available); 444 } 445 446 /* 447 * Returns true if @cpu should be excluded from the hierarchy as isolated. 448 * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs 449 * are still part of the hierarchy but become idle (from a tick and timer 450 * migration perspective) when they stop their tick. This lets the timekeeping 451 * CPU handle their global timers. Marking also isolated CPUs as idle would be 452 * too costly, hence they are completely excluded from the hierarchy. 453 * This check is necessary, for instance, to prevent offline isolated CPUs from 454 * being incorrectly marked as available once getting back online. 455 * 456 * This function returns false during early boot and the isolation logic is 457 * enabled only after isolated CPUs are marked as unavailable at late boot. 458 * The tick CPU can be isolated at boot, however we cannot mark it as 459 * unavailable to avoid having no global migrator for the nohz_full CPUs. This 460 * should be ensured by the callers of this function: implicitly from hotplug 461 * callbacks and explicitly in tmigr_init_isolation() and 462 * tmigr_isolated_exclude_cpumask(). 463 */ 464 static inline bool tmigr_is_isolated(int cpu) 465 { 466 if (!static_branch_unlikely(&tmigr_exclude_isolated)) 467 return false; 468 return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) && 469 housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)); 470 } 471 472 /* 473 * Returns true, when @childmask corresponds to the group migrator or when the 474 * group is not active - so no migrator is set. 475 */ 476 static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask) 477 { 478 union tmigr_state s; 479 480 s.state = atomic_read(&group->migr_state); 481 482 if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) 483 return true; 484 485 return false; 486 } 487 488 static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask) 489 { 490 bool lonely, migrator = false; 491 unsigned long active; 492 union tmigr_state s; 493 494 s.state = atomic_read(&group->migr_state); 495 496 if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) 497 migrator = true; 498 499 active = s.active; 500 lonely = bitmap_weight(&active, BIT_CNT) <= 1; 501 502 return (migrator && lonely); 503 } 504 505 static bool tmigr_check_lonely(struct tmigr_group *group) 506 { 507 unsigned long active; 508 union tmigr_state s; 509 510 s.state = atomic_read(&group->migr_state); 511 512 active = s.active; 513 514 return bitmap_weight(&active, BIT_CNT) <= 1; 515 } 516 517 /** 518 * struct tmigr_walk - data required for walking the hierarchy 519 * @nextexp: Next CPU event expiry information which is handed into 520 * the timer migration code by the timer code 521 * (get_next_timer_interrupt()) 522 * @firstexp: Contains the first event expiry information when 523 * hierarchy is completely idle. When CPU itself was the 524 * last going idle, information makes sure, that CPU will 525 * be back in time. When using this value in the remote 526 * expiry case, firstexp is stored in the per CPU tmigr_cpu 527 * struct of CPU which expires remote timers. It is updated 528 * in top level group only. Be aware, there could occur a 529 * new top level of the hierarchy between the 'top level 530 * call' in tmigr_update_events() and the check for the 531 * parent group in walk_groups(). Then @firstexp might 532 * contain a value != KTIME_MAX even if it was not the 533 * final top level. This is not a problem, as the worst 534 * outcome is a CPU which might wake up a little early. 535 * @evt: Pointer to tmigr_event which needs to be queued (of idle 536 * child group) 537 * @childmask: groupmask of child group 538 * @remote: Is set, when the new timer path is executed in 539 * tmigr_handle_remote_cpu() 540 * @basej: timer base in jiffies 541 * @now: timer base monotonic 542 * @check: is set if there is the need to handle remote timers; 543 * required in tmigr_requires_handle_remote() only 544 */ 545 struct tmigr_walk { 546 u64 nextexp; 547 u64 firstexp; 548 struct tmigr_event *evt; 549 u8 childmask; 550 bool remote; 551 unsigned long basej; 552 u64 now; 553 bool check; 554 }; 555 556 typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); 557 558 static void __walk_groups_from(up_f up, struct tmigr_walk *data, 559 struct tmigr_group *child, struct tmigr_group *group) 560 { 561 do { 562 WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); 563 564 if (up(group, child, data)) 565 break; 566 567 child = group; 568 /* 569 * Pairs with the store release on group connection 570 * to make sure group initialization is visible. 571 */ 572 group = READ_ONCE(group->parent); 573 data->childmask = child->groupmask; 574 WARN_ON_ONCE(!data->childmask); 575 } while (group); 576 } 577 578 static void __walk_groups(up_f up, struct tmigr_walk *data, 579 struct tmigr_cpu *tmc) 580 { 581 __walk_groups_from(up, data, NULL, tmc->tmgroup); 582 } 583 584 static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) 585 { 586 lockdep_assert_held(&tmc->lock); 587 588 __walk_groups(up, data, tmc); 589 } 590 591 /* 592 * Returns the next event of the timerqueue @group->events 593 * 594 * Removes timers with ignore flag and update next_expiry of the group. Values 595 * of the group event are updated in tmigr_update_events() only. 596 */ 597 static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) 598 { 599 struct timerqueue_node *node = NULL; 600 struct tmigr_event *evt = NULL; 601 602 lockdep_assert_held(&group->lock); 603 604 WRITE_ONCE(group->next_expiry, KTIME_MAX); 605 606 while ((node = timerqueue_getnext(&group->events))) { 607 evt = container_of(node, struct tmigr_event, nextevt); 608 609 if (!READ_ONCE(evt->ignore)) { 610 WRITE_ONCE(group->next_expiry, evt->nextevt.expires); 611 return evt; 612 } 613 614 /* 615 * Remove next timers with ignore flag, because the group lock 616 * is held anyway 617 */ 618 if (!timerqueue_del(&group->events, node)) 619 break; 620 } 621 622 return NULL; 623 } 624 625 /* 626 * Return the next event (with the expiry equal or before @now) 627 * 628 * Event, which is returned, is also removed from the queue. 629 */ 630 static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group, 631 u64 now) 632 { 633 struct tmigr_event *evt = tmigr_next_groupevt(group); 634 635 if (!evt || now < evt->nextevt.expires) 636 return NULL; 637 638 /* 639 * The event is ready to expire. Remove it and update next group event. 640 */ 641 timerqueue_del(&group->events, &evt->nextevt); 642 tmigr_next_groupevt(group); 643 644 return evt; 645 } 646 647 static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) 648 { 649 struct tmigr_event *evt; 650 651 evt = tmigr_next_groupevt(group); 652 653 if (!evt) 654 return KTIME_MAX; 655 else 656 return evt->nextevt.expires; 657 } 658 659 static bool tmigr_active_up(struct tmigr_group *group, 660 struct tmigr_group *child, 661 struct tmigr_walk *data) 662 { 663 union tmigr_state curstate, newstate; 664 bool walk_done; 665 u8 childmask; 666 667 childmask = data->childmask; 668 /* 669 * No memory barrier is required here in contrast to 670 * tmigr_inactive_up(), as the group state change does not depend on the 671 * child state. 672 */ 673 curstate.state = atomic_read(&group->migr_state); 674 675 do { 676 newstate = curstate; 677 walk_done = true; 678 679 if (newstate.migrator == TMIGR_NONE) { 680 newstate.migrator = childmask; 681 682 /* Changes need to be propagated */ 683 walk_done = false; 684 } 685 686 newstate.active |= childmask; 687 newstate.seq++; 688 689 } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); 690 691 trace_tmigr_group_set_cpu_active(group, newstate, childmask); 692 693 /* 694 * The group is active (again). The group event might be still queued 695 * into the parent group's timerqueue but can now be handled by the 696 * migrator of this group. Therefore the ignore flag for the group event 697 * is updated to reflect this. 698 * 699 * The update of the ignore flag in the active path is done lockless. In 700 * worst case the migrator of the parent group observes the change too 701 * late and expires remotely all events belonging to this group. The 702 * lock is held while updating the ignore flag in idle path. So this 703 * state change will not be lost. 704 */ 705 WRITE_ONCE(group->groupevt.ignore, true); 706 707 return walk_done; 708 } 709 710 static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) 711 { 712 struct tmigr_walk data; 713 714 data.childmask = tmc->groupmask; 715 716 trace_tmigr_cpu_active(tmc); 717 718 tmc->cpuevt.ignore = true; 719 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 720 721 walk_groups(&tmigr_active_up, &data, tmc); 722 } 723 724 /** 725 * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy 726 * 727 * Call site timer_clear_idle() is called with interrupts disabled. 728 */ 729 void tmigr_cpu_activate(void) 730 { 731 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 732 733 if (tmigr_is_not_available(tmc)) 734 return; 735 736 if (WARN_ON_ONCE(!tmc->idle)) 737 return; 738 739 raw_spin_lock(&tmc->lock); 740 tmc->idle = false; 741 __tmigr_cpu_activate(tmc); 742 raw_spin_unlock(&tmc->lock); 743 } 744 745 /* 746 * Returns true, if there is nothing to be propagated to the next level 747 * 748 * @data->firstexp is set to expiry of first global event of the (top level of 749 * the) hierarchy, but only when hierarchy is completely idle. 750 * 751 * The child and group states need to be read under the lock, to prevent a race 752 * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See 753 * also section "Prevent race between new event and last CPU going inactive" in 754 * the documentation at the top. 755 * 756 * This is the only place where the group event expiry value is set. 757 */ 758 static 759 bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, 760 struct tmigr_walk *data) 761 { 762 struct tmigr_event *evt, *first_childevt; 763 union tmigr_state childstate, groupstate; 764 bool remote = data->remote; 765 bool walk_done = false; 766 bool ignore; 767 u64 nextexp; 768 769 if (child) { 770 raw_spin_lock(&child->lock); 771 raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); 772 773 childstate.state = atomic_read(&child->migr_state); 774 groupstate.state = atomic_read(&group->migr_state); 775 776 if (childstate.active) { 777 walk_done = true; 778 goto unlock; 779 } 780 781 first_childevt = tmigr_next_groupevt(child); 782 nextexp = child->next_expiry; 783 evt = &child->groupevt; 784 785 /* 786 * This can race with concurrent idle exit (activate). 787 * If the current writer wins, a useless remote expiration may 788 * be scheduled. If the activate wins, the event is properly 789 * ignored. 790 */ 791 ignore = (nextexp == KTIME_MAX) ? true : false; 792 WRITE_ONCE(evt->ignore, ignore); 793 } else { 794 nextexp = data->nextexp; 795 796 first_childevt = evt = data->evt; 797 ignore = evt->ignore; 798 799 /* 800 * Walking the hierarchy is required in any case when a 801 * remote expiry was done before. This ensures to not lose 802 * already queued events in non active groups (see section 803 * "Required event and timerqueue update after a remote 804 * expiry" in the documentation at the top). 805 * 806 * The two call sites which are executed without a remote expiry 807 * before, are not prevented from propagating changes through 808 * the hierarchy by the return: 809 * - When entering this path by tmigr_new_timer(), @evt->ignore 810 * is never set. 811 * - tmigr_inactive_up() takes care of the propagation by 812 * itself and ignores the return value. But an immediate 813 * return is possible if there is a parent, sparing group 814 * locking at this level, because the upper walking call to 815 * the parent will take care about removing this event from 816 * within the group and update next_expiry accordingly. 817 * 818 * However if there is no parent, ie: the hierarchy has only a 819 * single level so @group is the top level group, make sure the 820 * first event information of the group is updated properly and 821 * also handled properly, so skip this fast return path. 822 */ 823 if (ignore && !remote && group->parent) 824 return true; 825 826 raw_spin_lock(&group->lock); 827 828 childstate.state = 0; 829 groupstate.state = atomic_read(&group->migr_state); 830 } 831 832 /* 833 * If the child event is already queued in the group, remove it from the 834 * queue when the expiry time changed only or when it could be ignored. 835 */ 836 if (timerqueue_node_queued(&evt->nextevt)) { 837 if ((evt->nextevt.expires == nextexp) && !ignore) { 838 /* Make sure not to miss a new CPU event with the same expiry */ 839 evt->cpu = first_childevt->cpu; 840 goto check_toplvl; 841 } 842 843 if (!timerqueue_del(&group->events, &evt->nextevt)) 844 WRITE_ONCE(group->next_expiry, KTIME_MAX); 845 } 846 847 if (ignore) { 848 /* 849 * When the next child event could be ignored (nextexp is 850 * KTIME_MAX) and there was no remote timer handling before or 851 * the group is already active, there is no need to walk the 852 * hierarchy even if there is a parent group. 853 * 854 * The other way round: even if the event could be ignored, but 855 * if a remote timer handling was executed before and the group 856 * is not active, walking the hierarchy is required to not miss 857 * an enqueued timer in the non active group. The enqueued timer 858 * of the group needs to be propagated to a higher level to 859 * ensure it is handled. 860 */ 861 if (!remote || groupstate.active) 862 walk_done = true; 863 } else { 864 evt->nextevt.expires = nextexp; 865 evt->cpu = first_childevt->cpu; 866 867 if (timerqueue_add(&group->events, &evt->nextevt)) 868 WRITE_ONCE(group->next_expiry, nextexp); 869 } 870 871 check_toplvl: 872 if (!group->parent && (groupstate.migrator == TMIGR_NONE)) { 873 walk_done = true; 874 875 /* 876 * Nothing to do when update was done during remote timer 877 * handling. First timer in top level group which needs to be 878 * handled when top level group is not active, is calculated 879 * directly in tmigr_handle_remote_up(). 880 */ 881 if (remote) 882 goto unlock; 883 884 /* 885 * The top level group is idle and it has to be ensured the 886 * global timers are handled in time. (This could be optimized 887 * by keeping track of the last global scheduled event and only 888 * arming it on the CPU if the new event is earlier. Not sure if 889 * its worth the complexity.) 890 */ 891 data->firstexp = tmigr_next_groupevt_expires(group); 892 } 893 894 trace_tmigr_update_events(child, group, childstate, groupstate, 895 nextexp); 896 897 unlock: 898 raw_spin_unlock(&group->lock); 899 900 if (child) 901 raw_spin_unlock(&child->lock); 902 903 return walk_done; 904 } 905 906 static bool tmigr_new_timer_up(struct tmigr_group *group, 907 struct tmigr_group *child, 908 struct tmigr_walk *data) 909 { 910 return tmigr_update_events(group, child, data); 911 } 912 913 /* 914 * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is 915 * returned, if an active CPU will handle all the timer migration hierarchy 916 * timers. 917 */ 918 static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp) 919 { 920 struct tmigr_walk data = { .nextexp = nextexp, 921 .firstexp = KTIME_MAX, 922 .evt = &tmc->cpuevt }; 923 924 lockdep_assert_held(&tmc->lock); 925 926 if (tmc->remote) 927 return KTIME_MAX; 928 929 trace_tmigr_cpu_new_timer(tmc); 930 931 tmc->cpuevt.ignore = false; 932 data.remote = false; 933 934 walk_groups(&tmigr_new_timer_up, &data, tmc); 935 936 /* If there is a new first global event, make sure it is handled */ 937 return data.firstexp; 938 } 939 940 static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, 941 unsigned long jif) 942 { 943 struct timer_events tevt; 944 struct tmigr_walk data; 945 struct tmigr_cpu *tmc; 946 947 tmc = per_cpu_ptr(&tmigr_cpu, cpu); 948 949 raw_spin_lock_irq(&tmc->lock); 950 951 /* 952 * If the remote CPU is offline then the timers have been migrated to 953 * another CPU. 954 * 955 * If tmigr_cpu::remote is set, at the moment another CPU already 956 * expires the timers of the remote CPU. 957 * 958 * If tmigr_event::ignore is set, then the CPU returns from idle and 959 * takes care of its timers. 960 * 961 * If the next event expires in the future, then the event has been 962 * updated and there are no timers to expire right now. The CPU which 963 * updated the event takes care when hierarchy is completely 964 * idle. Otherwise the migrator does it as the event is enqueued. 965 */ 966 if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || 967 now < tmc->cpuevt.nextevt.expires) { 968 raw_spin_unlock_irq(&tmc->lock); 969 return; 970 } 971 972 trace_tmigr_handle_remote_cpu(tmc); 973 974 tmc->remote = true; 975 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 976 977 /* Drop the lock to allow the remote CPU to exit idle */ 978 raw_spin_unlock_irq(&tmc->lock); 979 980 /* 981 * This can't exclude the local CPU because jiffies might have advanced 982 * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the 983 * point where the jiffies snapshot @jif was taken in tmigr_handle_remote(). 984 */ 985 timer_expire_remote(cpu); 986 987 /* 988 * Lock ordering needs to be preserved - timer_base locks before tmigr 989 * related locks (see section "Locking rules" in the documentation at 990 * the top). During fetching the next timer interrupt, also tmc->lock 991 * needs to be held. Otherwise there is a possible race window against 992 * the CPU itself when it comes out of idle, updates the first timer in 993 * the hierarchy and goes back to idle. 994 * 995 * timer base locks are dropped as fast as possible: After checking 996 * whether the remote CPU went offline in the meantime and after 997 * fetching the next remote timer interrupt. Dropping the locks as fast 998 * as possible keeps the locking region small and prevents holding 999 * several (unnecessary) locks during walking the hierarchy for updating 1000 * the timerqueue and group events. 1001 */ 1002 local_irq_disable(); 1003 timer_lock_remote_bases(cpu); 1004 raw_spin_lock(&tmc->lock); 1005 1006 /* 1007 * When the CPU went offline in the meantime, no hierarchy walk has to 1008 * be done for updating the queued events, because the walk was 1009 * already done during marking the CPU offline in the hierarchy. 1010 * 1011 * When the CPU is no longer idle, the CPU takes care of the timers and 1012 * also of the timers in the hierarchy. 1013 * 1014 * (See also section "Required event and timerqueue update after a 1015 * remote expiry" in the documentation at the top) 1016 */ 1017 if (!tmc->available || !tmc->idle) { 1018 timer_unlock_remote_bases(cpu); 1019 goto unlock; 1020 } 1021 1022 /* next event of CPU */ 1023 fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu); 1024 timer_unlock_remote_bases(cpu); 1025 1026 data.nextexp = tevt.global; 1027 data.firstexp = KTIME_MAX; 1028 data.evt = &tmc->cpuevt; 1029 data.remote = true; 1030 1031 /* 1032 * The update is done even when there is no 'new' global timer pending 1033 * on the remote CPU (see section "Required event and timerqueue update 1034 * after a remote expiry" in the documentation at the top) 1035 */ 1036 walk_groups(&tmigr_new_timer_up, &data, tmc); 1037 1038 unlock: 1039 tmc->remote = false; 1040 raw_spin_unlock_irq(&tmc->lock); 1041 } 1042 1043 static bool tmigr_handle_remote_up(struct tmigr_group *group, 1044 struct tmigr_group *child, 1045 struct tmigr_walk *data) 1046 { 1047 struct tmigr_event *evt; 1048 unsigned long jif; 1049 u8 childmask; 1050 u64 now; 1051 1052 jif = data->basej; 1053 now = data->now; 1054 1055 childmask = data->childmask; 1056 1057 trace_tmigr_handle_remote(group); 1058 again: 1059 /* 1060 * Handle the group only if @childmask is the migrator or if the 1061 * group has no migrator. Otherwise the group is active and is 1062 * handled by its own migrator. 1063 */ 1064 if (!tmigr_check_migrator(group, childmask)) 1065 return true; 1066 1067 raw_spin_lock_irq(&group->lock); 1068 1069 evt = tmigr_next_expired_groupevt(group, now); 1070 1071 if (evt) { 1072 unsigned int remote_cpu = evt->cpu; 1073 1074 raw_spin_unlock_irq(&group->lock); 1075 1076 tmigr_handle_remote_cpu(remote_cpu, now, jif); 1077 1078 /* check if there is another event, that needs to be handled */ 1079 goto again; 1080 } 1081 1082 /* 1083 * Keep track of the expiry of the first event that needs to be handled 1084 * (group->next_expiry was updated by tmigr_next_expired_groupevt(), 1085 * next was set by tmigr_handle_remote_cpu()). 1086 */ 1087 data->firstexp = group->next_expiry; 1088 1089 raw_spin_unlock_irq(&group->lock); 1090 1091 return false; 1092 } 1093 1094 /** 1095 * tmigr_handle_remote() - Handle global timers of remote idle CPUs 1096 * 1097 * Called from the timer soft interrupt with interrupts enabled. 1098 */ 1099 void tmigr_handle_remote(void) 1100 { 1101 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1102 struct tmigr_walk data; 1103 1104 if (tmigr_is_not_available(tmc)) 1105 return; 1106 1107 data.childmask = tmc->groupmask; 1108 data.firstexp = KTIME_MAX; 1109 1110 /* 1111 * NOTE: This is a doubled check because the migrator test will be done 1112 * in tmigr_handle_remote_up() anyway. Keep this check to speed up the 1113 * return when nothing has to be done. 1114 */ 1115 if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { 1116 /* 1117 * If this CPU was an idle migrator, make sure to clear its wakeup 1118 * value so it won't chase timers that have already expired elsewhere. 1119 * This avoids endless requeue from tmigr_new_timer(). 1120 */ 1121 if (READ_ONCE(tmc->wakeup) == KTIME_MAX) 1122 return; 1123 } 1124 1125 data.now = get_jiffies_update(&data.basej); 1126 1127 /* 1128 * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to 1129 * KTIME_MAX. Even if tmc->lock is not held during the whole remote 1130 * handling, tmc->wakeup is fine to be stale as it is called in 1131 * interrupt context and tick_nohz_next_event() is executed in interrupt 1132 * exit path only after processing the last pending interrupt. 1133 */ 1134 1135 __walk_groups(&tmigr_handle_remote_up, &data, tmc); 1136 1137 raw_spin_lock_irq(&tmc->lock); 1138 WRITE_ONCE(tmc->wakeup, data.firstexp); 1139 raw_spin_unlock_irq(&tmc->lock); 1140 } 1141 1142 static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, 1143 struct tmigr_group *child, 1144 struct tmigr_walk *data) 1145 { 1146 u8 childmask; 1147 1148 childmask = data->childmask; 1149 1150 /* 1151 * Handle the group only if the child is the migrator or if the group 1152 * has no migrator. Otherwise the group is active and is handled by its 1153 * own migrator. 1154 */ 1155 if (!tmigr_check_migrator(group, childmask)) 1156 return true; 1157 /* 1158 * The lock is required on 32bit architectures to read the variable 1159 * consistently with a concurrent writer. On 64bit the lock is not 1160 * required because the read operation is not split and so it is always 1161 * consistent. 1162 */ 1163 if (IS_ENABLED(CONFIG_64BIT)) { 1164 data->firstexp = READ_ONCE(group->next_expiry); 1165 if (data->now >= data->firstexp) { 1166 data->check = true; 1167 return true; 1168 } 1169 } else { 1170 raw_spin_lock(&group->lock); 1171 data->firstexp = group->next_expiry; 1172 if (data->now >= group->next_expiry) { 1173 data->check = true; 1174 raw_spin_unlock(&group->lock); 1175 return true; 1176 } 1177 raw_spin_unlock(&group->lock); 1178 } 1179 1180 return false; 1181 } 1182 1183 /** 1184 * tmigr_requires_handle_remote() - Check the need of remote timer handling 1185 * 1186 * Must be called with interrupts disabled. 1187 */ 1188 bool tmigr_requires_handle_remote(void) 1189 { 1190 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1191 struct tmigr_walk data; 1192 unsigned long jif; 1193 bool ret = false; 1194 1195 if (tmigr_is_not_available(tmc)) 1196 return ret; 1197 1198 data.now = get_jiffies_update(&jif); 1199 data.childmask = tmc->groupmask; 1200 data.firstexp = KTIME_MAX; 1201 data.check = false; 1202 1203 /* 1204 * If the CPU is active, walk the hierarchy to check whether a remote 1205 * expiry is required. 1206 * 1207 * Check is done lockless as interrupts are disabled and @tmc->idle is 1208 * set only by the local CPU. 1209 */ 1210 if (!tmc->idle) { 1211 __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc); 1212 1213 return data.check; 1214 } 1215 1216 /* 1217 * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock 1218 * is required on 32bit architectures to read the variable consistently 1219 * with a concurrent writer. On 64bit the lock is not required because 1220 * the read operation is not split and so it is always consistent. 1221 */ 1222 if (IS_ENABLED(CONFIG_64BIT)) { 1223 if (data.now >= READ_ONCE(tmc->wakeup)) 1224 return true; 1225 } else { 1226 raw_spin_lock(&tmc->lock); 1227 if (data.now >= tmc->wakeup) 1228 ret = true; 1229 raw_spin_unlock(&tmc->lock); 1230 } 1231 1232 return ret; 1233 } 1234 1235 /** 1236 * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc) 1237 * @nextexp: Next expiry of global timer (or KTIME_MAX if not) 1238 * 1239 * The CPU is already deactivated in the timer migration 1240 * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event() 1241 * and thereby the timer idle path is executed once more. @tmc->wakeup 1242 * holds the first timer, when the timer migration hierarchy is 1243 * completely idle. 1244 * 1245 * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if 1246 * nothing needs to be done. 1247 */ 1248 u64 tmigr_cpu_new_timer(u64 nextexp) 1249 { 1250 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1251 u64 ret; 1252 1253 if (tmigr_is_not_available(tmc)) 1254 return nextexp; 1255 1256 raw_spin_lock(&tmc->lock); 1257 1258 ret = READ_ONCE(tmc->wakeup); 1259 if (nextexp != KTIME_MAX) { 1260 if (nextexp != tmc->cpuevt.nextevt.expires || 1261 tmc->cpuevt.ignore) { 1262 ret = tmigr_new_timer(tmc, nextexp); 1263 /* 1264 * Make sure the reevaluation of timers in idle path 1265 * will not miss an event. 1266 */ 1267 WRITE_ONCE(tmc->wakeup, ret); 1268 } 1269 } 1270 trace_tmigr_cpu_new_timer_idle(tmc, nextexp); 1271 raw_spin_unlock(&tmc->lock); 1272 return ret; 1273 } 1274 1275 static bool tmigr_inactive_up(struct tmigr_group *group, 1276 struct tmigr_group *child, 1277 struct tmigr_walk *data) 1278 { 1279 union tmigr_state curstate, newstate, childstate; 1280 bool walk_done; 1281 u8 childmask; 1282 1283 childmask = data->childmask; 1284 childstate.state = 0; 1285 1286 /* 1287 * The memory barrier is paired with the cmpxchg() in tmigr_active_up() 1288 * to make sure the updates of child and group states are ordered. The 1289 * ordering is mandatory, as the group state change depends on the child 1290 * state. 1291 */ 1292 curstate.state = atomic_read_acquire(&group->migr_state); 1293 1294 for (;;) { 1295 if (child) 1296 childstate.state = atomic_read(&child->migr_state); 1297 1298 newstate = curstate; 1299 walk_done = true; 1300 1301 /* Reset active bit when the child is no longer active */ 1302 if (!childstate.active) 1303 newstate.active &= ~childmask; 1304 1305 if (newstate.migrator == childmask) { 1306 /* 1307 * Find a new migrator for the group, because the child 1308 * group is idle! 1309 */ 1310 if (!childstate.active) { 1311 unsigned long new_migr_bit, active = newstate.active; 1312 1313 new_migr_bit = find_first_bit(&active, BIT_CNT); 1314 1315 if (new_migr_bit != BIT_CNT) { 1316 newstate.migrator = BIT(new_migr_bit); 1317 } else { 1318 newstate.migrator = TMIGR_NONE; 1319 1320 /* Changes need to be propagated */ 1321 walk_done = false; 1322 } 1323 } 1324 } 1325 1326 newstate.seq++; 1327 1328 WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); 1329 1330 if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { 1331 trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); 1332 break; 1333 } 1334 1335 /* 1336 * The memory barrier is paired with the cmpxchg() in 1337 * tmigr_active_up() to make sure the updates of child and group 1338 * states are ordered. It is required only when the above 1339 * try_cmpxchg() fails. 1340 */ 1341 smp_mb__after_atomic(); 1342 } 1343 1344 data->remote = false; 1345 1346 /* Event Handling */ 1347 tmigr_update_events(group, child, data); 1348 1349 return walk_done; 1350 } 1351 1352 static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) 1353 { 1354 struct tmigr_walk data = { .nextexp = nextexp, 1355 .firstexp = KTIME_MAX, 1356 .evt = &tmc->cpuevt, 1357 .childmask = tmc->groupmask }; 1358 1359 /* 1360 * If nextexp is KTIME_MAX, the CPU event will be ignored because the 1361 * local timer expires before the global timer, no global timer is set 1362 * or CPU goes offline. 1363 */ 1364 if (nextexp != KTIME_MAX) 1365 tmc->cpuevt.ignore = false; 1366 1367 walk_groups(&tmigr_inactive_up, &data, tmc); 1368 return data.firstexp; 1369 } 1370 1371 /** 1372 * tmigr_cpu_deactivate() - Put current CPU into inactive state 1373 * @nextexp: The next global timer expiry of the current CPU 1374 * 1375 * Must be called with interrupts disabled. 1376 * 1377 * Return: the next event expiry of the current CPU or the next event expiry 1378 * from the hierarchy if this CPU is the top level migrator or the hierarchy is 1379 * completely idle. 1380 */ 1381 u64 tmigr_cpu_deactivate(u64 nextexp) 1382 { 1383 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1384 u64 ret; 1385 1386 if (tmigr_is_not_available(tmc)) 1387 return nextexp; 1388 1389 raw_spin_lock(&tmc->lock); 1390 1391 ret = __tmigr_cpu_deactivate(tmc, nextexp); 1392 1393 tmc->idle = true; 1394 1395 /* 1396 * Make sure the reevaluation of timers in idle path will not miss an 1397 * event. 1398 */ 1399 WRITE_ONCE(tmc->wakeup, ret); 1400 1401 trace_tmigr_cpu_idle(tmc, nextexp); 1402 raw_spin_unlock(&tmc->lock); 1403 return ret; 1404 } 1405 1406 /** 1407 * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to 1408 * go idle 1409 * @nextevt: The next global timer expiry of the current CPU 1410 * 1411 * Return: 1412 * * KTIME_MAX - when it is probable that nothing has to be done (not 1413 * the only one in the level 0 group; and if it is the 1414 * only one in level 0 group, but there are more than a 1415 * single group active on the way to top level) 1416 * * nextevt - when CPU is offline and has to handle timer on its own 1417 * or when on the way to top in every group only a single 1418 * child is active but @nextevt is before the lowest 1419 * next_expiry encountered while walking up to top level. 1420 * * next_expiry - value of lowest expiry encountered while walking groups 1421 * if only a single child is active on each and @nextevt 1422 * is after this lowest expiry. 1423 */ 1424 u64 tmigr_quick_check(u64 nextevt) 1425 { 1426 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1427 struct tmigr_group *group = tmc->tmgroup; 1428 1429 if (tmigr_is_not_available(tmc)) 1430 return nextevt; 1431 1432 if (WARN_ON_ONCE(tmc->idle)) 1433 return nextevt; 1434 1435 if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) 1436 return KTIME_MAX; 1437 1438 do { 1439 if (!tmigr_check_lonely(group)) 1440 return KTIME_MAX; 1441 1442 /* 1443 * Since current CPU is active, events may not be sorted 1444 * from bottom to the top because the CPU's event is ignored 1445 * up to the top and its sibling's events not propagated upwards. 1446 * Thus keep track of the lowest observed expiry. 1447 */ 1448 nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); 1449 group = group->parent; 1450 } while (group); 1451 1452 return nextevt; 1453 } 1454 1455 /* 1456 * tmigr_trigger_active() - trigger a CPU to become active again 1457 * 1458 * This function is executed on a CPU which is part of cpu_online_mask, when the 1459 * last active CPU in the hierarchy is offlining. With this, it is ensured that 1460 * the other CPU is active and takes over the migrator duty. 1461 */ 1462 static long tmigr_trigger_active(void *unused) 1463 { 1464 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1465 1466 WARN_ON_ONCE(!tmc->available || tmc->idle); 1467 1468 return 0; 1469 } 1470 1471 static unsigned int tmigr_get_capacity(int cpu) 1472 { 1473 /* 1474 * nohz_full CPUs need to make sure there is always an available (online) 1475 * and never idle migrator to handle all their global timers. That duty 1476 * is served by the timekeeper which then never stops its tick. But the 1477 * timekeeper must then belong to the same hierarchy as all the nohz_full 1478 * CPUs. Simply turn off capacity awareness when nohz_full is running. 1479 */ 1480 if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN)) 1481 return SCHED_CAPACITY_SCALE; 1482 else 1483 return arch_scale_cpu_capacity(cpu); 1484 } 1485 1486 static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu) 1487 { 1488 unsigned int capacity = tmigr_get_capacity(cpu); 1489 struct tmigr_hierarchy *iter; 1490 1491 list_for_each_entry(iter, &tmigr_hierarchy_list, node) { 1492 if (iter->capacity == capacity) 1493 return iter; 1494 } 1495 1496 return NULL; 1497 } 1498 1499 static int tmigr_clear_cpu_available(unsigned int cpu) 1500 { 1501 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1502 int migrator; 1503 u64 firstexp; 1504 1505 guard(mutex)(&tmigr_available_mutex); 1506 1507 cpumask_clear_cpu(cpu, tmigr_available_cpumask); 1508 scoped_guard(raw_spinlock_irq, &tmc->lock) { 1509 if (!tmc->available) 1510 return 0; 1511 tmc->available = false; 1512 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 1513 1514 /* 1515 * CPU has to handle the local events on his own, when on the way to 1516 * offline; Therefore nextevt value is set to KTIME_MAX 1517 */ 1518 firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); 1519 trace_tmigr_cpu_unavailable(tmc); 1520 } 1521 1522 if (firstexp != KTIME_MAX) { 1523 struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu); 1524 1525 if (WARN_ON_ONCE(!hier)) 1526 return -EINVAL; 1527 1528 migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask); 1529 if (migrator < nr_cpu_ids) { 1530 work_on_cpu(migrator, tmigr_trigger_active, NULL); 1531 } else { 1532 /* 1533 * If deactivation returned an expiration, it belongs to an available 1534 * nohz CPU in the hierarchy. 1535 */ 1536 WARN_ONCE(1, "Expected available CPU in the hierarchy\n"); 1537 } 1538 } 1539 1540 return 0; 1541 } 1542 1543 static int __tmigr_set_cpu_available(unsigned int cpu) 1544 { 1545 struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); 1546 1547 /* Check whether CPU data was successfully initialized */ 1548 if (WARN_ON_ONCE(!tmc->tmgroup)) 1549 return -EINVAL; 1550 1551 guard(mutex)(&tmigr_available_mutex); 1552 1553 cpumask_set_cpu(cpu, tmigr_available_cpumask); 1554 scoped_guard(raw_spinlock_irq, &tmc->lock) { 1555 if (tmc->available) 1556 return 0; 1557 trace_tmigr_cpu_available(tmc); 1558 tmc->idle = timer_base_is_idle(); 1559 if (!tmc->idle) 1560 __tmigr_cpu_activate(tmc); 1561 tmc->available = true; 1562 } 1563 return 0; 1564 } 1565 1566 static int tmigr_set_cpu_available(unsigned int cpu) 1567 { 1568 if (tmigr_is_isolated(cpu)) 1569 return 0; 1570 1571 return __tmigr_set_cpu_available(cpu); 1572 } 1573 1574 static void tmigr_cpu_isolate(struct work_struct *ignored) 1575 { 1576 tmigr_clear_cpu_available(smp_processor_id()); 1577 } 1578 1579 static void tmigr_cpu_unisolate(struct work_struct *ignored) 1580 { 1581 /* 1582 * Don't call tmigr_is_isolated() ->housekeeping_cpu() directly because 1583 * the cpuset mutex is correctly held by the workqueue caller but lockdep 1584 * doesn't know that. 1585 */ 1586 __tmigr_set_cpu_available(smp_processor_id()); 1587 } 1588 1589 /** 1590 * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy 1591 * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy 1592 * 1593 * This function can be called from cpuset code to provide the new set of 1594 * isolated CPUs that should be excluded from the hierarchy. 1595 * Online CPUs not present in exclude_cpumask but already excluded are brought 1596 * back to the hierarchy. 1597 * Functions to isolate/unisolate need to be called locally and can sleep. 1598 */ 1599 int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) 1600 { 1601 struct work_struct __percpu *works __free(free_percpu) = 1602 alloc_percpu(struct work_struct); 1603 cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; 1604 int cpu; 1605 1606 if (!works) 1607 return -ENOMEM; 1608 if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) 1609 return -ENOMEM; 1610 1611 /* 1612 * First set previously isolated CPUs as available (unisolate). 1613 * This cpumask contains only CPUs that switched to available now. 1614 */ 1615 guard(cpus_read_lock)(); 1616 cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); 1617 cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); 1618 1619 for_each_cpu(cpu, cpumask) { 1620 struct work_struct *work = per_cpu_ptr(works, cpu); 1621 1622 INIT_WORK(work, tmigr_cpu_unisolate); 1623 schedule_work_on(cpu, work); 1624 } 1625 for_each_cpu(cpu, cpumask) 1626 flush_work(per_cpu_ptr(works, cpu)); 1627 1628 /* 1629 * Then clear previously available CPUs (isolate). 1630 * This cpumask contains only CPUs that switched to not available now. 1631 * There cannot be overlap with the newly available ones. 1632 */ 1633 cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); 1634 cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); 1635 /* 1636 * Handle this here and not in the cpuset code because exclude_cpumask 1637 * might include also the tick CPU if included in isolcpus. 1638 */ 1639 for_each_cpu(cpu, cpumask) { 1640 if (!tick_nohz_cpu_hotpluggable(cpu)) { 1641 cpumask_clear_cpu(cpu, cpumask); 1642 break; 1643 } 1644 } 1645 1646 for_each_cpu(cpu, cpumask) { 1647 struct work_struct *work = per_cpu_ptr(works, cpu); 1648 1649 INIT_WORK(work, tmigr_cpu_isolate); 1650 schedule_work_on(cpu, work); 1651 } 1652 for_each_cpu(cpu, cpumask) 1653 flush_work(per_cpu_ptr(works, cpu)); 1654 1655 return 0; 1656 } 1657 1658 static int __init tmigr_init_isolation(void) 1659 { 1660 cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; 1661 1662 static_branch_enable(&tmigr_exclude_isolated); 1663 1664 if (!housekeeping_enabled(HK_TYPE_DOMAIN)) 1665 return 0; 1666 if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) 1667 return -ENOMEM; 1668 1669 cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); 1670 1671 /* Protect against RCU torture hotplug testing */ 1672 return tmigr_isolated_exclude_cpumask(cpumask); 1673 } 1674 late_initcall(tmigr_init_isolation); 1675 1676 static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, 1677 int node) 1678 { 1679 union tmigr_state s; 1680 1681 raw_spin_lock_init(&group->lock); 1682 1683 group->level = lvl; 1684 group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE; 1685 1686 group->num_children = 0; 1687 1688 s.migrator = TMIGR_NONE; 1689 s.active = 0; 1690 s.seq = 0; 1691 atomic_set(&group->migr_state, s.state); 1692 1693 timerqueue_init_head(&group->events); 1694 timerqueue_init(&group->groupevt.nextevt); 1695 group->groupevt.nextevt.expires = KTIME_MAX; 1696 WRITE_ONCE(group->next_expiry, KTIME_MAX); 1697 group->groupevt.ignore = true; 1698 } 1699 1700 static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl) 1701 { 1702 struct tmigr_group *tmp, *group = NULL; 1703 1704 lockdep_assert_held(&tmigr_mutex); 1705 1706 /* Try to attach to an existing group first */ 1707 list_for_each_entry(tmp, &hier->level_list[lvl], list) { 1708 /* 1709 * If @lvl is below the cross NUMA node level, check whether 1710 * this group belongs to the same NUMA node. 1711 */ 1712 if (lvl < tmigr_crossnode_level && tmp->numa_node != node) 1713 continue; 1714 1715 /* Capacity left? */ 1716 if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP) 1717 continue; 1718 1719 /* 1720 * TODO: A possible further improvement: Make sure that all CPU 1721 * siblings end up in the same group of the lowest level of the 1722 * hierarchy. Rely on the topology sibling mask would be a 1723 * reasonable solution. 1724 */ 1725 1726 group = tmp; 1727 break; 1728 } 1729 1730 if (group) 1731 return group; 1732 1733 /* Allocate and set up a new group */ 1734 group = kzalloc_node(sizeof(*group), GFP_KERNEL, node); 1735 if (!group) 1736 return ERR_PTR(-ENOMEM); 1737 1738 tmigr_init_group(group, lvl, node); 1739 1740 /* Setup successful. Add it to the hierarchy */ 1741 list_add(&group->list, &hier->level_list[lvl]); 1742 trace_tmigr_group_set(group); 1743 return group; 1744 } 1745 1746 static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate) 1747 { 1748 if (!group->parent && group != hier->root) { 1749 /* 1750 * This is the new top-level, prepare its groupmask in advance 1751 * to avoid accidents where yet another new top-level is 1752 * created in the future and made visible before this groupmask. 1753 */ 1754 group->groupmask = BIT(0); 1755 WARN_ON_ONCE(activate); 1756 1757 return true; 1758 } 1759 1760 return false; 1761 1762 } 1763 1764 static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child, 1765 struct tmigr_group *parent, bool activate) 1766 { 1767 if (tmigr_init_root(hier, parent, activate)) { 1768 /* 1769 * The previous top level had prepared its groupmask already, 1770 * simply account it in advance as the first child. If some groups 1771 * have been created between the old and new root due to node 1772 * mismatch, the new root's child will be intialized accordingly. 1773 */ 1774 parent->num_children = 1; 1775 } 1776 1777 /* Connecting old root to new root ? */ 1778 if (!parent->parent && activate) { 1779 /* 1780 * @child is the old top, or in case of node mismatch, some 1781 * intermediate group between the old top and the new one in 1782 * @parent. In this case the @child must be pre-accounted above 1783 * as the first child. Its new inactive sibling corresponding 1784 * to the CPU going up has been accounted as the second child. 1785 */ 1786 WARN_ON_ONCE(parent->num_children != 2); 1787 child->groupmask = BIT(0); 1788 } else { 1789 /* Common case adding @child for the CPU going up to @parent. */ 1790 child->groupmask = BIT(parent->num_children++); 1791 } 1792 1793 /* 1794 * Make sure parent initialization is visible before publishing it to a 1795 * racing CPU entering/exiting idle. This RELEASE barrier enforces an 1796 * address dependency that pairs with the READ_ONCE() in __walk_groups(). 1797 */ 1798 smp_store_release(&child->parent, parent); 1799 1800 trace_tmigr_connect_child_parent(hier, child); 1801 } 1802 1803 static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu, 1804 unsigned int node, struct tmigr_group *start, bool activate) 1805 { 1806 struct tmigr_group *root = hier->root, *group, *child, **stack; 1807 int i, top = 0, err = 0, start_lvl = 0; 1808 bool root_mismatch = false; 1809 1810 stack = kzalloc_objs(*stack, tmigr_hierarchy_levels); 1811 if (!stack) 1812 return -ENOMEM; 1813 1814 if (start) { 1815 stack[start->level] = start; 1816 start_lvl = start->level + 1; 1817 } 1818 1819 if (root) 1820 root_mismatch = root->numa_node != node; 1821 1822 for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { 1823 group = tmigr_get_group(hier, node, i); 1824 if (IS_ERR(group)) { 1825 err = PTR_ERR(group); 1826 i--; 1827 break; 1828 } 1829 1830 top = i; 1831 stack[i] = group; 1832 1833 /* 1834 * When booting only less CPUs of a system than CPUs are 1835 * available, not all calculated hierarchy levels are required, 1836 * unless a node mismatch is detected. 1837 * 1838 * The loop is aborted as soon as the highest level, which might 1839 * be different from tmigr_hierarchy_levels, contains only a 1840 * single group, unless the nodes mismatch below tmigr_crossnode_level 1841 */ 1842 if (group->parent) 1843 break; 1844 if ((!root_mismatch || i >= tmigr_crossnode_level) && 1845 list_is_singular(&hier->level_list[i])) 1846 break; 1847 } 1848 1849 /* Assert single root without parent */ 1850 if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) 1851 return -EINVAL; 1852 1853 for (; i >= start_lvl; i--) { 1854 group = stack[i]; 1855 1856 if (err < 0) { 1857 list_del(&group->list); 1858 kfree(group); 1859 continue; 1860 } 1861 1862 WARN_ON_ONCE(i != group->level); 1863 1864 /* 1865 * Update tmc -> group / child -> group connection 1866 */ 1867 if (i == 0) { 1868 struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); 1869 1870 tmc->tmgroup = group; 1871 tmc->groupmask = BIT(group->num_children++); 1872 1873 tmigr_init_root(hier, group, activate); 1874 1875 trace_tmigr_connect_cpu_parent(hier, tmc); 1876 1877 /* There are no children that need to be connected */ 1878 continue; 1879 } else { 1880 child = stack[i - 1]; 1881 tmigr_connect_child_parent(hier, child, group, activate); 1882 } 1883 } 1884 1885 if (err < 0) 1886 goto out; 1887 1888 if (activate) { 1889 struct tmigr_walk data; 1890 union tmigr_state state; 1891 1892 /* 1893 * To prevent inconsistent states, active children need to be active in 1894 * the new parent as well. Inactive children are already marked inactive 1895 * in the parent group: 1896 * 1897 * * When new groups were created by tmigr_setup_groups() starting from 1898 * the lowest level, then they are not active. They will be set active 1899 * when the new online CPU comes active. 1900 * 1901 * * But if new groups above the current top level are required, it is 1902 * mandatory to propagate the active state of the already existing 1903 * child to the new parents. So tmigr_active_up() activates the 1904 * new parents while walking up from the old root to the new. 1905 * 1906 * * It is ensured that @start is active, (or on the way to be activated 1907 * by another CPU that woke up before the current one) as this setup path 1908 * is executed in hotplug prepare callback. This is executed by an already 1909 * connected and !idle CPU in the hierarchy. 1910 * 1911 * * The below RmW atomic operation ensures that: 1912 * 1913 * 1) If the old root has been completely activated, the latest state is 1914 * acquired (the below implicit acquire pairs with the implicit release 1915 * from cmpxchg() in tmigr_active_up()). 1916 * 1917 * 2) If the old root is still on the way to be activated, the lagging behind 1918 * CPU performing the activation will acquire the links up to the new root. 1919 * (The below implicit release pairs with the implicit acquire from cmpxchg() 1920 * in tmigr_active_up()). 1921 * 1922 * 3) Every subsequent CPU below the old root will acquire the new links while 1923 * walking through the old root (The below implicit release pairs with the 1924 * implicit acquire from cmpxchg() in either tmigr_active_up()) or 1925 * tmigr_inactive_up(). 1926 */ 1927 state.state = atomic_fetch_or(0, &start->migr_state); 1928 WARN_ON_ONCE(!start->parent); 1929 /* 1930 * If the state of the old root is inactive, another CPU is on its way to activate 1931 * it and propagate to the new root. 1932 */ 1933 if (state.active) { 1934 data.childmask = start->groupmask; 1935 __walk_groups_from(tmigr_active_up, &data, start, start->parent); 1936 } 1937 } else if (start) { 1938 union tmigr_state state; 1939 1940 /* Remote activation assumes the whole target's hierarchy is inactive */ 1941 state.state = atomic_read(&start->migr_state); 1942 WARN_ON_ONCE(state.active); 1943 } 1944 1945 /* Root update */ 1946 if (list_is_singular(&hier->level_list[top])) { 1947 group = list_first_entry(&hier->level_list[top], typeof(*group), list); 1948 WARN_ON_ONCE(group->parent); 1949 if (root) { 1950 /* Old root should be the same or below */ 1951 WARN_ON_ONCE(root->level > top); 1952 } 1953 hier->root = group; 1954 } 1955 out: 1956 kfree(stack); 1957 1958 return err; 1959 } 1960 1961 static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu) 1962 { 1963 struct tmigr_hierarchy *hier; 1964 1965 hier = __tmigr_get_hierarchy(cpu); 1966 1967 if (hier) 1968 return hier; 1969 1970 hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels); 1971 if (!hier) 1972 return ERR_PTR(-ENOMEM); 1973 1974 hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL); 1975 if (!hier->cpumask) { 1976 kfree(hier); 1977 return ERR_PTR(-ENOMEM); 1978 } 1979 1980 for (int i = 0; i < tmigr_hierarchy_levels; i++) 1981 INIT_LIST_HEAD(&hier->level_list[i]); 1982 1983 hier->capacity = tmigr_get_capacity(cpu); 1984 list_add_tail(&hier->node, &tmigr_hierarchy_list); 1985 1986 return hier; 1987 } 1988 1989 static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu, 1990 struct tmigr_group *old_root, bool activate) 1991 { 1992 /* 1993 * The target CPU must never do the prepare work, except 1994 * on early boot when the boot CPU is the target. Otherwise 1995 * it may spuriously activate the old top level group inside 1996 * the new one (nevertheless whether old top level group is 1997 * active or not) and/or release an uninitialized childmask. 1998 */ 1999 WARN_ON_ONCE(cpu == smp_processor_id()); 2000 if (activate) { 2001 /* 2002 * The current CPU is expected to be online in the hierarchy, 2003 * otherwise the old root may not be active as expected. 2004 */ 2005 WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available)); 2006 } 2007 2008 return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate); 2009 } 2010 2011 static long connect_old_root_work(void *arg) 2012 { 2013 struct tmigr_group *old_root = arg; 2014 struct tmigr_hierarchy *hier; 2015 int cpu = smp_processor_id(); 2016 2017 hier = __tmigr_get_hierarchy(cpu); 2018 if (WARN_ON_ONCE(!hier)) 2019 return -EINVAL; 2020 2021 return tmigr_connect_old_root(hier, cpu, old_root, true); 2022 } 2023 2024 static int tmigr_add_cpu(unsigned int cpu) 2025 { 2026 struct tmigr_hierarchy *hier; 2027 struct tmigr_group *old_root; 2028 int node = cpu_to_node(cpu); 2029 int ret; 2030 2031 guard(mutex)(&tmigr_mutex); 2032 2033 hier = tmigr_get_hierarchy(cpu); 2034 if (IS_ERR(hier)) 2035 return PTR_ERR(hier); 2036 2037 old_root = hier->root; 2038 2039 ret = tmigr_setup_groups(hier, cpu, node, NULL, false); 2040 2041 if (ret < 0) 2042 return ret; 2043 2044 /* Root has changed? Connect the old one to the new */ 2045 if (old_root && old_root != hier->root) { 2046 guard(migrate)(); 2047 2048 if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) { 2049 /* 2050 * If the target belong to the same hierarchy, the old root is expected 2051 * to be active. Link and propagate to the new root. 2052 */ 2053 ret = tmigr_connect_old_root(hier, cpu, old_root, true); 2054 } else { 2055 int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask); 2056 2057 if (target < nr_cpu_ids) { 2058 /* 2059 * If the target doesn't belong to the same hierarchy as the current 2060 * CPU, activate from a relevant one to make sure the old root is 2061 * active. 2062 */ 2063 ret = work_on_cpu(target, connect_old_root_work, old_root); 2064 } else { 2065 /* 2066 * No other available CPUs in the remote hierarchy. Link the 2067 * old root remotely but don't propagate activation since the 2068 * old root is not expected to be active. 2069 */ 2070 ret = tmigr_connect_old_root(hier, cpu, old_root, false); 2071 } 2072 } 2073 } 2074 2075 if (ret >= 0) 2076 cpumask_set_cpu(cpu, hier->cpumask); 2077 2078 return ret; 2079 } 2080 2081 static int tmigr_cpu_prepare(unsigned int cpu) 2082 { 2083 struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); 2084 int ret = 0; 2085 2086 /* Not first online attempt? */ 2087 if (tmc->tmgroup) 2088 return ret; 2089 2090 raw_spin_lock_init(&tmc->lock); 2091 timerqueue_init(&tmc->cpuevt.nextevt); 2092 tmc->cpuevt.nextevt.expires = KTIME_MAX; 2093 tmc->cpuevt.ignore = true; 2094 tmc->cpuevt.cpu = cpu; 2095 tmc->remote = false; 2096 WRITE_ONCE(tmc->wakeup, KTIME_MAX); 2097 2098 ret = tmigr_add_cpu(cpu); 2099 if (ret < 0) 2100 return ret; 2101 2102 if (tmc->groupmask == 0) 2103 return -EINVAL; 2104 2105 return ret; 2106 } 2107 2108 static int __init tmigr_init(void) 2109 { 2110 unsigned int cpulvl, nodelvl, cpus_per_node; 2111 unsigned int nnodes = num_possible_nodes(); 2112 unsigned int ncpus = num_possible_cpus(); 2113 int ret = -ENOMEM; 2114 2115 BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP); 2116 2117 /* Nothing to do if running on UP */ 2118 if (ncpus == 1) 2119 return 0; 2120 2121 if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { 2122 ret = -ENOMEM; 2123 goto err; 2124 } 2125 2126 /* 2127 * Calculate the required hierarchy levels. Unfortunately there is no 2128 * reliable information available, unless all possible CPUs have been 2129 * brought up and all NUMA nodes are populated. 2130 * 2131 * Estimate the number of levels with the number of possible nodes and 2132 * the number of possible CPUs. Assume CPUs are spread evenly across 2133 * nodes. We cannot rely on cpumask_of_node() because it only works for 2134 * online CPUs. 2135 */ 2136 cpus_per_node = DIV_ROUND_UP(ncpus, nnodes); 2137 2138 /* Calc the hierarchy levels required to hold the CPUs of a node */ 2139 cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node), 2140 ilog2(TMIGR_CHILDREN_PER_GROUP)); 2141 2142 /* Calculate the extra levels to connect all nodes */ 2143 nodelvl = DIV_ROUND_UP(order_base_2(nnodes), 2144 ilog2(TMIGR_CHILDREN_PER_GROUP)); 2145 2146 tmigr_hierarchy_levels = cpulvl + nodelvl; 2147 2148 /* 2149 * If a NUMA node spawns more than one CPU level group then the next 2150 * level(s) of the hierarchy contains groups which handle all CPU groups 2151 * of the same NUMA node. The level above goes across NUMA nodes. Store 2152 * this information for the setup code to decide in which level node 2153 * matching is no longer required. 2154 */ 2155 tmigr_crossnode_level = cpulvl; 2156 2157 pr_info("Timer migration: %d hierarchy levels; %d children per group;" 2158 " %d crossnode level\n", 2159 tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, 2160 tmigr_crossnode_level); 2161 2162 ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", 2163 tmigr_cpu_prepare, NULL); 2164 if (ret) 2165 goto err; 2166 2167 ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", 2168 tmigr_set_cpu_available, tmigr_clear_cpu_available); 2169 if (ret) 2170 goto err; 2171 2172 return 0; 2173 2174 err: 2175 pr_err("Timer migration setup failed\n"); 2176 return ret; 2177 } 2178 early_initcall(tmigr_init); 2179