Lines Matching +full:top +full:- +full:level

1 // SPDX-License-Identifier: GPL-2.0-only
16 #include "tick-internal.h"
23 * lowest level group contains CPUs, the next level groups of CPU groups
26 * CPUs per node even the next level might be kept as groups of CPU groups
35 * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5
38 * CPUS 0-7 8-15 16-23 24-31 32-39 40-47
81 * duties up to the top level of the hierarchy (LVL2 in the example). It
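As a quick cross-check of the example population above (48 CPUs, read here as two NUMA nodes with 24 CPUs each), the group count per level falls out of repeated division by TMIGR_CHILDREN_PER_GROUP (8). This is only an illustrative sketch with hard-coded numbers, not the setup code itself:

#include <linux/math.h>		/* DIV_ROUND_UP() */
#include "timer_migration.h"	/* TMIGR_CHILDREN_PER_GROUP */

static unsigned int example_group_population(void)
{
	unsigned int cpus_per_node = 24;	/* two nodes, 48 CPUs total */
	unsigned int lvl0_per_node, lvl1_per_node, lvl2_groups;

	lvl0_per_node = DIV_ROUND_UP(cpus_per_node, TMIGR_CHILDREN_PER_GROUP);	  /* 3 */
	lvl1_per_node = DIV_ROUND_UP(lvl0_per_node, TMIGR_CHILDREN_PER_GROUP);	  /* 1 */
	lvl2_groups = DIV_ROUND_UP(2 * lvl1_per_node, TMIGR_CHILDREN_PER_GROUP); /* 1 */

	/* 6 LVL0 groups (GRP0:0..5), 2 LVL1 groups (GRP1:0/1), 1 LVL2 group */
	return 2 * lvl0_per_node + 2 * lvl1_per_node + lvl2_groups;
}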
87 * ---------------
97 * --------------
103 * the per CPU tmigr_cpu->lock is held.
108 * When @timer_base->lock as well as tmigr related locks are required, the lock
109 * ordering is: first @timer_base->lock, afterwards tmigr related locks.
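A minimal sketch of that ordering, assuming a hypothetical helper that needs both the timer base lock and the per CPU tmigr_cpu lock (names are illustrative, not taken from the file):

static void example_lock_both(struct timer_base *base, struct tmigr_cpu *tmc)
{
	raw_spin_lock_irq(&base->lock);		/* timer_base lock first */
	raw_spin_lock(&tmc->lock);		/* tmigr related lock nested inside */

	/* ... work on the timer base and the migration hierarchy state ... */

	raw_spin_unlock(&tmc->lock);
	raw_spin_unlock_irq(&base->lock);
}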
113 * ------------------------------------------------
145 * --> migrator = TMIGR_NONE migrator = CPU2
146 * --> active = active = CPU2
149 * --> idle idle active idle
162 * --> migrator = CPU1 migrator = CPU2
163 * --> active = CPU1 active = CPU2
166 * idle --> active active idle
174 * --> migrator = GRP0:1
175 * --> active = GRP0:0, GRP0:1
187 * --> migrator = GRP0:1
188 * --> active = GRP0:1
203 * expected value (compare-and-exchange).
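The pattern behind this is the usual atomic_try_cmpxchg() retry loop over the packed group state. A simplified sketch of the "child becomes active" case, mirroring the tmigr_active_up() fragments further down; it assumes the union tmigr_state layout with @active/@migrator bytes plus a sequence counter and omits tracing and error handling:

static bool example_mark_child_active(struct tmigr_group *group, u8 childmask)
{
	union tmigr_state curstate, newstate;
	bool walk_done;

	curstate.state = atomic_read(&group->migr_state);
	do {
		newstate = curstate;
		walk_done = true;

		/* Claim migrator duty if the group had no migrator so far */
		if (newstate.migrator == TMIGR_NONE) {
			newstate.migrator = childmask;
			/* The parent must learn that this group is active now */
			walk_done = false;
		}

		newstate.active |= childmask;
		newstate.seq++;
	} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state,
				     newstate.state));

	return walk_done;
}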
211 * ----------------------------------------------------------
242 * migrator = TMIGR_NONE --> migrator = TMIGR_NONE
243 * active = --> active =
247 * idle idle --> idle idle
250 * child going idle in top level group, the expiry of the next group event
255 * --> migrator = TMIGR_NONE
256 * --> active =
265 * idle idle --> idle idle
278 * --> next_expiry = TIMER0 next_expiry = KTIME_MAX
287 * top level group.
292 * --> next_expiry = TIMER0
306 * -------------------------- ---------------------------
308 * cmpxchg(&GRP1:0->state);
310 * spin_lock(&GRP1:0->lock);
313 * spin_unlock(&GRP1:0->lock);
317 * spin_lock(&GRP1:0->lock)
319 * group_state = atomic_read(&GRP1:0->state)
322 * spin_unlock(&GRP1:0->lock) <3>
333 * -----------------------------------------------------------
348 * --> timerqueue = evt-GRP0:0
355 * timerqueue = evt-CPU0, timerqueue =
356 * evt-CPU1
370 * --> timerqueue =
376 * --> groupevt.cpu = CPU0 groupevt.cpu =
377 * timerqueue = evt-CPU0, timerqueue =
378 * evt-CPU1
398 * --> timerqueue = evt-GRP0:0
404 * --> groupevt.cpu = CPU1 groupevt.cpu =
405 * --> timerqueue = evt-CPU1 timerqueue =
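Stripped of locking and of the ignore handling, the timerqueue update shown in the diagrams boils down to the following pattern, condensed from the tmigr_update_events() fragments further down (the helper itself is hypothetical):

static void example_requeue_group_event(struct tmigr_group *group,
					struct tmigr_event *evt,
					u64 nextexp, unsigned int cpu)
{
	lockdep_assert_held(&group->lock);

	/* Remove a stale copy of the event; an empty queue has no expiry */
	if (timerqueue_node_queued(&evt->nextevt)) {
		if (!timerqueue_del(&group->events, &evt->nextevt))
			WRITE_ONCE(group->next_expiry, KTIME_MAX);
	}

	evt->nextevt.expires = nextexp;
	evt->cpu = cpu;

	/* timerqueue_add() returns true if @evt became the new first event */
	if (timerqueue_add(&group->events, &evt->nextevt))
		WRITE_ONCE(group->next_expiry, nextexp);
}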
444 return !(tmc->tmgroup && tmc->available);
476 * group is not active - so no migrator is set.
482 s.state = atomic_read(&group->migr_state);
496 s.state = atomic_read(&group->migr_state);
512 s.state = atomic_read(&group->migr_state);
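Reconstructed around the fragments above, the lockless migrator check reads the packed state once and compares the migrator byte; roughly (assuming TMIGR_NONE marks "no migrator"):

static bool example_check_migrator(struct tmigr_group *group, u8 childmask)
{
	union tmigr_state s;

	s.state = atomic_read(&group->migr_state);

	/* True when this child is the migrator or the group has none at all */
	return (s.migrator == childmask) || (s.migrator == TMIGR_NONE);
}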
520 * struct tmigr_walk - data required for walking the hierarchy
530 * in the top level group only. Be aware that a new top level
531 * of the hierarchy could appear between the 'top level
535 * final top level. This is not a problem, as the worst
564 WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
574 group = READ_ONCE(group->parent);
575 data->childmask = child->groupmask;
576 WARN_ON_ONCE(!data->childmask);
583 __walk_groups_from(up, data, NULL, tmc->tmgroup);
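Assembled from the fragments above, the bottom-up walk has roughly this shape (a simplified reconstruction; the real __walk_groups_from() additionally takes the start group and child as arguments):

static void example_walk_groups(bool (*up)(struct tmigr_group *,
					   struct tmigr_group *,
					   struct tmigr_walk *),
				struct tmigr_walk *data,
				struct tmigr_group *group)
{
	struct tmigr_group *child = NULL;

	do {
		WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);

		/* Stop once the handler reports nothing more to propagate */
		if (up(group, child, data))
			break;

		child = group;
		/* Pairs with the release store connecting a new parent group */
		group = READ_ONCE(group->parent);
		data->childmask = child->groupmask;
		WARN_ON_ONCE(!data->childmask);
	} while (group);
}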
588 lockdep_assert_held(&tmc->lock);
594 * Returns the next event of the timerqueue @group->events
604 lockdep_assert_held(&group->lock);
606 WRITE_ONCE(group->next_expiry, KTIME_MAX);
608 while ((node = timerqueue_getnext(&group->events))) {
611 if (!READ_ONCE(evt->ignore)) {
612 WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
620 if (!timerqueue_del(&group->events, node))
637 if (!evt || now < evt->nextevt.expires)
643 timerqueue_del(&group->events, &evt->nextevt);
658 return evt->nextevt.expires;
669 childmask = data->childmask;
675 curstate.state = atomic_read(&group->migr_state);
691 } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
707 WRITE_ONCE(group->groupevt.ignore, true);
716 data.childmask = tmc->groupmask;
720 tmc->cpuevt.ignore = true;
721 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
727 * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy
738 if (WARN_ON_ONCE(!tmc->idle))
741 raw_spin_lock(&tmc->lock);
742 tmc->idle = false;
744 raw_spin_unlock(&tmc->lock);
748 * Returns true if there is nothing to be propagated to the next level
750 * @data->firstexp is set to expiry of first global event of the (top level of
756 * the documentation at the top.
766 bool remote = data->remote;
772 raw_spin_lock(&child->lock);
773 raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING);
775 childstate.state = atomic_read(&child->migr_state);
776 groupstate.state = atomic_read(&group->migr_state);
784 nextexp = child->next_expiry;
785 evt = &child->groupevt;
794 WRITE_ONCE(evt->ignore, ignore);
796 nextexp = data->nextexp;
798 first_childevt = evt = data->evt;
799 ignore = evt->ignore;
806 * expiry" in the documentation at the top).
811 * - When entering this path by tmigr_new_timer(), @evt->ignore
813 * - tmigr_inactive_up() takes care of the propagation by
816 * locking at this level, because the upper walking call to
821 * single level so @group is the top level group, make sure the
825 if (ignore && !remote && group->parent)
828 raw_spin_lock(&group->lock);
831 groupstate.state = atomic_read(&group->migr_state);
838 if (timerqueue_node_queued(&evt->nextevt)) {
839 if ((evt->nextevt.expires == nextexp) && !ignore) {
841 evt->cpu = first_childevt->cpu;
845 if (!timerqueue_del(&group->events, &evt->nextevt))
846 WRITE_ONCE(group->next_expiry, KTIME_MAX);
860 * of the group needs to be propagated to a higher level to
866 evt->nextevt.expires = nextexp;
867 evt->cpu = first_childevt->cpu;
869 if (timerqueue_add(&group->events, &evt->nextevt))
870 WRITE_ONCE(group->next_expiry, nextexp);
874 if (!group->parent && (groupstate.migrator == TMIGR_NONE)) {
879 * handling. The first timer in the top level group which needs to be
880 * handled when the top level group is not active is calculated
887 * The top level group is idle and it has to be ensured that the
893 data->firstexp = tmigr_next_groupevt_expires(group);
900 raw_spin_unlock(&group->lock);
903 raw_spin_unlock(&child->lock);
924 .evt = &tmc->cpuevt };
926 lockdep_assert_held(&tmc->lock);
928 if (tmc->remote)
933 tmc->cpuevt.ignore = false;
951 raw_spin_lock_irq(&tmc->lock);
968 if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
969 now < tmc->cpuevt.nextevt.expires) {
970 raw_spin_unlock_irq(&tmc->lock);
976 tmc->remote = true;
977 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
980 raw_spin_unlock_irq(&tmc->lock);
986 * Lock ordering needs to be preserved - timer_base locks before tmigr
988 * the top). While fetching the next timer interrupt, also tmc->lock
1002 raw_spin_lock(&tmc->lock);
1013 * remote expiry" in the documentation at the top)
1015 if (!tmc->available || !tmc->idle) {
1026 data.evt = &tmc->cpuevt;
1032 * after a remote expiry" in the documentation at the top)
1037 tmc->remote = false;
1038 raw_spin_unlock_irq(&tmc->lock);
1050 jif = data->basej;
1051 now = data->now;
1053 childmask = data->childmask;
1065 raw_spin_lock_irq(&group->lock);
1070 unsigned int remote_cpu = evt->cpu;
1072 raw_spin_unlock_irq(&group->lock);
1082 * (group->next_expiry was updated by tmigr_next_expired_groupevt(),
1085 data->firstexp = group->next_expiry;
1087 raw_spin_unlock_irq(&group->lock);
1093 * tmigr_handle_remote() - Handle global timers of remote idle CPUs
1105 data.childmask = tmc->groupmask;
1113 if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) {
1119 if (READ_ONCE(tmc->wakeup) == KTIME_MAX)
1126 * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to
1127 * KTIME_MAX. Even if tmc->lock is not held during the whole remote
1128 * handling, it is fine for tmc->wakeup to be stale as it is called in
1135 raw_spin_lock_irq(&tmc->lock);
1136 WRITE_ONCE(tmc->wakeup, data.firstexp);
1137 raw_spin_unlock_irq(&tmc->lock);
1146 childmask = data->childmask;
1162 data->firstexp = READ_ONCE(group->next_expiry);
1163 if (data->now >= data->firstexp) {
1164 data->check = true;
1168 raw_spin_lock(&group->lock);
1169 data->firstexp = group->next_expiry;
1170 if (data->now >= group->next_expiry) {
1171 data->check = true;
1172 raw_spin_unlock(&group->lock);
1175 raw_spin_unlock(&group->lock);
1182 * tmigr_requires_handle_remote() - Check the need of remote timer handling
1197 data.childmask = tmc->groupmask;
1205 * Check is done lockless as interrupts are disabled and @tmc->idle is
1208 if (!tmc->idle) {
1215 * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock
1221 if (data.now >= READ_ONCE(tmc->wakeup))
1224 raw_spin_lock(&tmc->lock);
1225 if (data.now >= tmc->wakeup)
1227 raw_spin_unlock(&tmc->lock);
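For context, a hedged sketch of how these two entry points are typically consumed from the tick path: the cheap check decides whether the timer softirq has to be raised, and the softirq handler then performs the remote expiry. The exact call sites live in kernel/time/timer.c and may differ in detail:

static void example_local_tick(void)
{
	/* Raise the softirq only when there is remote work to do */
	if (tmigr_requires_handle_remote())
		raise_softirq(TIMER_SOFTIRQ);
}

static void example_timer_softirq(void)
{
	/* Expire global timers on behalf of idle remote CPUs */
	tmigr_handle_remote();
}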
1234 * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc)
1239 * and thereby the timer idle path is executed once more. @tmc->wakeup
1254 raw_spin_lock(&tmc->lock);
1256 ret = READ_ONCE(tmc->wakeup);
1258 if (nextexp != tmc->cpuevt.nextevt.expires ||
1259 tmc->cpuevt.ignore) {
1265 WRITE_ONCE(tmc->wakeup, ret);
1269 raw_spin_unlock(&tmc->lock);
1281 childmask = data->childmask;
1290 curstate.state = atomic_read_acquire(&group->migr_state);
1294 childstate.state = atomic_read(&child->migr_state);
1328 if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) {
1342 data->remote = false;
1354 .evt = &tmc->cpuevt,
1355 .childmask = tmc->groupmask };
1363 tmc->cpuevt.ignore = false;
1370 * tmigr_cpu_deactivate() - Put current CPU into inactive state
1376 * from the hierarchy if this CPU is the top level migrator or the hierarchy is
1387 raw_spin_lock(&tmc->lock);
1391 tmc->idle = true;
1397 WRITE_ONCE(tmc->wakeup, ret);
1400 raw_spin_unlock(&tmc->lock);
1405 * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to
1410 * * KTIME_MAX - when it is probable that nothing has to be done (not
1411 * the only one in the level 0 group; and if it is the
1412 * only one in level 0 group, but there are more than a
1413 * single group active on the way to top level)
1414 * * nextevt - when CPU is offline and has to handle timer on its own
1415 * or when on the way to top in every group only a single
1417 * next_expiry encountered while walking up to top level.
1418 * * next_expiry - value of lowest expiry encountered while walking groups
1425 struct tmigr_group *group = tmc->tmgroup;
1430 if (WARN_ON_ONCE(tmc->idle))
1433 if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask))
1442 * from bottom to the top because the CPU's event is ignored
1443 * up to the top and its siblings' events are not propagated upwards.
1446 nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
1447 group = group->parent;
1454 * tmigr_trigger_active() - trigger a CPU to become active again
1464 WARN_ON_ONCE(!tmc->available || tmc->idle);
1478 scoped_guard(raw_spinlock_irq, &tmc->lock) {
1479 if (!tmc->available)
1481 tmc->available = false;
1482 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
1505 if (WARN_ON_ONCE(!tmc->tmgroup))
1506 return -EINVAL;
1514 scoped_guard(raw_spinlock_irq, &tmc->lock) {
1515 if (tmc->available)
1518 tmc->idle = timer_base_is_idle();
1519 if (!tmc->idle)
1521 tmc->available = true;
1537 * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
1556 return -ENOMEM;
1558 return -ENOMEM;
1615 return -ENOMEM;
1630 raw_spin_lock_init(&group->lock);
1632 group->level = lvl;
1633 group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
1635 group->num_children = 0;
1640 atomic_set(&group->migr_state, s.state);
1642 timerqueue_init_head(&group->events);
1643 timerqueue_init(&group->groupevt.nextevt);
1644 group->groupevt.nextevt.expires = KTIME_MAX;
1645 WRITE_ONCE(group->next_expiry, KTIME_MAX);
1646 group->groupevt.ignore = true;
1658 * If @lvl is below the cross NUMA node level, check whether
1661 if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
1665 if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP)
1670 * siblings end up in the same group of the lowest level of the
1685 return ERR_PTR(-ENOMEM);
1690 list_add(&group->list, &tmigr_level_list[lvl]);
1697 if (!group->parent && group != tmigr_root) {
1699 * This is the new top-level; prepare its groupmask in advance
1700 * to avoid accidents where yet another new top-level is
1703 group->groupmask = BIT(0);
1719 * The previous top level had prepared its groupmask already,
1724 parent->num_children = 1;
1728 if (!parent->parent && activate) {
1730 * @child is the old top, or in case of node mismatch, some
1731 * intermediate group between the old top and the new one in
1732 * @parent. In this case the @child must be pre-accounted above
1736 WARN_ON_ONCE(parent->num_children != 2);
1737 child->groupmask = BIT(0);
1740 child->groupmask = BIT(parent->num_children++);
1748 smp_store_release(&child->parent, parent);
1757 int i, top = 0, err = 0, start_lvl = 0;
1762 return -ENOMEM;
1765 stack[start->level] = start;
1766 start_lvl = start->level + 1;
1770 root_mismatch = tmigr_root->numa_node != node;
1776 i--;
1780 top = i;
1788 * The loop is aborted as soon as the highest level, which might
1792 if (group->parent)
1801 return -EINVAL;
1803 for (; i >= start_lvl; i--) {
1807 list_del(&group->list);
1812 WARN_ON_ONCE(i != group->level);
1815 * Update tmc -> group / child -> group connection
1820 tmc->tmgroup = group;
1821 tmc->groupmask = BIT(group->num_children++);
1830 child = stack[i - 1];
1848 * the lowest level, then they are not active. They will be set active
1851 * * But if new groups above the current top level are required, it is
1859 * the CPU executing the setup will be responsible up to current top
1860 * level group. And the next time it goes inactive, it will release
1864 state.state = atomic_read(&start->migr_state);
1866 WARN_ON_ONCE(!start->parent);
1867 data.childmask = start->groupmask;
1868 __walk_groups_from(tmigr_active_up, &data, start, start->parent);
1872 if (list_is_singular(&tmigr_level_list[top])) {
1873 group = list_first_entry(&tmigr_level_list[top],
1875 WARN_ON_ONCE(group->parent);
1878 WARN_ON_ONCE(tmigr_root->level > top);
1903 * it may spuriously activate the old top level group inside
1904 * the new one (regardless of whether the old top level group is
1912 WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
1913 ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
1925 if (tmc->tmgroup)
1928 raw_spin_lock_init(&tmc->lock);
1929 timerqueue_init(&tmc->cpuevt.nextevt);
1930 tmc->cpuevt.nextevt.expires = KTIME_MAX;
1931 tmc->cpuevt.ignore = true;
1932 tmc->cpuevt.cpu = cpu;
1933 tmc->remote = false;
1934 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
1940 if (tmc->groupmask == 0)
1941 return -EINVAL;
1951 int ret = -ENOMEM;
1960 ret = -ENOMEM;
1987 * If a NUMA node spawns more than one CPU level group then the next
1988 * level(s) of the hierarchy contain groups which handle all CPU groups
1989 * of the same NUMA node. The level above goes across NUMA nodes. Store
1990 * this information for the setup code to decide in which level node
2003 " %d crossnode level\n",
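As a worked example of that decision, read the diagram at the top as two NUMA nodes with 24 CPUs each: two levels are needed to cover one node (8 < 24 <= 64), so the third level is the first one whose groups may span nodes. A hedged arithmetic sketch, not the actual init code:

#include <linux/log2.h>		/* order_base_2(), ilog2() */
#include <linux/math.h>		/* DIV_ROUND_UP() */

static unsigned int __init example_crossnode_level(unsigned int cpus_per_node)
{
	/* Smallest n with TMIGR_CHILDREN_PER_GROUP^n >= cpus_per_node */
	return DIV_ROUND_UP(order_base_2(cpus_per_node),
			    ilog2(TMIGR_CHILDREN_PER_GROUP));
}

/* example_crossnode_level(24) == 2: LVL0/LVL1 stay per node, LVL2 crosses */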