Lines Matching +full:top +full:- +full:level

1 // SPDX-License-Identifier: GPL-2.0-only
16 #include "tick-internal.h"
23 * lowest level group contains CPUs, the next level groups of CPU groups
26 * CPUs per node even the next level might be kept as groups of CPU groups
35 * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5
38 * CPUS 0-7 8-15 16-23 24-31 32-39 40-47
81 * duties up to the top level of the hierarchy (LVL2 in the example). It
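As a quick cross-check of the example population above (48 CPUs, read here as two NUMA nodes with 24 CPUs each), the group count per level falls out of repeated division by TMIGR_CHILDREN_PER_GROUP (8). This is only an illustrative sketch with hard-coded numbers, not the setup code itself:

#include <linux/math.h>		/* DIV_ROUND_UP() */
#include "timer_migration.h"	/* TMIGR_CHILDREN_PER_GROUP */

static unsigned int example_group_population(void)
{
	unsigned int cpus_per_node = 24;	/* two nodes, 48 CPUs total */
	unsigned int lvl0_per_node, lvl1_per_node, lvl2_groups;

	lvl0_per_node = DIV_ROUND_UP(cpus_per_node, TMIGR_CHILDREN_PER_GROUP);	  /* 3 */
	lvl1_per_node = DIV_ROUND_UP(lvl0_per_node, TMIGR_CHILDREN_PER_GROUP);	  /* 1 */
	lvl2_groups = DIV_ROUND_UP(2 * lvl1_per_node, TMIGR_CHILDREN_PER_GROUP); /* 1 */

	/* 6 LVL0 groups (GRP0:0..5), 2 LVL1 groups (GRP1:0/1), 1 LVL2 group */
	return 2 * lvl0_per_node + 2 * lvl1_per_node + lvl2_groups;
}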
87 * ---------------
97 * --------------
103 * the per CPU tmigr_cpu->lock is held.
108 * When @timer_base->lock as well as tmigr related locks are required, the lock
109 * ordering is: first @timer_base->lock, afterwards tmigr related locks.
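A minimal sketch of that ordering, assuming a hypothetical helper that needs both the timer base lock and the per CPU tmigr_cpu lock (names are illustrative, not taken from the file):

static void example_lock_both(struct timer_base *base, struct tmigr_cpu *tmc)
{
	raw_spin_lock_irq(&base->lock);		/* timer_base lock first */
	raw_spin_lock(&tmc->lock);		/* tmigr related lock nested inside */

	/* ... work on the timer base and the migration hierarchy state ... */

	raw_spin_unlock(&tmc->lock);
	raw_spin_unlock_irq(&base->lock);
}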
113 * ------------------------------------------------
145 * --> migrator = TMIGR_NONE migrator = CPU2
146 * --> active = active = CPU2
149 * --> idle idle active idle
162 * --> migrator = CPU1 migrator = CPU2
163 * --> active = CPU1 active = CPU2
166 * idle --> active active idle
174 * --> migrator = GRP0:1
175 * --> active = GRP0:0, GRP0:1
187 * --> migrator = GRP0:1
188 * --> active = GRP0:1
203 * expected value (compare-and-exchange).
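The pattern behind this is the usual atomic_try_cmpxchg() retry loop over the packed group state. A simplified sketch of the "child becomes active" case, mirroring the tmigr_active_up() fragments further down; it assumes the union tmigr_state layout with @active/@migrator bytes plus a sequence counter and omits tracing and error handling:

static bool example_mark_child_active(struct tmigr_group *group, u8 childmask)
{
	union tmigr_state curstate, newstate;
	bool walk_done;

	curstate.state = atomic_read(&group->migr_state);
	do {
		newstate = curstate;
		walk_done = true;

		/* Claim migrator duty if the group had no migrator so far */
		if (newstate.migrator == TMIGR_NONE) {
			newstate.migrator = childmask;
			/* The parent must learn that this group is active now */
			walk_done = false;
		}

		newstate.active |= childmask;
		newstate.seq++;
	} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state,
				     newstate.state));

	return walk_done;
}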
211 * ----------------------------------------------------------
242 * migrator = TMIGR_NONE --> migrator = TMIGR_NONE
243 * active = --> active =
247 * idle idle --> idle idle
250 * child going idle in top level group, the expiry of the next group event
255 * --> migrator = TMIGR_NONE
256 * --> active =
265 * idle idle --> idle idle
278 * --> next_expiry = TIMER0 next_expiry = KTIME_MAX
287 * top level group.
292 * --> next_expiry = TIMER0
306 * -------------------------- ---------------------------
308 * cmpxchg(&GRP1:0->state);
310 * spin_lock(&GRP1:0->lock);
313 * spin_unlock(&GRP1:0->lock);
317 * spin_lock(&GRP1:0->lock)
319 * group_state = atomic_read(&GRP1:0->state)
322 * spin_unlock(&GRP1:0->lock) <3>
333 * -----------------------------------------------------------
348 * --> timerqueue = evt-GRP0:0
355 * timerqueue = evt-CPU0, timerqueue =
356 * evt-CPU1
370 * --> timerqueue =
376 * --> groupevt.cpu = CPU0 groupevt.cpu =
377 * timerqueue = evt-CPU0, timerqueue =
378 * evt-CPU1
398 * --> timerqueue = evt-GRP0:0
404 * --> groupevt.cpu = CPU1 groupevt.cpu =
405 * --> timerqueue = evt-CPU1 timerqueue =
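Stripped of locking and of the ignore handling, the timerqueue update shown in the diagrams boils down to the following pattern, condensed from the tmigr_update_events() fragments further down (the helper itself is hypothetical):

static void example_requeue_group_event(struct tmigr_group *group,
					struct tmigr_event *evt,
					u64 nextexp, unsigned int cpu)
{
	lockdep_assert_held(&group->lock);

	/* Remove a stale copy of the event; an empty queue has no expiry */
	if (timerqueue_node_queued(&evt->nextevt)) {
		if (!timerqueue_del(&group->events, &evt->nextevt))
			WRITE_ONCE(group->next_expiry, KTIME_MAX);
	}

	evt->nextevt.expires = nextexp;
	evt->cpu = cpu;

	/* timerqueue_add() returns true if @evt became the new first event */
	if (timerqueue_add(&group->events, &evt->nextevt))
		WRITE_ONCE(group->next_expiry, nextexp);
}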
444 return !(tmc->tmgroup && tmc->available);
476 * group is not active - so no migrator is set.
482 s.state = atomic_read(&group->migr_state);
496 s.state = atomic_read(&group->migr_state);
512 s.state = atomic_read(&group->migr_state);
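Reconstructed around the fragments above, the lockless migrator check reads the packed state once and compares the migrator byte; roughly (assuming TMIGR_NONE marks "no migrator"):

static bool example_check_migrator(struct tmigr_group *group, u8 childmask)
{
	union tmigr_state s;

	s.state = atomic_read(&group->migr_state);

	/* True when this child is the migrator or the group has none at all */
	return (s.migrator == childmask) || (s.migrator == TMIGR_NONE);
}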
520 * struct tmigr_walk - data required for walking the hierarchy
530 * in the top level group only. Be aware that a new top level
531 * of the hierarchy could appear between the 'top level
535 * final top level. This is not a problem, as the worst
564 WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
574 group = READ_ONCE(group->parent);
575 data->childmask = child->groupmask;
576 WARN_ON_ONCE(!data->childmask);
583 __walk_groups_from(up, data, NULL, tmc->tmgroup);
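Assembled from the fragments above, the bottom-up walk has roughly this shape (a simplified reconstruction; the real __walk_groups_from() additionally takes the start group and child as arguments):

static void example_walk_groups(bool (*up)(struct tmigr_group *,
					   struct tmigr_group *,
					   struct tmigr_walk *),
				struct tmigr_walk *data,
				struct tmigr_group *group)
{
	struct tmigr_group *child = NULL;

	do {
		WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);

		/* Stop once the handler reports nothing more to propagate */
		if (up(group, child, data))
			break;

		child = group;
		/* Pairs with the release store connecting a new parent group */
		group = READ_ONCE(group->parent);
		data->childmask = child->groupmask;
		WARN_ON_ONCE(!data->childmask);
	} while (group);
}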
588 lockdep_assert_held(&tmc->lock);
594 * Returns the next event of the timerqueue @group->events
604 lockdep_assert_held(&group->lock);
606 WRITE_ONCE(group->next_expiry, KTIME_MAX);
608 while ((node = timerqueue_getnext(&group->events))) {
611 if (!READ_ONCE(evt->ignore)) {
612 WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
620 if (!timerqueue_del(&group->events, node))
637 if (!evt || now < evt->nextevt.expires)
643 timerqueue_del(&group->events, &evt->nextevt);
658 return evt->nextevt.expires;
669 childmask = data->childmask;
675 curstate.state = atomic_read(&group->migr_state);
691 } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
707 WRITE_ONCE(group->groupevt.ignore, true);
716 data.childmask = tmc->groupmask;
720 tmc->cpuevt.ignore = true;
721 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
727 * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy
738 if (WARN_ON_ONCE(!tmc->idle))
741 raw_spin_lock(&tmc->lock);
742 tmc->idle = false;
744 raw_spin_unlock(&tmc->lock);
748 * Returns true if there is nothing to be propagated to the next level
750 * @data->firstexp is set to expiry of first global event of the (top level of
756 * the documentation at the top.
766 bool remote = data->remote;
772 raw_spin_lock(&child->lock);
773 raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING);
775 childstate.state = atomic_read(&child->migr_state);
776 groupstate.state = atomic_read(&group->migr_state);
784 nextexp = child->next_expiry;
785 evt = &child->groupevt;
794 WRITE_ONCE(evt->ignore, ignore);
796 nextexp = data->nextexp;
798 first_childevt = evt = data->evt;
799 ignore = evt->ignore;
806 * expiry" in the documentation at the top).
811 * - When entering this path by tmigr_new_timer(), @evt->ignore
813 * - tmigr_inactive_up() takes care of the propagation by
816 * locking at this level, because the upper walking call to
821 * single level so @group is the top level group, make sure the
825 if (ignore && !remote && group->parent)
828 raw_spin_lock(&group->lock);
831 groupstate.state = atomic_read(&group->migr_state);
838 if (timerqueue_node_queued(&evt->nextevt)) {
839 if ((evt->nextevt.expires == nextexp) && !ignore) {
841 evt->cpu = first_childevt->cpu;
845 if (!timerqueue_del(&group->events, &evt->nextevt))
846 WRITE_ONCE(group->next_expiry, KTIME_MAX);
860 * of the group needs to be propagated to a higher level to
866 evt->nextevt.expires = nextexp;
867 evt->cpu = first_childevt->cpu;
869 if (timerqueue_add(&group->events, &evt->nextevt))
870 WRITE_ONCE(group->next_expiry, nextexp);
874 if (!group->parent && (groupstate.migrator == TMIGR_NONE)) {
879 * handling. The first timer in the top level group which needs to be
880 * handled when the top level group is not active is calculated
887 * The top level group is idle and it has to be ensured that the
893 data->firstexp = tmigr_next_groupevt_expires(group);
900 raw_spin_unlock(&group->lock);
903 raw_spin_unlock(&child->lock);
924 .evt = &tmc->cpuevt };
926 lockdep_assert_held(&tmc->lock);
928 if (tmc->remote)
933 tmc->cpuevt.ignore = false;
951 raw_spin_lock_irq(&tmc->lock);
968 if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
969 now < tmc->cpuevt.nextevt.expires) {
970 raw_spin_unlock_irq(&tmc->lock);
976 tmc->remote = true;
977 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
980 raw_spin_unlock_irq(&tmc->lock);
986 * Lock ordering needs to be preserved - timer_base locks before tmigr
988 * the top). While fetching the next timer interrupt, also tmc->lock
1002 raw_spin_lock(&tmc->lock);
1013 * remote expiry" in the documentation at the top)
1015 if (!tmc->available || !tmc->idle) {
1026 data.evt = &tmc->cpuevt;
1032 * after a remote expiry" in the documentation at the top)
1037 tmc->remote = false;
1038 raw_spin_unlock_irq(&tmc->lock);
1050 jif = data->basej;
1051 now = data->now;
1053 childmask = data->childmask;
1065 raw_spin_lock_irq(&group->lock);
1070 unsigned int remote_cpu = evt->cpu;
1072 raw_spin_unlock_irq(&group->lock);
1082 * (group->next_expiry was updated by tmigr_next_expired_groupevt(),
1085 data->firstexp = group->next_expiry;
1087 raw_spin_unlock_irq(&group->lock);
1093 * tmigr_handle_remote() - Handle global timers of remote idle CPUs
1105 data.childmask = tmc->groupmask;
1113 if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) {
1119 if (READ_ONCE(tmc->wakeup) == KTIME_MAX)
1126 * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to
1127 * KTIME_MAX. Even if tmc->lock is not held during the whole remote
1128 * handling, it is fine for tmc->wakeup to be stale as it is called in
1135 raw_spin_lock_irq(&tmc->lock);
1136 WRITE_ONCE(tmc->wakeup, data.firstexp);
1137 raw_spin_unlock_irq(&tmc->lock);
1146 childmask = data->childmask;
1162 data->firstexp = READ_ONCE(group->next_expiry);
1163 if (data->now >= data->firstexp) {
1164 data->check = true;
1168 raw_spin_lock(&group->lock);
1169 data->firstexp = group->next_expiry;
1170 if (data->now >= group->next_expiry) {
1171 data->check = true;
1172 raw_spin_unlock(&group->lock);
1175 raw_spin_unlock(&group->lock);
1182 * tmigr_requires_handle_remote() - Check the need of remote timer handling
1197 data.childmask = tmc->groupmask;
1205 * Check is done lockless as interrupts are disabled and @tmc->idle is
1208 if (!tmc->idle) {
1215 * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock
1221 if (data.now >= READ_ONCE(tmc->wakeup))
1224 raw_spin_lock(&tmc->lock);
1225 if (data.now >= tmc->wakeup)
1227 raw_spin_unlock(&tmc->lock);
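For context, a hedged sketch of how these two entry points are typically consumed from the tick path: the cheap check decides whether the timer softirq has to be raised, and the softirq handler then performs the remote expiry. The exact call sites live in kernel/time/timer.c and may differ in detail:

static void example_local_tick(void)
{
	/* Raise the softirq only when there is remote work to do */
	if (tmigr_requires_handle_remote())
		raise_softirq(TIMER_SOFTIRQ);
}

static void example_timer_softirq(void)
{
	/* Expire global timers on behalf of idle remote CPUs */
	tmigr_handle_remote();
}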
1234 * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc)
1239 * and thereby the timer idle path is executed once more. @tmc->wakeup
1254 raw_spin_lock(&tmc->lock);
1256 ret = READ_ONCE(tmc->wakeup);
1258 if (nextexp != tmc->cpuevt.nextevt.expires ||
1259 tmc->cpuevt.ignore) {
1265 WRITE_ONCE(tmc->wakeup, ret);
1269 raw_spin_unlock(&tmc->lock);
1281 childmask = data->childmask;
1290 curstate.state = atomic_read_acquire(&group->migr_state);
1294 childstate.state = atomic_read(&child->migr_state);
1328 if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) {
1342 data->remote = false;
1354 .evt = &tmc->cpuevt,
1355 .childmask = tmc->groupmask };
1363 tmc->cpuevt.ignore = false;
1370 * tmigr_cpu_deactivate() - Put current CPU into inactive state
1376 * from the hierarchy if this CPU is the top level migrator or the hierarchy is
1387 raw_spin_lock(&tmc->lock);
1391 tmc->idle = true;
1397 WRITE_ONCE(tmc->wakeup, ret);
1400 raw_spin_unlock(&tmc->lock);
1405 * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to
1410 * * KTIME_MAX - when it is probable that nothing has to be done (not
1411 * the only one in the level 0 group; and if it is the
1412 * only one in level 0 group, but there are more than a
1413 * single group active on the way to top level)
1414 * * nextevt - when CPU is offline and has to handle timer on its own
1415 * or when on the way to top in every group only a single
1417 * next_expiry encountered while walking up to top level.
1418 * * next_expiry - value of lowest expiry encountered while walking groups
1425 struct tmigr_group *group = tmc->tmgroup;
1430 if (WARN_ON_ONCE(tmc->idle))
1433 if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask))
1442 * from bottom to the top because the CPU's event is ignored
1443 * up to the top and its siblings' events are not propagated upwards.
1446 nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
1447 group = group->parent;
1454 * tmigr_trigger_active() - trigger a CPU to become active again
1464 WARN_ON_ONCE(!tmc->available || tmc->idle);
1478 scoped_guard(raw_spinlock_irq, &tmc->lock) {
1479 if (!tmc->available)
1481 tmc->available = false;
1482 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
1505 if (WARN_ON_ONCE(!tmc->tmgroup))
1506 return -EINVAL;
1514 scoped_guard(raw_spinlock_irq, &tmc->lock) {
1515 if (tmc->available)
1518 tmc->idle = timer_base_is_idle();
1519 if (!tmc->idle)
1521 tmc->available = true;
1537 * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
1556 return -ENOMEM;
1558 return -ENOMEM;
1615 return -ENOMEM;
1630 raw_spin_lock_init(&group->lock);
1632 group->level = lvl;
1633 group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE;
1635 group->num_children = 0;
1640 atomic_set(&group->migr_state, s.state);
1642 timerqueue_init_head(&group->events);
1643 timerqueue_init(&group->groupevt.nextevt);
1644 group->groupevt.nextevt.expires = KTIME_MAX;
1645 WRITE_ONCE(group->next_expiry, KTIME_MAX);
1646 group->groupevt.ignore = true;
1658 * If @lvl is below the cross NUMA node level, check whether
1661 if (lvl < tmigr_crossnode_level && tmp->numa_node != node)
1665 if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP)
1670 * siblings end up in the same group of the lowest level of the
1685 return ERR_PTR(-ENOMEM);
1690 list_add(&group->list, &tmigr_level_list[lvl]);
1697 if (!group->parent && group != tmigr_root) {
1699 * This is the new top-level; prepare its groupmask in advance
1700 * to avoid accidents where yet another new top-level is
1703 group->groupmask = BIT(0);
1719 * The previous top level had prepared its groupmask already,
1724 parent->num_children = 1;
1728 if (!parent->parent && activate) {
1730 * @child is the old top, or in case of node mismatch, some
1731 * intermediate group between the old top and the new one in
1732 * @parent. In this case the @child must be pre-accounted above
1736 WARN_ON_ONCE(parent->num_children != 2);
1737 child->groupmask = BIT(0);
1740 child->groupmask = BIT(parent->num_children++);
1748 smp_store_release(&child->parent, parent);
1757 int i, top = 0, err = 0, start_lvl = 0;
1762 return -ENOMEM;
1765 stack[start->level] = start;
1766 start_lvl = start->level + 1;
1770 root_mismatch = tmigr_root->numa_node != node;
1776 i--;
1780 top = i;
1788 * The loop is aborted as soon as the highest level, which might
1792 if (group->parent)
1801 return -EINVAL;
1803 for (; i >= start_lvl; i--) {
1807 list_del(&group->list);
1812 WARN_ON_ONCE(i != group->level);
1815 * Update tmc -> group / child -> group connection
1820 tmc->tmgroup = group;
1821 tmc->groupmask = BIT(group->num_children++);
1830 child = stack[i - 1];
1848 * the lowest level, then they are not active. They will be set active
1851 * * But if new groups above the current top level are required, it is
1859 * the CPU executing the setup will be responsible up to current top
1860 * level group. And the next time it goes inactive, it will release
1864 state.state = atomic_read(&start->migr_state);
1866 WARN_ON_ONCE(!start->parent);
1867 data.childmask = start->groupmask;
1868 __walk_groups_from(tmigr_active_up, &data, start, start->parent);
1872 if (list_is_singular(&tmigr_level_list[top])) {
1873 group = list_first_entry(&tmigr_level_list[top],
1875 WARN_ON_ONCE(group->parent);
1878 WARN_ON_ONCE(tmigr_root->level > top);
1903 * it may spuriously activate the old top level group inside
1904 * the new one (regardless of whether the old top level group is
1912 WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
1913 ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
1925 if (tmc->tmgroup)
1928 raw_spin_lock_init(&tmc->lock);
1929 timerqueue_init(&tmc->cpuevt.nextevt);
1930 tmc->cpuevt.nextevt.expires = KTIME_MAX;
1931 tmc->cpuevt.ignore = true;
1932 tmc->cpuevt.cpu = cpu;
1933 tmc->remote = false;
1934 WRITE_ONCE(tmc->wakeup, KTIME_MAX);
1940 if (tmc->groupmask == 0)
1941 return -EINVAL;
1951 int ret = -ENOMEM;
1960 ret = -ENOMEM;
1987 * If a NUMA node spawns more than one CPU level group then the next
1988 * level(s) of the hierarchy contain groups which handle all CPU groups
1989 * of the same NUMA node. The level above goes across NUMA nodes. Store
1990 * this information for the setup code to decide in which level node
2003 " %d crossnode level\n",
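As a worked example of that decision, read the diagram at the top as two NUMA nodes with 24 CPUs each: two levels are needed to cover one node (8 < 24 <= 64), so the third level is the first one whose groups may span nodes. A hedged arithmetic sketch, not the actual init code:

#include <linux/log2.h>		/* order_base_2(), ilog2() */
#include <linux/math.h>		/* DIV_ROUND_UP() */

static unsigned int __init example_crossnode_level(unsigned int cpus_per_node)
{
	/* Smallest n with TMIGR_CHILDREN_PER_GROUP^n >= cpus_per_node */
	return DIV_ROUND_UP(order_base_2(cpus_per_node),
			    ilog2(TMIGR_CHILDREN_PER_GROUP));
}

/* example_crossnode_level(24) == 2: LVL0/LVL1 stay per node, LVL2 crosses */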