Lines Matching +full:domain +full:- +full:idle +full:- +full:state

1 // SPDX-License-Identifier: GPL-2.0
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
5 * Built-in idle CPU tracking policy.
14 /* Enable/disable built-in idle CPU selection policy */
17 /* Enable/disable per-node idle cpumasks */
27 * cpumasks to track idle CPUs within each NUMA node.
30 * is used to track all the idle CPUs in the system.
38 * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
44 * Per-node idle cpumasks.
49 * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
56 * Return the idle masks associated with a target @node.
58 * NUMA_NO_NODE identifies the global idle cpumask.
67 * per-node idle cpumasks are disabled.
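A minimal sketch of the container these accessors imply, consistent with the idle_cpumask(node)->cpu / ->smt uses below and the scx_idle_node_masks allocations further down (the host-wide variable name and exact layout are assumptions):

    struct scx_idle_cpus {
            cpumask_var_t cpu;      /* idle logical CPUs in the node */
            cpumask_var_t smt;      /* CPUs whose whole SMT core is idle */
    };

    /* Assumed names: one host-wide instance plus a per-node pointer array. */
    static struct scx_idle_cpus scx_idle_global_masks;
    static struct scx_idle_cpus **scx_idle_node_masks;

    static struct scx_idle_cpus *idle_cpumask(int node)
    {
            return node == NUMA_NO_NODE ? &scx_idle_global_masks :
                                          scx_idle_node_masks[node];
    }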
80 struct cpumask *idle_cpus = idle_cpumask(node)->cpu; in scx_idle_test_and_clear_cpu()
85 * cluster is not wholly idle either way. This also prevents in scx_idle_test_and_clear_cpu()
90 struct cpumask *idle_smts = idle_cpumask(node)->smt; in scx_idle_test_and_clear_cpu()
95 * @cpu is never cleared from the idle SMT mask. Ensure that in scx_idle_test_and_clear_cpu()
113 * Pick an idle CPU in a specific NUMA node.
121 cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); in pick_idle_cpu_in_node()
126 return -EBUSY; in pick_idle_cpu_in_node()
129 cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); in pick_idle_cpu_in_node()
131 return -EBUSY; in pick_idle_cpu_in_node()
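Pieced together from the calls and return values above, the per-node pick plausibly has the following shape (a reconstruction, not the file verbatim): try a fully idle SMT core first, fall back to any idle CPU unless SCX_PICK_IDLE_CORE forbids it, and retry if the claim races with another CPU.

    static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed,
                                     int node, u64 flags)
    {
            s32 cpu;

    retry:
            if (sched_smt_active()) {
                    /* Prefer CPUs whose whole SMT core is idle. */
                    cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt,
                                                     cpus_allowed);
                    if (cpu < nr_cpu_ids)
                            goto found;

                    if (flags & SCX_PICK_IDLE_CORE)
                            return -EBUSY;
            }

            cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
            if (cpu >= nr_cpu_ids)
                    return -EBUSY;

    found:
            /* Claim @cpu; retry if another CPU won the race for it. */
            if (scx_idle_test_and_clear_cpu(cpu))
                    return cpu;
            else
                    goto retry;
    }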
142 * Tracks nodes that have not yet been visited when searching for an idle
148 * Search for an idle CPU across all nodes, excluding @node.
153 s32 cpu = -EBUSY; in pick_idle_cpu_from_online_nodes()
172 * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU in pick_idle_cpu_from_online_nodes()
177 * in a per-node array, instead of actually traversing them every in pick_idle_cpu_from_online_nodes()
193 return -EBUSY; in pick_idle_cpu_from_online_nodes()
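The comments above describe visiting the remaining online nodes in order of increasing distance from @node. A simplified illustration of that traversal (not the kernel's actual iterator, which the comment notes could precompute the order in a per-node array instead of rescanning):

    static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed,
                                               int node, u64 flags)
    {
            nodemask_t unvisited = node_states[N_ONLINE];
            s32 cpu = -EBUSY;

            node_clear(node, unvisited);    /* @node was already searched */

            while (!nodes_empty(unvisited)) {
                    int n, closest = NUMA_NO_NODE;

                    /* Visit the not-yet-searched node closest to @node next. */
                    for_each_node_mask(n, unvisited)
                            if (closest == NUMA_NO_NODE ||
                                node_distance(node, n) < node_distance(node, closest))
                                    closest = n;
                    node_clear(closest, unvisited);

                    cpu = pick_idle_cpu_in_node(cpus_allowed, closest, flags);
                    if (cpu >= 0)
                            break;
            }

            return cpu;
    }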
198 * Find an idle CPU in the system, starting from @node.
219 return -EBUSY; in scx_pick_idle_cpu()
228 * Return the number of CPUs in the same LLC domain as @cpu (or zero if the LLC
229 * domain is not defined).
239 return sd->span_weight; in llc_weight()
243 * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
244 * domain is not defined).
258 * Return the number of CPUs in the same NUMA domain as @cpu (or zero if the
259 * NUMA domain is not defined).
269 sg = sd->groups; in numa_weight()
273 return sg->group_weight; in numa_weight()
277 * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
278 * domain is not defined).
288 sg = sd->groups; in numa_span()
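Judging by the return statements shown above, the LLC helpers read the per-CPU sd_llc sched_domain pointer, while the NUMA variants go through sd->groups; a sketch of the LLC pair under that assumption (callers must hold rcu_read_lock()):

    static unsigned int llc_weight(s32 cpu)
    {
            struct sched_domain *sd;

            sd = rcu_dereference(per_cpu(sd_llc, cpu));
            if (!sd)
                    return 0;

            return sd->span_weight;
    }

    static const struct cpumask *llc_span(s32 cpu)
    {
            struct sched_domain *sd;

            sd = rcu_dereference(per_cpu(sd_llc, cpu));
            if (!sd)
                    return NULL;

            return sched_domain_span(sd);
    }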
314 * - LLC 0: cpu0..cpu7 in llc_numa_mismatch()
315 * - LLC 1: cpu8..cpu15 [offline] in llc_numa_mismatch()
318 * - LLC 0: cpu16..cpu23 in llc_numa_mismatch()
319 * - LLC 1: cpu24..cpu31 in llc_numa_mismatch()
334 * Initialize topology-aware scheduling.
337 * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
341 * CPU belongs to a single LLC domain, and that each LLC domain is entirely
351 * Enable LLC domain optimization only when there are multiple LLC in scx_idle_update_selcpu_topology()
353 * single LLC domain, the idle CPU selection logic can choose any in scx_idle_update_selcpu_topology()
356 * Note that it is sufficient to check the LLC domain of the first in scx_idle_update_selcpu_topology()
357 * online CPU to determine whether a single LLC domain includes all in scx_idle_update_selcpu_topology()
374 * If all CPUs belong to the same NUMA node and the same LLC domain, in scx_idle_update_selcpu_topology()
376 * for an idle CPU in the same domain twice is redundant. in scx_idle_update_selcpu_topology()
379 * optimization, as we would naturally select idle CPUs within in scx_idle_update_selcpu_topology()
380 * specific NUMA nodes by querying the corresponding per-node cpumask. in scx_idle_update_selcpu_topology()
382 if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { in scx_idle_update_selcpu_topology()
393 pr_debug("sched_ext: LLC idle selection %s\n", in scx_idle_update_selcpu_topology()
395 pr_debug("sched_ext: NUMA idle selection %s\n", in scx_idle_update_selcpu_topology()
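The pr_debug lines suggest the function ends by flipping one static key for the LLC optimization and one for the NUMA optimization; a plausible tail, where the key names and the _cpuslocked variants are assumptions:

    if (enable_llc)
            static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
    else
            static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);

    if (enable_numa)
            static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
    else
            static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);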
413 return p->nr_cpus_allowed >= num_possible_cpus(); in task_affinity_all()
417 * Built-in CPU idle selection policy:
419 * 1. Prioritize full-idle cores:
420 * - always prioritize CPUs from fully idle cores (both logical CPUs are
421 * idle) to avoid interference caused by SMT.
424 * - prefer the last used CPU to take advantage of cached data (L1, L2) and
427 * 3. Pick a CPU within the same LLC (Last-Level Cache):
428 * - if the above conditions aren't met, pick a CPU that shares the same
429 * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
433 * - choose a CPU from the same NUMA node, if the node cpumask is a
436 * 5. Pick any idle CPU within the @cpus_allowed domain.
446 * Return the picked CPU if idle, or a negative value otherwise.
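From a BPF scheduler's side, the "fully idle cores first" ordering above can be roughly approximated with the idle-CPU kfuncs documented later in this file; a minimal sketch, assuming the scx tooling headers and the default (non per-node) idle tracking:

    #include <scx/common.bpf.h>

    /* Rough approximation of the ordering above (LLC/NUMA steps omitted). */
    static s32 pick_cpu_smt_first(struct task_struct *p, s32 prev_cpu)
    {
            s32 cpu;

            /* Steps 1-2: keep @prev_cpu if it is still idle and allowed. */
            if (bpf_cpumask_test_cpu(prev_cpu, p->cpus_ptr) &&
                scx_bpf_test_and_clear_cpu_idle(prev_cpu))
                    return prev_cpu;

            /* Prefer a CPU from a fully idle SMT core... */
            cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
            if (cpu >= 0)
                    return cpu;

            /* ...otherwise settle for any idle CPU in the allowed mask. */
            return scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
    }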
455 const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; in scx_select_cpu_dfl()
471 if (allowed != p->cpus_ptr) { in scx_select_cpu_dfl()
476 } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { in scx_select_cpu_dfl()
479 cpu = -EBUSY; in scx_select_cpu_dfl()
500 if (allowed == p->cpus_ptr && task_affinity_all(p)) in scx_select_cpu_dfl()
510 if (allowed == p->cpus_ptr && task_affinity_all(p)) in scx_select_cpu_dfl()
523 * If the waker's CPU is cache affine and prev_cpu is idle, in scx_select_cpu_dfl()
541 * Checking only for the presence of idle CPUs is also in scx_select_cpu_dfl()
543 * piled up on it even if there is an idle core elsewhere on in scx_select_cpu_dfl()
547 if (!(current->flags & PF_EXITING) && in scx_select_cpu_dfl()
548 cpu_rq(cpu)->scx.local_dsq.nr == 0 && in scx_select_cpu_dfl()
550 !cpumask_empty(idle_cpumask(waker_node)->cpu)) { in scx_select_cpu_dfl()
557 * If CPU has SMT, any wholly idle CPU is likely a better pick than in scx_select_cpu_dfl()
558 * partially idle @prev_cpu. in scx_select_cpu_dfl()
562 * Keep using @prev_cpu if it's part of a fully idle core. in scx_select_cpu_dfl()
565 cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && in scx_select_cpu_dfl()
572 * Search for any fully idle core in the same LLC domain. in scx_select_cpu_dfl()
581 * Search for any fully idle core in the same NUMA node. in scx_select_cpu_dfl()
590 * Search for any full-idle core usable by the task. in scx_select_cpu_dfl()
592 * If the node-aware idle CPU selection policy is enabled in scx_select_cpu_dfl()
602 * Give up if we're strictly looking for a full-idle SMT in scx_select_cpu_dfl()
606 cpu = -EBUSY; in scx_select_cpu_dfl()
612 * Use @prev_cpu if it's idle. in scx_select_cpu_dfl()
620 * Search for any idle CPU in the same LLC domain. in scx_select_cpu_dfl()
629 * Search for any idle CPU in the same NUMA node. in scx_select_cpu_dfl()
638 * Search for any idle CPU usable by the task. in scx_select_cpu_dfl()
640 * If the node-aware idle CPU selection policy is enabled in scx_select_cpu_dfl()
656 * Initialize global and per-node idle cpumasks.
662 /* Allocate global idle cpumasks */ in scx_idle_init_masks()
666 /* Allocate per-node idle cpumasks */ in scx_idle_init_masks()
676 BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); in scx_idle_init_masks()
677 BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); in scx_idle_init_masks()
680 /* Allocate local per-cpu idle cpumasks */ in scx_idle_init_masks()
691 static void update_builtin_idle(int cpu, bool idle) in update_builtin_idle() argument
694 struct cpumask *idle_cpus = idle_cpumask(node)->cpu; in update_builtin_idle()
696 assign_cpu(cpu, idle_cpus, idle); in update_builtin_idle()
701 struct cpumask *idle_smts = idle_cpumask(node)->smt; in update_builtin_idle()
703 if (idle) { in update_builtin_idle()
706 * only for optimization and self-correcting. in update_builtin_idle()
719 * Update the idle state of a CPU to @idle.
722 * scheduler of an actual idle state transition (idle to busy or vice
723 * versa). If @do_notify is false, only the idle state in the idle masks is
726 * This distinction is necessary, because an idle CPU can be "reserved" and
729 * to idle without a true state transition. Refreshing the idle masks
730 * without invoking ops.update_idle() ensures accurate idle state tracking
731 * while avoiding unnecessary updates and maintaining balanced state
734 void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) in __scx_update_idle() argument
742 * Update the idle masks: in __scx_update_idle()
743 * - for real idle transitions (do_notify == true) in __scx_update_idle()
744 * - for idle-to-idle transitions (indicated by the previous task in __scx_update_idle()
745 * being the idle thread, managed by pick_task_idle()) in __scx_update_idle()
747 * Skip updating idle masks if the previous task is not the idle in __scx_update_idle()
749 * transitioning from a task to the idle thread (calling this in __scx_update_idle()
752 * In this way we can avoid updating the idle masks twice, in __scx_update_idle()
756 if (do_notify || is_idle_task(rq->curr)) in __scx_update_idle()
757 update_builtin_idle(cpu, idle); in __scx_update_idle()
761 * the idle thread and vice versa. in __scx_update_idle()
763 * Idle transitions are indicated by do_notify being set to true, in __scx_update_idle()
766 * This must come after builtin idle update so that BPF schedulers can in __scx_update_idle()
767 * create interlocking between ops.update_idle() and ops.enqueue() - in __scx_update_idle()
768 * either enqueue() sees the idle bit or update_idle() sees the task in __scx_update_idle()
772 SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); in __scx_update_idle()
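As a consumer of these notifications, a BPF scheduler that keeps the built-in tracking alive (SCX_OPS_KEEP_BUILTIN_IDLE) can mirror the idle state into its own map from ops.update_idle(); a minimal sketch, with the map name and layout being this example's own:

    #include <scx/common.bpf.h>

    /* One byte per CPU, mirrored from the in-kernel idle tracking. */
    struct {
            __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
            __uint(max_entries, 1);
            __type(key, u32);
            __type(value, u8);
    } cpu_is_idle SEC(".maps");

    void BPF_STRUCT_OPS(mysched_update_idle, s32 cpu, bool idle)
    {
            u32 key = 0;
            u8 *flag;

            /* Use the indexed lookup: the callback may not run on @cpu. */
            flag = bpf_map_lookup_percpu_elem(&cpu_is_idle, &key, cpu);
            if (flag)
                    *flag = idle;
    }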
780 * Consider all online cpus idle. Should converge to the actual state in reset_idle_masks()
783 if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { in reset_idle_masks()
784 cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); in reset_idle_masks()
785 cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); in reset_idle_masks()
792 cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); in reset_idle_masks()
793 cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); in reset_idle_masks()
799 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) in scx_idle_enable()
804 if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) in scx_idle_enable()
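Both flags checked above are set by the BPF scheduler in its struct sched_ext_ops; a hypothetical declaration that opts into per-node idle cpumasks while keeping the built-in tracking next to a custom ops.update_idle() (the referenced callbacks are assumed to be defined elsewhere in the same BPF object):

    #include <scx/common.bpf.h>

    SEC(".struct_ops.link")
    struct sched_ext_ops mysched_ops = {
            .select_cpu     = (void *)mysched_select_cpu,
            .enqueue        = (void *)mysched_enqueue,
            .update_idle    = (void *)mysched_update_idle,
            /*
             * Keep the built-in idle tracking alive alongside the custom
             * ops.update_idle() and split it into per-NUMA-node cpumasks.
             */
            .flags          = SCX_OPS_KEEP_BUILTIN_IDLE |
                              SCX_OPS_BUILTIN_IDLE_PER_NODE,
            .name           = "mysched",
    };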
825 scx_error(sch, "per-node idle tracking is disabled"); in validate_node()
826 return -EOPNOTSUPP; in validate_node()
831 return -ENOENT; in validate_node()
836 return -EINVAL; in validate_node()
842 return -EINVAL; in validate_node()
855 scx_error(sch, "built-in idle tracking is disabled"); in check_builtin_idle_enabled()
860 * Determine whether @p is a migration-disabled task in the context of BPF
863 * We can't simply check whether @p->migration_disabled is set in a
868 * disable and re-enable migration. For this reason, the current task
869 * inside a sched_ext callback is always a migration-disabled task.
871 * Therefore, when @p->migration_disabled == 1, check whether @p is the
875 * Returns true if @p is migration-disabled, false otherwise.
879 if (p->migration_disabled == 1) in is_bpf_migration_disabled()
882 return p->migration_disabled; in is_bpf_migration_disabled()
894 return -EINVAL; in select_cpu_from_kfunc()
897 return -EBUSY; in select_cpu_from_kfunc()
901 * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. in select_cpu_from_kfunc()
910 return -EPERM; in select_cpu_from_kfunc()
915 * Validate locking correctness to access p->cpus_ptr and in select_cpu_from_kfunc()
916 * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; in select_cpu_from_kfunc()
917 * otherwise, assert that p->pi_lock is held. in select_cpu_from_kfunc()
920 lockdep_assert_held(&p->pi_lock); in select_cpu_from_kfunc()
924 * per-CPU tasks as well. For these tasks, we can skip all idle CPU in select_cpu_from_kfunc()
926 * used CPU is idle and within the allowed cpumask. in select_cpu_from_kfunc()
928 if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { in select_cpu_from_kfunc()
929 if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && in select_cpu_from_kfunc()
933 cpu = -EBUSY; in select_cpu_from_kfunc()
936 allowed ?: p->cpus_ptr, flags); in select_cpu_from_kfunc()
946 * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
963 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
967 * @is_idle: out parameter indicating whether the returned CPU is idle
970 * context such as a BPF test_run() call, as long as built-in CPU selection
975 * currently idle and thus a good candidate for direct dispatching.
987 return -ENODEV; in scx_bpf_select_cpu_dfl()
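The usual pattern, and the reason for the @is_idle out parameter, is to dispatch straight to the local DSQ from ops.select_cpu() when the picked CPU is idle; a minimal sketch assuming the scx tooling headers (scx_bpf_dsq_insert() is the current name of the dispatch kfunc; older trees call it scx_bpf_dispatch()):

    #include <scx/common.bpf.h>

    s32 BPF_STRUCT_OPS(mysched_select_cpu, struct task_struct *p,
                       s32 prev_cpu, u64 wake_flags)
    {
            bool is_idle = false;
            s32 cpu;

            cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
            if (is_idle) {
                    /* The picked CPU is idle: run @p there directly. */
                    scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
            }

            return cpu;
    }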
999 * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
1008 * context such as a BPF test_run() call, as long as built-in CPU selection
1014 * Returns the selected idle CPU, which will be automatically awakened upon
1016 * a negative value if no idle CPU is available.
1027 return -ENODEV; in scx_bpf_select_cpu_and()
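Compared to scx_bpf_select_cpu_dfl(), this variant takes an explicit allowed cpumask and idle-pick flags; a small sketch that only accepts CPUs from fully idle SMT cores (the (p, prev_cpu, wake_flags, cpus_allowed, flags) parameter order is assumed, and any narrower cpumask could replace p->cpus_ptr):

    #include <scx/common.bpf.h>

    s32 BPF_STRUCT_OPS(mysmt_select_cpu, struct task_struct *p,
                       s32 prev_cpu, u64 wake_flags)
    {
            s32 cpu;

            /* Only accept a CPU whose whole SMT core is idle. */
            cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags,
                                         p->cpus_ptr, SCX_PICK_IDLE_CORE);
            if (cpu >= 0) {
                    scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
                    return cpu;
            }

            return prev_cpu;
    }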
1034 * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
1035 * idle-tracking per-CPU cpumask of a target NUMA node.
1038 * Returns an empty cpumask if idle tracking is not enabled, if @node is
1056 return idle_cpumask(node)->cpu; in scx_bpf_get_idle_cpumask_node()
1060 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
1061 * per-CPU cpumask.
1063 * Returns an empty mask if idle tracking is not enabled, or running on a
1084 return idle_cpumask(NUMA_NO_NODE)->cpu; in scx_bpf_get_idle_cpumask()
1088 * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
1089 * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
1093 * Returns an empty cpumask if idle tracking is not enabled, if @node is
1112 return idle_cpumask(node)->smt; in scx_bpf_get_idle_smtmask_node()
1114 return idle_cpumask(node)->cpu; in scx_bpf_get_idle_smtmask_node()
1118 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
1119 * per-physical-core cpumask. Can be used to determine if an entire physical
1122 * Returns an empty mask if idle tracking is not enabled, or running on a
1144 return idle_cpumask(NUMA_NO_NODE)->smt; in scx_bpf_get_idle_smtmask()
1146 return idle_cpumask(NUMA_NO_NODE)->cpu; in scx_bpf_get_idle_smtmask()
1150 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
1151 * either the per-CPU or the SMT idle-tracking cpumask.
1158 * a reference to a global idle cpumask, which is read-only in the in scx_bpf_put_idle_cpumask()
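The getters above pair with this put; the mask must be treated as read-only and released promptly. For example, counting the currently idle CPUs (a sketch assuming the scx headers):

    #include <scx/common.bpf.h>

    static u32 nr_idle_cpus(void)
    {
            const struct cpumask *idle;
            u32 nr;

            idle = scx_bpf_get_idle_cpumask();
            nr = bpf_cpumask_weight(idle);
            scx_bpf_put_idle_cpumask(idle);

            return nr;
    }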
1165 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
1166 * @cpu: cpu to test and clear idle for
1168 * Returns %true if @cpu was idle and its idle state was successfully cleared.
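A common use is opportunistic direct dispatch from ops.enqueue(): if the task's previous CPU is still marked idle, claim it and queue the task onto that CPU's local DSQ, then kick it out of the idle loop. A sketch assuming the scx headers:

    #include <scx/common.bpf.h>

    void BPF_STRUCT_OPS(mysched_enqueue, struct task_struct *p, u64 enq_flags)
    {
            s32 prev_cpu = scx_bpf_task_cpu(p);

            if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
                    /* @prev_cpu was idle and is now claimed for @p. */
                    scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prev_cpu,
                                       SCX_SLICE_DFL, enq_flags);
                    scx_bpf_kick_cpu(prev_cpu, SCX_KICK_IDLE);
                    return;
            }

            scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
    }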
1194 * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
1199 * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
1201 * Returns the picked idle cpu number on success, or -%EBUSY if no matching
1221 return -ENODEV; in scx_bpf_pick_idle_cpu_node()
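With per-node tracking enabled, a scheduler can keep wakeups NUMA-local by trying the previous CPU's node first; a sketch assuming the scx headers (SCX_PICK_IDLE_IN_NODE confines the first attempt to @node, while the second call may spill over to other nodes):

    #include <scx/common.bpf.h>

    static s32 pick_node_local_cpu(struct task_struct *p, s32 prev_cpu)
    {
            int node = scx_bpf_cpu_node(prev_cpu);
            s32 cpu;

            /* Try @prev_cpu's NUMA node only... */
            cpu = scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
                                             SCX_PICK_IDLE_IN_NODE);
            if (cpu >= 0)
                    return cpu;

            /* ...then allow the search to spill over to the other nodes. */
            return scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, 0);
    }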
1231 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
1235 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
1236 * number on success. -%EBUSY if no matching cpu was found.
1238 * Idle CPU tracking may race against CPU scheduling state transitions. For
1239 * example, this function may return -%EBUSY as CPUs are transitioning into the
1240 * idle state. If the caller then assumes that there will be dispatch events on
1261 return -ENODEV; in scx_bpf_pick_idle_cpu()
1264 scx_error(sch, "per-node idle tracking is enabled"); in scx_bpf_pick_idle_cpu()
1265 return -EBUSY; in scx_bpf_pick_idle_cpu()
1269 return -EBUSY; in scx_bpf_pick_idle_cpu()
1275 * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
1281 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1282 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
1283 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1289 * the CPU idle state).
1292 * set, this function can't tell which CPUs are idle and will always pick any
1305 return -ENODEV; in scx_bpf_pick_any_cpu_node()
1322 return -EBUSY; in scx_bpf_pick_any_cpu_node()
1326 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
1330 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1331 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
1332 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1336 * set, this function can't tell which CPUs are idle and will always pick any
1352 return -ENODEV; in scx_bpf_pick_any_cpu()
1355 scx_error(sch, "per-node idle tracking is enabled"); in scx_bpf_pick_any_cpu()
1356 return -EBUSY; in scx_bpf_pick_any_cpu()
1369 return -EBUSY; in scx_bpf_pick_any_cpu()
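This variant suits schedulers that do their own idle tracking (ops.update_idle() without SCX_OPS_KEEP_BUILTIN_IDLE), where it degrades into "pick any allowed CPU". A sketch of the usual enqueue-then-kick pattern, assuming the scx headers:

    #include <scx/common.bpf.h>

    void BPF_STRUCT_OPS(myfallback_enqueue, struct task_struct *p, u64 enq_flags)
    {
            s32 cpu;

            scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);

            /* An idle CPU if one is tracked as such, otherwise any allowed CPU. */
            cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
            if (cpu >= 0)
                    /* Wake it only if it is sitting idle; a busy CPU will pick
                     * the task up at its next scheduling event. */
                    scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
    }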