Lines Matching +full:no +full:- +full:idle

1 // SPDX-License-Identifier: GPL-2.0
3 * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
5 * Built-in idle CPU tracking policy.
14 /* Enable/disable built-in idle CPU selection policy */
17 /* Enable/disable per-node idle cpumasks */
28 * cpumasks to track idle CPUs within each NUMA node.
31   *   is used to track all the idle CPUs in the system.
39 * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
45 * Per-node idle cpumasks.
50 * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
57   * Return the idle masks associated with a target @node.
59 * NUMA_NO_NODE identifies the global idle cpumask.
68 * per-node idle cpumasks are disabled.
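
Editor's note: the accessor described above simply dispatches between the host-wide masks and the per-node array. A minimal sketch of its shape, assuming the masks are kept as a global pair plus the scx_idle_node_masks[] array seen later in this listing (the struct and global-mask names here are assumptions, not verbatim kernel identifiers):

    static struct scx_idle_cpus *idle_cpumask(int node)
    {
            /*
             * NUMA_NO_NODE selects the host-wide masks, used when per-node
             * idle cpumasks (SCX_OPS_BUILTIN_IDLE_PER_NODE) are disabled.
             */
            if (node == NUMA_NO_NODE)
                    return &scx_idle_global_masks;

            return scx_idle_node_masks[node];
    }
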
81 struct cpumask *idle_cpus = idle_cpumask(node)->cpu; in scx_idle_test_and_clear_cpu()
86 * cluster is not wholly idle either way. This also prevents in scx_idle_test_and_clear_cpu()
91 struct cpumask *idle_smts = idle_cpumask(node)->smt; in scx_idle_test_and_clear_cpu()
96 * @cpu is never cleared from the idle SMT mask. Ensure that in scx_idle_test_and_clear_cpu()
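
Editor's note: pulling the matched fragments of scx_idle_test_and_clear_cpu() together, the SMT handling drops the whole core from the idle-SMT mask as soon as any sibling is claimed, and makes sure an offline @cpu (which is not in its own cpu_smt_mask()) cannot linger there forever. A condensed sketch of that logic, not a verbatim copy of the function; idle_cpus and idle_smts are the per-node masks declared in the lines shown above:

    if (sched_smt_active()) {
            const struct cpumask *smt = cpu_smt_mask(cpu);
            struct cpumask *idle_smts = idle_cpumask(node)->smt;

            /* Claiming one sibling means the core is no longer fully idle. */
            if (cpumask_intersects(smt, idle_smts))
                    cpumask_andnot(idle_smts, idle_smts, smt);
            /*
             * An offline @cpu is absent from cpu_smt_mask(), so clear its bit
             * explicitly to keep the pickers from spinning on a stale entry.
             */
            else if (cpumask_test_cpu(cpu, idle_smts))
                    __cpumask_clear_cpu(cpu, idle_smts);
    }

    return cpumask_test_and_clear_cpu(cpu, idle_cpus);
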
114 * Pick an idle CPU in a specific NUMA node.
122 cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); in pick_idle_cpu_in_node()
127 return -EBUSY; in pick_idle_cpu_in_node()
130 cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); in pick_idle_cpu_in_node()
132 return -EBUSY; in pick_idle_cpu_in_node()
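
Editor's note: as the two cpumask_any_and_distribute() calls above suggest, the per-node picker first looks for a CPU whose whole core is idle and only then falls back to any idle sibling, retrying if the claim races with another CPU. A condensed sketch; the exact signature of the claim helper scx_idle_test_and_clear_cpu() is assumed here:

    static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
    {
            int cpu;

    retry:
            if (sched_smt_active()) {
                    /* Prefer a CPU belonging to a fully idle core. */
                    cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
                    if (cpu < nr_cpu_ids)
                            goto found;

                    if (flags & SCX_PICK_IDLE_CORE)
                            return -EBUSY;
            }

            /* Fall back to any idle sibling in the node. */
            cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
            if (cpu >= nr_cpu_ids)
                    return -EBUSY;

    found:
            /* Claim the CPU; if somebody else won the race, try again. */
            if (scx_idle_test_and_clear_cpu(cpu))
                    return cpu;
            goto retry;
    }
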
143 * Tracks nodes that have not yet been visited when searching for an idle
149 * Search for an idle CPU across all nodes, excluding @node.
154 s32 cpu = -EBUSY; in pick_idle_cpu_from_online_nodes()
173 * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU in pick_idle_cpu_from_online_nodes()
178 * in a per-node array, instead of actually traversing them every in pick_idle_cpu_from_online_nodes()
194 return -EBUSY; in pick_idle_cpu_from_online_nodes()
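
Editor's note: the comment above says the cross-node search walks the remaining online nodes, ideally in order of increasing NUMA distance from @node, and that precomputing that order per node would avoid re-traversing it each time. Purely as an illustration of that traversal, not the kernel's actual implementation, a naive distance-ordered walk could look like:

    /* Illustrative only: visit the other online nodes closest-first. */
    static s32 pick_idle_cpu_from_other_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
    {
            nodemask_t unvisited = node_states[N_ONLINE];
            s32 cpu = -EBUSY;

            node_clear(node, unvisited);

            while (!nodes_empty(unvisited)) {
                    int n, best = NUMA_NO_NODE;

                    /* Pick the unvisited node closest to @node. */
                    for_each_node_mask(n, unvisited)
                            if (best == NUMA_NO_NODE ||
                                node_distance(node, n) < node_distance(node, best))
                                    best = n;

                    node_clear(best, unvisited);

                    cpu = pick_idle_cpu_in_node(cpus_allowed, best, flags);
                    if (cpu >= 0)
                            break;
            }

            return cpu;
    }
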
199 * Find an idle CPU in the system, starting from @node.
220 return -EBUSY; in scx_pick_idle_cpu()
240 return sd->span_weight; in llc_weight()
270 sg = sd->groups; in numa_weight()
274 return sg->group_weight; in numa_weight()
289 sg = sd->groups; in numa_span()
315 * - LLC 0: cpu0..cpu7 in llc_numa_mismatch()
316 * - LLC 1: cpu8..cpu15 [offline] in llc_numa_mismatch()
319 * - LLC 0: cpu16..cpu23 in llc_numa_mismatch()
320 * - LLC 1: cpu24..cpu31 in llc_numa_mismatch()
335 * Initialize topology-aware scheduling.
338 * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
354 * single LLC domain, the idle CPU selection logic can choose any in scx_idle_update_selcpu_topology()
377 * for an idle CPU in the same domain twice is redundant. in scx_idle_update_selcpu_topology()
380 * optimization, as we would naturally select idle CPUs within in scx_idle_update_selcpu_topology()
381  	 * specific NUMA nodes by querying the corresponding per-node cpumask. in scx_idle_update_selcpu_topology()
383 if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { in scx_idle_update_selcpu_topology()
394 pr_debug("sched_ext: LLC idle selection %s\n", in scx_idle_update_selcpu_topology()
396 pr_debug("sched_ext: NUMA idle selection %s\n", in scx_idle_update_selcpu_topology()
414 return p->nr_cpus_allowed >= num_possible_cpus(); in task_affinity_all()
418 * Built-in CPU idle selection policy:
420 * 1. Prioritize full-idle cores:
421 * - always prioritize CPUs from fully idle cores (both logical CPUs are
422 * idle) to avoid interference caused by SMT.
425 * - prefer the last used CPU to take advantage of cached data (L1, L2) and
428 * 3. Pick a CPU within the same LLC (Last-Level Cache):
429 * - if the above conditions aren't met, pick a CPU that shares the same
434 * - choose a CPU from the same NUMA node, if the node cpumask is a
437 * 5. Pick any idle CPU within the @cpus_allowed domain.
447 * Return the picked CPU if idle, or a negative value otherwise.
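
Editor's note: condensed into code, the five-step policy above is a cascade of progressively wider idle-CPU searches, each step returning as soon as a CPU can be claimed. A simplified sketch, ignoring the WAKE_SYNC waker heuristic and the allowed-cpumask plumbing visible in the fragments that follow; helper signatures (scx_idle_test_and_clear_cpu(), pick_idle_cpu_in_node(), scx_pick_idle_cpu()) and the llc_cpus/numa_cpus parameters are assumptions based on the matched lines:

    static s32 select_cpu_dfl_sketch(struct task_struct *p, s32 prev_cpu, int node,
                                     const struct cpumask *llc_cpus,
                                     const struct cpumask *numa_cpus,
                                     const struct cpumask *allowed, u64 flags)
    {
            s32 cpu;

            if (sched_smt_active()) {
                    /* 1. Full-idle cores: prev_cpu first, then LLC, NUMA, anywhere. */
                    if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
                        scx_idle_test_and_clear_cpu(prev_cpu))
                            return prev_cpu;

                    if (llc_cpus) {
                            cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
                            if (cpu >= 0)
                                    return cpu;
                    }
                    if (numa_cpus) {
                            cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
                            if (cpu >= 0)
                                    return cpu;
                    }
                    cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
                    if (cpu >= 0)
                            return cpu;
                    /* Caller strictly wanted a full-idle core: give up. */
                    if (flags & SCX_PICK_IDLE_CORE)
                            return -EBUSY;
            }

            /* 2. prev_cpu if idle; 3. same LLC; 4. same NUMA node; 5. anywhere. */
            if (scx_idle_test_and_clear_cpu(prev_cpu))
                    return prev_cpu;
            if (llc_cpus) {
                    cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
                    if (cpu >= 0)
                            return cpu;
            }
            if (numa_cpus) {
                    cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
                    if (cpu >= 0)
                            return cpu;
            }
            return scx_pick_idle_cpu(allowed, node, flags);
    }
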
456 const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; in scx_select_cpu_dfl()
472 if (allowed != p->cpus_ptr) { in scx_select_cpu_dfl()
477 } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { in scx_select_cpu_dfl()
480 cpu = -EBUSY; in scx_select_cpu_dfl()
501 if (allowed == p->cpus_ptr && task_affinity_all(p)) in scx_select_cpu_dfl()
511 if (allowed == p->cpus_ptr && task_affinity_all(p)) in scx_select_cpu_dfl()
524 * If the waker's CPU is cache affine and prev_cpu is idle, in scx_select_cpu_dfl()
542 * Checking only for the presence of idle CPUs is also in scx_select_cpu_dfl()
544 * piled up on it even if there is an idle core elsewhere on in scx_select_cpu_dfl()
548 if (!(current->flags & PF_EXITING) && in scx_select_cpu_dfl()
549 cpu_rq(cpu)->scx.local_dsq.nr == 0 && in scx_select_cpu_dfl()
551 !cpumask_empty(idle_cpumask(waker_node)->cpu)) { in scx_select_cpu_dfl()
558 * If CPU has SMT, any wholly idle CPU is likely a better pick than in scx_select_cpu_dfl()
559 * partially idle @prev_cpu. in scx_select_cpu_dfl()
563 * Keep using @prev_cpu if it's part of a fully idle core. in scx_select_cpu_dfl()
566 cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && in scx_select_cpu_dfl()
573 * Search for any fully idle core in the same LLC domain. in scx_select_cpu_dfl()
582 * Search for any fully idle core in the same NUMA node. in scx_select_cpu_dfl()
591 * Search for any full-idle core usable by the task. in scx_select_cpu_dfl()
593 * If the node-aware idle CPU selection policy is enabled in scx_select_cpu_dfl()
603 * Give up if we're strictly looking for a full-idle SMT in scx_select_cpu_dfl()
607 cpu = -EBUSY; in scx_select_cpu_dfl()
613 * Use @prev_cpu if it's idle. in scx_select_cpu_dfl()
621 * Search for any idle CPU in the same LLC domain. in scx_select_cpu_dfl()
630 * Search for any idle CPU in the same NUMA node. in scx_select_cpu_dfl()
639 * Search for any idle CPU usable by the task. in scx_select_cpu_dfl()
641 * If the node-aware idle CPU selection policy is enabled in scx_select_cpu_dfl()
657 * Initialize global and per-node idle cpumasks.
663 /* Allocate global idle cpumasks */ in scx_idle_init_masks()
667 /* Allocate per-node idle cpumasks */ in scx_idle_init_masks()
677 BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); in scx_idle_init_masks()
678 BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); in scx_idle_init_masks()
681 /* Allocate local per-cpu idle cpumasks */ in scx_idle_init_masks()
692 static void update_builtin_idle(int cpu, bool idle) in update_builtin_idle() argument
695 struct cpumask *idle_cpus = idle_cpumask(node)->cpu; in update_builtin_idle()
697 assign_cpu(cpu, idle_cpus, idle); in update_builtin_idle()
702 struct cpumask *idle_smts = idle_cpumask(node)->smt; in update_builtin_idle()
704 if (idle) { in update_builtin_idle()
707 * only for optimization and self-correcting. in update_builtin_idle()
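
Editor's note: the fragments above set or clear @cpu in the per-node idle mask and then keep the SMT mask consistent: a core is added to it only when every sibling is idle, and removed as soon as any sibling goes busy. A condensed sketch of that bookkeeping (the racy-but-self-correcting update the comment refers to); idle_cpus and node are the locals shown earlier in update_builtin_idle():

    assign_cpu(cpu, idle_cpus, idle);

    if (sched_smt_active()) {
            const struct cpumask *smt = cpu_smt_mask(cpu);
            struct cpumask *idle_smts = idle_cpumask(node)->smt;

            if (idle) {
                    /*
                     * Mark the whole core idle only when every sibling is in
                     * the idle mask. This check is racy, but it is only an
                     * optimization and self-correcting.
                     */
                    if (cpumask_subset(smt, idle_cpus))
                            cpumask_or(idle_smts, idle_smts, smt);
            } else {
                    /* Any busy sibling means the core is not fully idle. */
                    cpumask_andnot(idle_smts, idle_smts, smt);
            }
    }
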
720 * Update the idle state of a CPU to @idle.
723 * scheduler of an actual idle state transition (idle to busy or vice
724 * versa). If @do_notify is false, only the idle state in the idle masks is
727 * This distinction is necessary, because an idle CPU can be "reserved" and
729 * busy even if no tasks are dispatched. In this case, the CPU may return
730 * to idle without a true state transition. Refreshing the idle masks
731 * without invoking ops.update_idle() ensures accurate idle state tracking
735 void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) in __scx_update_idle() argument
743 * Update the idle masks: in __scx_update_idle()
744 * - for real idle transitions (do_notify == true) in __scx_update_idle()
745 * - for idle-to-idle transitions (indicated by the previous task in __scx_update_idle()
746 * being the idle thread, managed by pick_task_idle()) in __scx_update_idle()
748 * Skip updating idle masks if the previous task is not the idle in __scx_update_idle()
750 * transitioning from a task to the idle thread (calling this in __scx_update_idle()
753 * In this way we can avoid updating the idle masks twice, in __scx_update_idle()
757 if (do_notify || is_idle_task(rq->curr)) in __scx_update_idle()
758 update_builtin_idle(cpu, idle); in __scx_update_idle()
762 * the idle thread and vice versa. in __scx_update_idle()
764 * Idle transitions are indicated by do_notify being set to true, in __scx_update_idle()
767 * This must come after builtin idle update so that BPF schedulers can in __scx_update_idle()
768 * create interlocking between ops.update_idle() and ops.enqueue() - in __scx_update_idle()
769 * either enqueue() sees the idle bit or update_idle() sees the task in __scx_update_idle()
773 SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); in __scx_update_idle()
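
Editor's note: on the BPF side, ops.update_idle() is the notification counterpart of this code and fires only for real transitions (do_notify == true). A minimal sketch of a scheduler using it to interlock with its enqueue path, assuming the usual scx/common.bpf.h declarations; SHARED_DSQ is a DSQ id this hypothetical scheduler created elsewhere:

    /* Sketch: kick a newly idle CPU if work is already waiting on a shared DSQ. */
    void BPF_STRUCT_OPS(sketch_update_idle, s32 cpu, bool idle)
    {
            if (!idle)
                    return;

            if (scx_bpf_dsq_nr_queued(SHARED_DSQ))
                    scx_bpf_kick_cpu(cpu, 0);
    }
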
781 * Consider all online cpus idle. Should converge to the actual state in reset_idle_masks()
784 if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { in reset_idle_masks()
785 cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); in reset_idle_masks()
786 cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); in reset_idle_masks()
793 cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); in reset_idle_masks()
794 cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); in reset_idle_masks()
801 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) in scx_idle_enable()
806 if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) in scx_idle_enable()
829 scx_kf_error("per-node idle tracking is disabled"); in validate_node()
830 return -EOPNOTSUPP; in validate_node()
833 /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ in validate_node()
835 return -ENOENT; in validate_node()
840 return -EINVAL; in validate_node()
846 return -EINVAL; in validate_node()
859 scx_kf_error("built-in idle tracking is disabled"); in check_builtin_idle_enabled()
871 return -EINVAL; in select_cpu_from_kfunc()
874 return -EBUSY; in select_cpu_from_kfunc()
878 * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. in select_cpu_from_kfunc()
887 return -EPERM; in select_cpu_from_kfunc()
892 * Validate locking correctness to access p->cpus_ptr and in select_cpu_from_kfunc()
893 * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; in select_cpu_from_kfunc()
894 * otherwise, assert that p->pi_lock is held. in select_cpu_from_kfunc()
897 lockdep_assert_held(&p->pi_lock); in select_cpu_from_kfunc()
902 * per-CPU tasks as well. For these tasks, we can skip all idle CPU in select_cpu_from_kfunc()
904 * used CPU is idle and within the allowed cpumask. in select_cpu_from_kfunc()
906 if (p->nr_cpus_allowed == 1) { in select_cpu_from_kfunc()
907 if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && in select_cpu_from_kfunc()
911 cpu = -EBUSY; in select_cpu_from_kfunc()
914 allowed ?: p->cpus_ptr, flags); in select_cpu_from_kfunc()
917 cpu = -EBUSY; in select_cpu_from_kfunc()
926 * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
943 * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
947 * @is_idle: out parameter indicating whether the returned CPU is idle
950 * context such as a BPF test_run() call, as long as built-in CPU selection
955 * currently idle and thus a good candidate for direct dispatching.
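
Editor's note: a typical ops.select_cpu() implementation simply defers to this kfunc and direct-dispatches when an idle CPU was found. A minimal sketch, assuming the standard scx/common.bpf.h declarations:

    s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
    {
            bool is_idle = false;
            s32 cpu;

            cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
            if (is_idle)
                    /* The picked CPU is idle: dispatch directly to its local DSQ. */
                    scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

            return cpu;
    }
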
973 * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
982 * context such as a BPF test_run() call, as long as built-in CPU selection
988 * Returns the selected idle CPU, which will be automatically awakened upon
990 * a negative value if no idle CPU is available.
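
Editor's note: the _and variant takes an extra cpumask that is intersected with the task's affinity, which is handy when a scheduler wants to confine wakeups to a subset of CPUs. A sketch, where big_cpus is a hypothetical struct bpf_cpumask the scheduler populated elsewhere and cast_mask() is the usual helper from the scx BPF headers:

    s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
    {
            s32 cpu;

            /* Only consider idle CPUs in both p->cpus_ptr and big_cpus. */
            cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, cast_mask(big_cpus), 0);
            if (cpu >= 0) {
                    scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
                    return cpu;
            }

            /* No suitable idle CPU: fall back to the previous CPU. */
            return prev_cpu;
    }
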
999 * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
1000 * idle-tracking per-CPU cpumask of a target NUMA node.
1003 * Returns an empty cpumask if idle tracking is not enabled, if @node is
1014 return idle_cpumask(node)->cpu; in scx_bpf_get_idle_cpumask_node()
1021 * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
1022 * per-CPU cpumask.
1024 * Returns an empty mask if idle tracking is not enabled, or running on a
1038 return idle_cpumask(NUMA_NO_NODE)->cpu; in scx_bpf_get_idle_cpumask()
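
Editor's note: callers must pair the getter with scx_bpf_put_idle_cpumask() (documented below). A sketch of checking whether a particular CPU is currently marked idle, using the bpf_cpumask_test_cpu() kfunc:

    static bool cpu_looks_idle(s32 cpu)
    {
            const struct cpumask *idle;
            bool ret;

            idle = scx_bpf_get_idle_cpumask();
            ret = bpf_cpumask_test_cpu(cpu, idle);
            scx_bpf_put_idle_cpumask(idle);

            return ret;
    }
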
1045 * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
1046 * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
1050 * Returns an empty cpumask if idle tracking is not enabled, if @node is
1062 return idle_cpumask(node)->smt; in scx_bpf_get_idle_smtmask_node()
1064 return idle_cpumask(node)->cpu; in scx_bpf_get_idle_smtmask_node()
1071 * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
1072 * per-physical-core cpumask. Can be used to determine if an entire physical
1075 * Returns an empty mask if idle tracking is not enabled, or running on a
1090 return idle_cpumask(NUMA_NO_NODE)->smt; in scx_bpf_get_idle_smtmask()
1092 return idle_cpumask(NUMA_NO_NODE)->cpu; in scx_bpf_get_idle_smtmask()
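
Editor's note: the SMT variant answers "is the whole physical core idle?" rather than "is this logical CPU idle?". A sketch reusing the same get/put pattern:

    static bool core_fully_idle(s32 cpu)
    {
            const struct cpumask *idle_smt;
            bool ret;

            idle_smt = scx_bpf_get_idle_smtmask();
            /* A bit is set only when every sibling of the core is idle. */
            ret = bpf_cpumask_test_cpu(cpu, idle_smt);
            scx_bpf_put_idle_cpumask(idle_smt);

            return ret;
    }
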
1099 * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
1100 * either the percpu, or SMT idle-tracking cpumask.
1107 * a reference to a global idle cpumask, which is read-only in the in scx_bpf_put_idle_cpumask()
1114 * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
1115 * @cpu: cpu to test and clear idle for
1117 * Returns %true if @cpu was idle and its idle state was successfully cleared.
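
Editor's note: this is the building block for manually claiming a CPU from BPF. For example, a scheduler that wants to keep a waking task on its previous CPU whenever possible might do the following inside ops.select_cpu() (sketch):

    /* Try to claim prev_cpu before searching anywhere else. */
    if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
            scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
            return prev_cpu;
    }
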
1135 * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
1140 * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
1142 * Returns the picked idle cpu number on success, or -%EBUSY if no matching
1164 * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
1168 * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
1169 * number on success. -%EBUSY if no matching cpu was found.
1171 * Idle CPU tracking may race against CPU scheduling state transitions. For
1172 * example, this function may return -%EBUSY as CPUs are transitioning into the
1173 * idle state. If the caller then assumes that there will be dispatch events on
1189 scx_kf_error("per-node idle tracking is enabled"); in scx_bpf_pick_idle_cpu()
1190 return -EBUSY; in scx_bpf_pick_idle_cpu()
1194 return -EBUSY; in scx_bpf_pick_idle_cpu()
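
Editor's note: a common pattern from ops.select_cpu() or ops.enqueue() is to first insist on a fully idle core and then retry for any idle sibling; the _node variant above is used instead when SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled. A sketch for the flat (non per-node) case:

    s32 cpu;

    /* Prefer a fully idle core anywhere in p's affinity... */
    cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
    if (cpu < 0)
            /* ...otherwise settle for any idle SMT sibling. */
            cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);

    if (cpu >= 0) {
            scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
            return cpu;
    }
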
1200 * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
1206 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1207 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
1208 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1214 * the CPU idle state).
1217 * set, this function can't tell which CPUs are idle and will always pick any
1240 return -EBUSY; in scx_bpf_pick_any_cpu_node()
1244 * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
1248 * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1249 * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
1250 * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1254 * set, this function can't tell which CPUs are idle and will always pick any
1266 scx_kf_error("per-node idle tracking is enabled"); in scx_bpf_pick_any_cpu()
1267 return -EBUSY; in scx_bpf_pick_any_cpu()
1280 return -EBUSY; in scx_bpf_pick_any_cpu()
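
Editor's note: unlike the pick_idle variants, these never fail on a non-empty @cpus_allowed, so they are useful when a target CPU is needed unconditionally, e.g. to kick a CPU for work queued on a shared DSQ. A sketch for an ops.enqueue() path; SHARED_DSQ is again a hypothetical scheduler-defined DSQ id:

    /* Queue on a shared DSQ, then make sure some allowed CPU will notice it. */
    s32 cpu;

    scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

    cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
    if (cpu >= 0)
            scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
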