// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/sched/core.c
 *
 * Core kernel CPU scheduler code
 *
 * Copyright (C) 1991-2002  Linus Torvalds
 * Copyright (C) 1998-2024  Ingo Molnar, Red Hat
 */
#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
#include <linux/ktime_api.h>
#include <linux/sched/signal.h>
#include <linux/syscalls_api.h>
#include <linux/debug_locks.h>
#include <linux/prefetch.h>
#include <linux/capability.h>
#include <linux/pgtable_api.h>
#include <linux/wait_bit.h>
#include <linux/jiffies.h>
#include <linux/spinlock_api.h>
#include <linux/cpumask_api.h>
#include <linux/lockdep_api.h>
#include <linux/hardirq.h>
#include <linux/softirq.h>
#include <linux/refcount_api.h>
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/rt.h>

#include <linux/blkdev.h>
#include <linux/context_tracking.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/interrupt.h>
#include <linux/ioprio.h>
#include <linux/kallsyms.h>
#include <linux/kcov.h>
#include <linux/kprobes.h>
#include <linux/llist_api.h>
#include <linux/mmu_context.h>
#include <linux/mmzone.h>
#include <linux/mutex_api.h>
#include <linux/nmi.h>
#include <linux/nospec.h>
#include <linux/perf_event_api.h>
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
#include <linux/rseq.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/vtime.h>
#include <linux/wait_api.h>
#include <linux/workqueue_api.h>
#include <linux/livepatch_sched.h>

#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_IRQ_ENTRY
#  include <linux/irq-entry-common.h>
# endif
#endif

#include <uapi/linux/sched/types.h>

#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>

#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS

#include "sched.h"
#include "stats.h"

#include "autogroup.h"
#include "pelt.h"
#include "smp.h"

#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
#include "../locking/mutex.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp);
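
/*
 * Illustrative sketch (not part of this file's logic): an external module
 * could attach a probe to one of the bare tracepoints exported above,
 * assuming the prototype declared for it in include/trace/events/sched.h:
 *
 *	static void my_pelt_cfs_probe(void *data, struct cfs_rq *cfs_rq)
 *	{
 *		...
 *	}
 *
 *	register_trace_pelt_cfs_tp(my_pelt_cfs_probe, NULL);
 *
 * The probe signature must match the tracepoint's TP_PROTO(); treat the
 * names above as assumptions, not a reference.
 */
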
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);

#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
static int __init setup_proxy_exec(char *str)
{
	bool proxy_enable = true;

	if (*str && kstrtobool(str + 1, &proxy_enable)) {
		pr_warn("Unable to parse sched_proxy_exec=\n");
		return 0;
	}

	if (proxy_enable) {
		pr_info("sched_proxy_exec enabled via boot arg\n");
		static_branch_enable(&__sched_proxy_exec);
	} else {
		pr_info("sched_proxy_exec disabled via boot arg\n");
		static_branch_disable(&__sched_proxy_exec);
	}
	return 1;
}
#else
static int __init setup_proxy_exec(char *str)
{
	pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
	return 0;
}
#endif
__setup("sched_proxy_exec", setup_proxy_exec);

/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constant propagation
 * at compile time and compiler optimization based on the features' default
 * values.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
__read_mostly unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
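
/*
 * Worked example of the construction above, with hypothetical features.h
 * entries (the real list lives in kernel/sched/features.h):
 *
 *	SCHED_FEAT(PLACE_LAG, true)
 *	SCHED_FEAT(RUN_TO_PARITY, false)
 *
 * expands to:
 *
 *	sysctl_sched_features =
 *		(1UL << __SCHED_FEAT_PLACE_LAG) * true |
 *		(1UL << __SCHED_FEAT_RUN_TO_PARITY) * false |
 *		0;
 *
 * i.e. a bitmask with one bit per enabled feature.
 */
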
/*
 * Print a warning if need_resched is set for the given duration (if
 * LATENCY_WARN is enabled).
 *
 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
 * per boot.
 */
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

__read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
static inline int __task_prio(const struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class) /* trumps deadline */
		return -2;

	if (p->dl_server)
		return -1; /* deadline */

	if (rt_or_dl_prio(p->prio))
		return p->prio; /* [-1, 99] */

	if (p->sched_class == &idle_sched_class)
		return MAX_RT_PRIO + NICE_WIDTH; /* 140 */

	if (task_on_scx(p))
		return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */

	return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
}

/*
 * l(a,b)
 * le(a,b) := !l(b,a)
 * g(a,b)  := l(b,a)
 * ge(a,b) := !l(a,b)
 */

/* real prio, less is less */
static inline bool prio_less(const struct task_struct *a,
			     const struct task_struct *b, bool in_fi)
{
	int pa = __task_prio(a), pb = __task_prio(b);

	if (-pa < -pb)
		return true;

	if (-pb < -pa)
		return false;

	if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */
		const struct sched_dl_entity *a_dl, *b_dl;

		a_dl = &a->dl;
		/*
		 * Since 'a' and 'b' can be CFS tasks served by a DL server,
		 * __task_prio() can return -1 (for DL) even for those. In that
		 * case, get to the dl_server's DL entity.
		 */
		if (a->dl_server)
			a_dl = a->dl_server;

		b_dl = &b->dl;
		if (b->dl_server)
			b_dl = b->dl_server;

		return !dl_time_before(a_dl->deadline, b_dl->deadline);
	}

	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
		return cfs_prio_less(a, b, in_fi);

#ifdef CONFIG_SCHED_CLASS_EXT
	if (pa == MAX_RT_PRIO + MAX_NICE + 1)	/* ext */
		return scx_prio_less(a, b, in_fi);
#endif

	return false;
}

static inline bool __sched_core_less(const struct task_struct *a,
				     const struct task_struct *b)
{
	if (a->core_cookie < b->core_cookie)
		return true;

	if (a->core_cookie > b->core_cookie)
		return false;

	/* flip prio, so high prio is leftmost */
	if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
		return true;

	return false;
}

#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)

static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
{
	return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
}

static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
{
	const struct task_struct *p = __node_2_sc(node);
	unsigned long cookie = (unsigned long)key;

	if (cookie < p->core_cookie)
		return -1;

	if (cookie > p->core_cookie)
		return 1;

	return 0;
}
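
/*
 * Illustrative ordering (assumed cookie values): with the comparators
 * above, rq->core_tree sorts tasks by ->core_cookie first and, within a
 * cookie, by priority with the highest priority leftmost:
 *
 *	(cookie=1, prio high) .. (cookie=1, prio low) .. (cookie=2, prio high) ..
 *
 * so sched_core_find() can locate a cookie's leftmost (best) task via
 * rb_find_first() and walk rightwards with sched_core_next().
 */
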
void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
	if (p->se.sched_delayed)
		return;

	rq->core->core_task_seq++;

	if (!p->core_cookie)
		return;

	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->se.sched_delayed)
		return;

	rq->core->core_task_seq++;

	if (sched_core_enqueued(p)) {
		rb_erase(&p->core_node, &rq->core_tree);
		RB_CLEAR_NODE(&p->core_node);
	}

	/*
	 * Migrating the last task off the CPU, with the CPU in forced idle
	 * state. Reschedule to create an accounting edge for forced idle,
	 * and re-examine whether the core is still in forced idle state.
	 */
	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
	    rq->core->core_forceidle_count && rq->curr == rq->idle)
		resched_curr(rq);
}

static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
	if (p->sched_class->task_is_throttled)
		return p->sched_class->task_is_throttled(p, cpu);

	return 0;
}

static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
	struct rb_node *node = &p->core_node;
	int cpu = task_cpu(p);

	do {
		node = rb_next(node);
		if (!node)
			return NULL;

		p = __node_2_sc(node);
		if (p->core_cookie != cookie)
			return NULL;

	} while (sched_task_is_throttled(p, cpu));

	return p;
}

/*
 * Find the leftmost (i.e., highest priority) unthrottled task matching
 * @cookie. If no suitable task is found, NULL will be returned.
 */
static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
	struct task_struct *p;
	struct rb_node *node;

	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
	if (!node)
		return NULL;

	p = __node_2_sc(node);
	if (!sched_task_is_throttled(p, rq->cpu))
		return p;

	return sched_core_next(p, cookie);
}

/*
 * Magic required such that:
 *
 *	raw_spin_rq_lock(rq);
 *	...
 *	raw_spin_rq_unlock(rq);
 *
 * ends up locking and unlocking the _same_ lock, and all CPUs
 * always agree on what rq has what lock.
 *
 * XXX entirely possible to selectively enable cores, don't bother for now.
 */

static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;

static void sched_core_lock(int cpu, unsigned long *flags)
	__context_unsafe(/* acquires multiple */)
	__acquires(&runqueues.__lock) /* overapproximation */
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t, i = 0;

	local_irq_save(*flags);
	for_each_cpu(t, smt_mask)
		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}

static void sched_core_unlock(int cpu, unsigned long *flags)
	__context_unsafe(/* releases multiple */)
	__releases(&runqueues.__lock) /* overapproximation */
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t;

	for_each_cpu(t, smt_mask)
		raw_spin_unlock(&cpu_rq(t)->__lock);
	local_irq_restore(*flags);
}
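
/*
 * Sketch of the intended pairing (an assumption, mirroring
 * __sched_core_flip() below): the pair takes every SMT sibling's
 * rq->__lock with IRQs disabled, so the whole core is quiesced:
 *
 *	unsigned long flags;
 *
 *	sched_core_lock(cpu, &flags);
 *	... all sibling runqueues locked, IRQs off ...
 *	sched_core_unlock(cpu, &flags);
 */
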
static void __sched_core_flip(bool enabled)
{
	unsigned long flags;
	int cpu, t;

	cpus_read_lock();

	/*
	 * Toggle the online cores, one by one.
	 */
	cpumask_copy(&sched_core_mask, cpu_online_mask);
	for_each_cpu(cpu, &sched_core_mask) {
		const struct cpumask *smt_mask = cpu_smt_mask(cpu);

		sched_core_lock(cpu, &flags);

		for_each_cpu(t, smt_mask)
			cpu_rq(t)->core_enabled = enabled;

		cpu_rq(cpu)->core->core_forceidle_start = 0;

		sched_core_unlock(cpu, &flags);

		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
	}

	/*
	 * Toggle the offline CPUs.
	 */
	for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
		cpu_rq(cpu)->core_enabled = enabled;

	cpus_read_unlock();
}

static void sched_core_assert_empty(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
}

static void __sched_core_enable(void)
{
	static_branch_enable(&__sched_core_enabled);
	/*
	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
	 * and future ones will observe !sched_core_disabled().
	 */
	synchronize_rcu();
	__sched_core_flip(true);
	sched_core_assert_empty();
}

static void __sched_core_disable(void)
{
	sched_core_assert_empty();
	__sched_core_flip(false);
	static_branch_disable(&__sched_core_enabled);
}

void sched_core_get(void)
{
	if (atomic_inc_not_zero(&sched_core_count))
		return;

	mutex_lock(&sched_core_mutex);
	if (!atomic_read(&sched_core_count))
		__sched_core_enable();

	smp_mb__before_atomic();
	atomic_inc(&sched_core_count);
	mutex_unlock(&sched_core_mutex);
}

static void __sched_core_put(struct work_struct *work)
{
	if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
		__sched_core_disable();
		mutex_unlock(&sched_core_mutex);
	}
}

void sched_core_put(void)
{
	static DECLARE_WORK(_work, __sched_core_put);

	/*
	 * "There can be only one"
	 *
	 * Either this is the last one, or we don't actually need to do any
	 * 'work'. If it is the last *again*, we rely on
	 * WORK_STRUCT_PENDING_BIT.
	 */
	if (!atomic_add_unless(&sched_core_count, -1, 1))
		schedule_work(&_work);
}

#else /* !CONFIG_SCHED_CORE: */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* !CONFIG_SCHED_CORE */

/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);

/* Call via the helper macro trace_set_current_state. */
void __trace_set_current_state(int state_value)
{
	trace_sched_set_state_tp(current, state_value);
}
EXPORT_SYMBOL(__trace_set_current_state);
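
/*
 * Usage sketch (assumed caller; cf. the core-sched cookie code in
 * kernel/sched/core_sched.c): the refcount pattern above means the first
 * user pays for the expensive flip and the last user undoes it:
 *
 *	sched_core_get();	// first cookie: enables core scheduling
 *	...
 *	sched_core_put();	// last cookie: disables it via workqueue
 */
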
/*
 * Serialization rules:
 *
 * Lock order:
 *
 *   p->pi_lock
 *     rq->lock
 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 *
 *   rq1->lock
 *     rq2->lock  where: rq1 < rq2
 *
 * Regular state:
 *
 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
 * local CPU's rq->lock, it optionally removes the task from the runqueue and
 * always looks at the local rq data structures to find the most eligible task
 * to run next.
 *
 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
 * the local CPU to avoid bouncing the runqueue state around [ see
 * ttwu_queue_wakelist() ].
 *
 * Task wakeups, specifically wakeups that involve migration, are horribly
 * complicated to avoid having to take two rq->locks.
 *
 * Special state:
 *
 * System-calls and anything external will use task_rq_lock() which acquires
 * both p->pi_lock and rq->lock. As a consequence the state they change is
 * stable while holding either lock:
 *
 *  - sched_setaffinity()/
 *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
 *  - set_user_nice():		p->se.load, p->*prio
 *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
 *				p->se.load, p->rt_priority,
 *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
 *  - sched_setnuma():		p->numa_preferred_nid
 *  - sched_move_task():	p->sched_task_group
 *  - uclamp_update_active():	p->uclamp*
 *
 * p->state <- TASK_*:
 *
 *   is changed locklessly using set_current_state(), __set_current_state() or
 *   set_special_state(), see their respective comments, or by
 *   try_to_wake_up(). The latter uses p->pi_lock to serialize against
 *   concurrent self.
 *
 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 *
 *   is set by activate_task() and cleared by deactivate_task()/block_task(),
 *   under rq->lock. Non-zero indicates the task is runnable, the special
 *   ON_RQ_MIGRATING state is used for migration without holding both
 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 *
 *   Additionally it is possible to be ->on_rq but still be considered not
 *   runnable when p->se.sched_delayed is true. These tasks are on the runqueue
 *   but will be dequeued as soon as they get picked again. See the
 *   task_is_runnable() helper.
 *
 * p->on_cpu <- { 0, 1 }:
 *
 *   is set by prepare_task() and cleared by finish_task() such that it will be
 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 *   under rq->lock. Non-zero indicates the task is running on its CPU.
 *
 *   [ The astute reader will observe that it is possible for two tasks on one
 *     CPU to have ->on_cpu = 1 at the same time. ]
 *
 * task_cpu(p): is changed by set_task_cpu(), the rules are:
 *
 *  - Don't call set_task_cpu() on a blocked task:
 *
 *    We don't care what CPU we're not running on, this simplifies hotplug,
 *    the CPU assignment of blocked tasks isn't required to be valid.
 *
 *  - for try_to_wake_up(), called under p->pi_lock:
 *
 *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
 *
 *  - for migration called under rq->lock:
 *    [ see task_on_rq_migrating() in task_rq_lock() ]
 *
 *    o move_queued_task()
 *    o detach_task()
 *
 *  - for migration called under double_rq_lock():
 *
 *    o __migrate_swap_task()
 *    o push_rt_task() / pull_rt_task()
 *    o push_dl_task() / pull_dl_task()
 *    o dl_task_offline_migration()
 *
 */

void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
	__context_unsafe()
{
	raw_spinlock_t *lock;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		raw_spin_lock_nested(&rq->__lock, subclass);
		/* preempt_count *MUST* be > 1 */
		preempt_enable_no_resched();
		return;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		raw_spin_lock_nested(lock, subclass);
		if (likely(lock == __rq_lockp(rq))) {
			/* preempt_count *MUST* be > 1 */
			preempt_enable_no_resched();
			return;
		}
		raw_spin_unlock(lock);
	}
}

bool raw_spin_rq_trylock(struct rq *rq)
	__context_unsafe()
{
	raw_spinlock_t *lock;
	bool ret;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		ret = raw_spin_trylock(&rq->__lock);
		preempt_enable();
		return ret;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		ret = raw_spin_trylock(lock);
		if (!ret || (likely(lock == __rq_lockp(rq)))) {
			preempt_enable();
			return ret;
		}
		raw_spin_unlock(lock);
	}
}

/*
 * double_rq_lock - safely lock two runqueues
 */
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	lockdep_assert_irqs_disabled();

	if (rq_order_less(rq2, rq1))
		swap(rq1, rq2);

	raw_spin_rq_lock(rq1);
	if (__rq_lockp(rq1) != __rq_lockp(rq2))
		raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
	else
		__acquire_ctx_lock(__rq_lockp(rq2)); /* fake acquire */

	double_rq_clock_clear_update(rq1, rq2);
}

/*
 * ___task_rq_lock - lock the rq @p resides on.
 */
struct rq *___task_rq_lock(struct task_struct *p, struct rq_flags *rf)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *_task_rq_lock(struct task_struct *p, struct rq_flags *rf)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
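
/*
 * Usage sketch: callers either use the explicit pair
 *
 *	struct rq_flags rf;
 *	struct rq *rq = task_rq_lock(p, &rf);
 *	...
 *	task_rq_unlock(rq, p, &rf);
 *
 * or the scoped guard form used elsewhere in this file, e.g. in
 * uclamp_update_util_min_rt_default() below:
 *
 *	guard(task_rq_lock)(p);
 */
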
/*
 * RQ-clock updating methods:
 */

/* Use CONFIG_PARAVIRT as this will avoid more #ifdef in arch code. */
#ifdef CONFIG_PARAVIRT
struct static_key paravirt_steal_rq_enabled;
#endif

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
	/*
	 * In theory, the compiler should just see 0 here, and optimize out the
	 * call to sched_rt_avg_update. But I don't trust it...
	 */
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	if (irqtime_enabled()) {
		irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

		/*
		 * Since irq_time is only updated on {soft,}irq_exit, we might run into
		 * this case when a previous update_rq_clock() happened inside a
		 * {soft,}IRQ region.
		 *
		 * When this happens, we stop ->clock_task and only update the
		 * prev_irq_time stamp to account for the part that fit, so that a next
		 * update will consume the rest. This ensures ->clock_task is
		 * monotonic.
		 *
		 * It does however cause some slight misattribution of {soft,}IRQ
		 * time; a more accurate solution would be to update the irq_time using
		 * the current rq->clock timestamp, except that would require using
		 * atomic ops.
		 */
		if (irq_delta > delta)
			irq_delta = delta;

		rq->prev_irq_time += irq_delta;
		delta -= irq_delta;
		delayacct_irq(rq->curr, irq_delta);
	}
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		u64 prev_steal;

		steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq = prev_steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;
	u64 clock;

	lockdep_assert_rq_held(rq);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

	if (sched_feat(WARN_DOUBLE_CLOCK))
		WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;

	clock = sched_clock_cpu(cpu_of(rq));
	scx_rq_clock_update(rq, clock);

	delta = clock - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;

	update_rq_clock_task(rq, delta);
}
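
/*
 * Worked example (illustrative numbers): if 3,000,000 ns of wall clock
 * elapsed since the last update, of which 1,000,000 ns were spent in
 * {soft,}IRQ context and 500,000 ns were stolen by the hypervisor, then
 * rq->clock advances by the full 3,000,000 ns while rq->clock_task only
 * advances by 1,500,000 ns; task runtime accounting and PELT thus only
 * see the time actually available to tasks.
 */
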
#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

enum {
	HRTICK_SCHED_NONE		= 0,
	HRTICK_SCHED_DEFER		= BIT(1),
	HRTICK_SCHED_START		= BIT(2),
	HRTICK_SCHED_REARM_HRTIMER	= BIT(3)
};

static void __used hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->donor->sched_class->task_tick(rq, rq->donor, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

static inline bool hrtick_needs_rearm(struct hrtimer *timer, ktime_t expires)
{
	/*
	 * Queued is false when the timer is not started or is currently
	 * running the callback. In both cases, restart. If queued, check
	 * whether the expiry time actually changes substantially.
	 */
	return !hrtimer_is_queued(timer) ||
		abs(expires - hrtimer_get_expires(timer)) > 5000;
}

static void hrtick_cond_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = rq->hrtick_time;

	if (hrtick_needs_rearm(timer, time))
		hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	hrtick_cond_restart(rq);
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and IRQs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000 ns; that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);

	/*
	 * If this is in the middle of schedule() only note the delay
	 * and let hrtick_schedule_exit() deal with it.
	 */
	if (rq->hrtick_sched) {
		rq->hrtick_sched |= HRTICK_SCHED_START;
		rq->hrtick_delay = delta;
		return;
	}

	rq->hrtick_time = ktime_add_ns(ktime_get(), delta);
	if (!hrtick_needs_rearm(&rq->hrtick_timer, rq->hrtick_time))
		return;

	if (rq == this_rq())
		hrtimer_start(&rq->hrtick_timer, rq->hrtick_time, HRTIMER_MODE_ABS_PINNED_HARD);
	else
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
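
/*
 * Example (illustrative values): hrtick_start(rq, 3000) arms the timer
 * 10,000 ns out because the 3,000 ns request is clamped to the 10 us
 * minimum above, while hrtick_start(rq, 50000) arms it 50 us out. A
 * remote rq gets the timer armed via the IPI path (__hrtick_start()),
 * since a pinned hrtimer has to be started on its own CPU.
 */
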
static inline void hrtick_schedule_enter(struct rq *rq)
{
	rq->hrtick_sched = HRTICK_SCHED_DEFER;
	if (hrtimer_test_and_clear_rearm_deferred())
		rq->hrtick_sched |= HRTICK_SCHED_REARM_HRTIMER;
}

static inline void hrtick_schedule_exit(struct rq *rq)
{
	if (rq->hrtick_sched & HRTICK_SCHED_START) {
		rq->hrtick_time = ktime_add_ns(ktime_get(), rq->hrtick_delay);
		hrtick_cond_restart(rq);
	} else if (idle_rq(rq)) {
		/*
		 * No need for using hrtimer_is_active(). The timer is CPU local
		 * and interrupts are disabled, so the callback cannot be
		 * running and the queued state is valid.
		 */
		if (hrtimer_is_queued(&rq->hrtick_timer))
			hrtimer_cancel(&rq->hrtick_timer);
	}

	if (rq->hrtick_sched & HRTICK_SCHED_REARM_HRTIMER)
		__hrtimer_rearm_deferred();

	rq->hrtick_sched = HRTICK_SCHED_NONE;
}

static void hrtick_rq_init(struct rq *rq)
{
	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
	rq->hrtick_sched = HRTICK_SCHED_NONE;
	hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL_HARD | HRTIMER_MODE_LAZY_REARM);
}
#else /* !CONFIG_SCHED_HRTICK: */
static inline void hrtick_clear(struct rq *rq) { }
static inline void hrtick_rq_init(struct rq *rq) { }
static inline void hrtick_schedule_enter(struct rq *rq) { }
static inline void hrtick_schedule_exit(struct rq *rq) { }
#endif /* !CONFIG_SCHED_HRTICK */

/*
 * try_cmpxchg based fetch_or() macro so it works for different integer types:
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _val = *_ptr;				\
									\
		do {							\
		} while (!try_cmpxchg(_ptr, &_val, _val | _mask));	\
	_val;								\
})
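
/*
 * Example: fetch_or(&ti->flags, _TIF_NEED_RESCHED) atomically ORs the
 * bit into ti->flags and evaluates to the *previous* flags value, so the
 * polling helpers below can tell whether TIF_POLLING_NRFLAG was set at
 * that point; the typeof() dance makes this work whether the flags word
 * is a 32- or 64-bit type.
 */
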
#ifdef TIF_POLLING_NRFLAG
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
	return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) val = READ_ONCE(ti->flags);

	do {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
	} while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));

	return true;
}

#else
static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
	set_ti_thread_flag(ti, tif);
	return true;
}

static inline bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task; if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * In order to ensure that a pending wakeup will observe our pending
	 * state, even in the failed case, an explicit smp_mb() must be used.
	 */
	smp_mb__before_atomic();
	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
		return false;

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
	return true;
}

/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (__wake_q_add(head, task))
		get_task_struct(task);
}

/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold a reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending on whether or not the @task is
 * already queued for wakeup.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		put_task_struct(task);
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		node = node->next;
		/* pairs with cmpxchg_relaxed() in __wake_q_add() */
		WRITE_ONCE(task->wake_q.next, NULL);
		/* Task can safely be re-inserted now. */

		/*
		 * wake_up_process() executes a full barrier, which pairs with
		 * the queueing in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
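
/*
 * Typical usage sketch (assumed caller; the futex and locking code use
 * this pattern): defer the actual wakeups until after a hot lock has
 * been dropped:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	spin_lock(&some_lock);
 *	...
 *	wake_q_add(&wake_q, task);	// takes a task reference
 *	spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);		// wakes and drops the references
 */
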
/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
static void __resched_curr(struct rq *rq, int tif)
{
	struct task_struct *curr = rq->curr;
	struct thread_info *cti = task_thread_info(curr);
	int cpu;

	lockdep_assert_rq_held(rq);

	/*
	 * Always immediately preempt the idle task; no point in delaying doing
	 * actual work.
	 */
	if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
		tif = TIF_NEED_RESCHED;

	if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
		return;

	cpu = cpu_of(rq);

	trace_sched_set_need_resched_tp(curr, cpu, tif);
	if (cpu == smp_processor_id()) {
		set_ti_thread_flag(cti, tif);
		if (tif == TIF_NEED_RESCHED)
			set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(cti, tif)) {
		if (tif == TIF_NEED_RESCHED)
			smp_send_reschedule(cpu);
	} else {
		trace_sched_wake_idle_without_ipi(cpu);
	}
}

void __trace_set_need_resched(struct task_struct *curr, int tif)
{
	trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
}
EXPORT_SYMBOL_GPL(__trace_set_need_resched);

void resched_curr(struct rq *rq)
{
	__resched_curr(rq, TIF_NEED_RESCHED);
}

#ifdef CONFIG_PREEMPT_DYNAMIC
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
static __always_inline bool dynamic_preempt_lazy(void)
{
	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
}
#else
static __always_inline bool dynamic_preempt_lazy(void)
{
	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
}
#endif

static __always_inline int get_lazy_tif_bit(void)
{
	if (dynamic_preempt_lazy())
		return TIF_NEED_RESCHED_LAZY;

	return TIF_NEED_RESCHED;
}

void resched_curr_lazy(struct rq *rq)
{
	__resched_curr(rq, get_lazy_tif_bit());
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_rq_unlock_irqrestore(rq, flags);
}
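
/*
 * Sketch of the two flavours (assuming lazy preemption is in effect):
 *
 *	resched_curr(rq);	// TIF_NEED_RESCHED: preempt ASAP, IPI a
 *				// remote CPU unless it is polling
 *	resched_curr_lazy(rq);	// TIF_NEED_RESCHED_LAZY: picked up at the
 *				// next tick or return to user space
 *
 * When lazy preemption is not enabled, resched_curr_lazy() degrades to
 * resched_curr().
 */
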
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU. This is good for power-savings.
 *
 * We don't do a similar optimization for a completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be up to date wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id(), default_cpu = -1;
	struct sched_domain *sd;
	const struct cpumask *hk_mask;

	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
		if (!idle_cpu(cpu))
			return cpu;
		default_cpu = cpu;
	}

	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);

	guard(rcu)();

	for_each_domain(cpu, sd) {
		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i))
				return i;
		}
	}

	if (default_cpu == -1)
		default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);

	return default_cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	/*
	 * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
	 * part of the idle loop. This forces an exit from the idle loop
	 * and a round trip to schedule(). Now this could be optimized
	 * because a simple new idle loop iteration is enough to
	 * re-evaluate the next tick, provided some re-ordering of the
	 * tick nohz functions that would need to follow
	 * TIF_POLLING_NRFLAG clearing:
	 *
	 * - On most architectures, a simple fetch_or on ti::flags with a
	 *   "0" value would be enough to know if an IPI needs to be sent.
	 *
	 * - x86 needs to perform a last need_resched() check between
	 *   monitor and mwait which doesn't take timers into account.
	 *   There a dedicated TIF_TIMER flag would be required to
	 *   fetch_or here and be checked along with TIF_NEED_RESCHED
	 *   before mwait().
	 *
	 * However, remote timer enqueue is not such a frequent event
	 * and testing of the above solutions didn't appear to report
	 * much benefit.
	 */
	if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true; /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU. If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static void nohz_csd_func(void *info)
{
	struct rq *rq = info;
	int cpu = cpu_of(rq);
	unsigned int flags;

	/*
	 * Release the rq::nohz_csd.
	 */
	flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
	WARN_ON(!(flags & NOHZ_KICK_MASK));

	rq->idle_balance = idle_cpu(cpu);
	if (rq->idle_balance) {
		rq->nohz_idle_balance = flags;
		__raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
{
	if (rq->nr_running != 1)
		return false;

	if (p->sched_class != &fair_sched_class)
		return false;

	if (!task_on_rq_queued(p))
		return false;

	return true;
}

bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there is more than one RR task, we need the tick to affect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there are no RR tasks, but there are FIFO tasks, we can skip the
	 * tick: there is no forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL/RR/FIFO tasks, there must only be CFS or SCX tasks
	 * left. For CFS, if there's more than one we need the tick for
	 * involuntary preemption. For SCX, ask.
	 */
	if (scx_enabled() && !scx_can_stop_tick(rq))
		return false;

	if (rq->cfs.h_nr_queued > 1)
		return false;

	/*
	 * If there is one task and it has CFS runtime bandwidth constraints
	 * and it's on the CPU now we don't want to stop the tick.
	 * This check prevents clearing the bit if a newly enqueued task here is
	 * dequeued by migrating while the constrained task continues to run.
	 * E.g. going from 2->1 without going through pick_next_task().
	 */
	if (__need_bw_check(rq, rq->curr)) {
		if (cfs_task_bw_constrained(rq->curr))
			return false;
	}

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
/*
 * Iterate the task_group tree rooted at *from, calling @down when first
 * entering a node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight lw;

	if (task_has_idle_policy(p)) {
		lw.weight = scale_load(WEIGHT_IDLEPRIO);
		lw.inv_weight = WMULT_IDLEPRIO;
	} else {
		lw.weight = scale_load(sched_prio_to_weight[prio]);
		lw.inv_weight = sched_prio_to_wmult[prio];
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight.
	 */
	if (update_load && p->sched_class->reweight_task)
		p->sched_class->reweight_task(task_rq(p), p, &lw);
	else
		p->se.load = lw;
}
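
/*
 * Example (values from the sched_prio_to_weight[] table): a nice-0 task
 * gets lw.weight = 1024 (further scaled by scale_load() on 64-bit),
 * nice -20 maps to 88761 and nice 19 to 15, i.e. each nice level changes
 * the weight by roughly 1.25x.
 */
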
#ifdef CONFIG_UCLAMP_TASK
/*
 * Serializes updates of utilization clamp values
 *
 * The (slow-path) user-space triggers utilization clamp value updates which
 * can require updates on (fast-path) scheduler's data structures used to
 * support enqueue/dequeue operations.
 * While the per-CPU rq lock protects fast-path update operations, user-space
 * requests are serialized using a mutex to reduce the risk of conflicting
 * updates or API abuses.
 */
static __maybe_unused DEFINE_MUTEX(uclamp_mutex);

/* Max allowed minimum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Max allowed maximum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * By default RT tasks run at the maximum performance point/capacity of the
 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
 * SCHED_CAPACITY_SCALE.
 *
 * This knob allows admins to change the default behavior when uclamp is being
 * used. In battery powered devices, particularly, running at the maximum
 * capacity and frequency will increase energy consumption and shorten the
 * battery life.
 *
 * This knob only affects RT tasks for which uclamp_se->user_defined == false.
 *
 * This knob will not override the system default sched_util_clamp_min defined
 * above.
 */
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less than or equal to these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * This static key is used to reduce the uclamp overhead in the fast path. It
 * primarily disables the call to uclamp_rq_{inc, dec}() in
 * enqueue/dequeue_task().
 *
 * This allows users to continue to enable uclamp in their kernel config with
 * minimum uclamp overhead in the fast path.
 *
 * As soon as userspace modifies any of the uclamp knobs, the static key is
 * enabled, since we then have actual users of the uclamp functionality.
 *
 * The knobs that would enable this static key are:
 *
 *   * A task modifying its uclamp value with sched_setattr().
 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);

static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
		  unsigned int clamp_value)
{
	/*
	 * Avoid blocked utilization pushing up the frequency when we go
	 * idle (which drops the max-clamp) by retaining the last known
	 * max-clamp.
	 */
	if (clamp_id == UCLAMP_MAX) {
		rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
		return clamp_value;
	}

	return uclamp_none(UCLAMP_MIN);
}

static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
				     unsigned int clamp_value)
{
	/* Reset max-clamp retention only on idle exit */
	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		return;

	uclamp_rq_set(rq, clamp_id, clamp_value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
				 unsigned int clamp_value)
{
	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
	int bucket_id = UCLAMP_BUCKETS - 1;

	/*
	 * Since both min and max clamps are max aggregated, find the
	 * topmost bucket with tasks in it.
	 */
	for ( ; bucket_id >= 0; bucket_id--) {
		if (!bucket[bucket_id].tasks)
			continue;
		return bucket[bucket_id].value;
	}

	/* No tasks -- default clamp values */
	return uclamp_idle_value(rq, clamp_id, clamp_value);
}

static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
	unsigned int default_util_min;
	struct uclamp_se *uc_se;

	lockdep_assert_held(&p->pi_lock);

	uc_se = &p->uclamp_req[UCLAMP_MIN];

	/* Only sync if user didn't override the default */
	if (uc_se->user_defined)
		return;

	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
	uclamp_se_set(uc_se, default_util_min, false);
}

static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
	if (!rt_task(p))
		return;

	/* Protect updates to p->uclamp_* */
	guard(task_rq_lock)(p);
	__uclamp_update_util_min_rt_default(p);
}

static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
	/* Copy by value as we could modify it */
	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
	unsigned int tg_min, tg_max, value;

	/*
	 * Tasks in autogroups or the root task group will be
	 * restricted by system defaults.
	 */
	if (task_group_is_autogroup(task_group(p)))
		return uc_req;
	if (task_group(p) == &root_task_group)
		return uc_req;

	tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
	tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
	value = uc_req.value;
	value = clamp(value, tg_min, tg_max);
	uclamp_se_set(&uc_req, value, false);
#endif

	return uc_req;
}

/*
 * The effective clamp bucket index of a task depends on, by increasing
 * priority:
 * - the task specific clamp value, when explicitly requested from userspace
 * - the task group effective clamp value, for tasks not either in the root
 *   group or in an autogroup
 * - the system default clamp value, defined by the sysadmin
 */
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
	struct uclamp_se uc_max = uclamp_default[clamp_id];

	/* System default restrictions always apply */
	if (unlikely(uc_req.value > uc_max.value))
		return uc_max;

	return uc_req;
}

unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_eff;

	/* Task currently refcounted: use back-annotated (effective) value */
	if (p->uclamp[clamp_id].active)
		return (unsigned long)p->uclamp[clamp_id].value;

	uc_eff = uclamp_eff_get(p, clamp_id);

	return (unsigned long)uc_eff.value;
}
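
/*
 * Worked example (illustrative numbers): a task requests
 * uclamp.min = 600, its cgroup allows [0, 512] and the system default
 * cap is 1024. uclamp_tg_restrict() clamps the request 600 -> 512, and
 * uclamp_eff_get() keeps 512 since it does not exceed the system
 * default; the task's effective UCLAMP_MIN is therefore 512.
 */
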
/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space; we track
 * within each bucket the maximum value for the tasks refcounted in it.
 * This "local max aggregation" allows tracking the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;

	lockdep_assert_rq_held(rq);

	/* Update task effective clamp */
	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);

	bucket = &uc_rq->bucket[uc_se->bucket_id];
	bucket->tasks++;
	uc_se->active = true;

	uclamp_idle_reset(rq, clamp_id, uc_se->value);

	/*
	 * Local max aggregation: rq buckets always track the max
	 * "requested" clamp value of its RUNNABLE tasks.
	 */
	if (bucket->tasks == 1 || uc_se->value > bucket->value)
		bucket->value = uc_se->value;

	if (uc_se->value > uclamp_rq_get(rq, clamp_id))
		uclamp_rq_set(rq, clamp_id, uc_se->value);
}

/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;
	unsigned int bkt_clamp;
	unsigned int rq_clamp;

	lockdep_assert_rq_held(rq);

	/*
	 * If sched_uclamp_used was enabled after task @p was enqueued,
	 * we could end up with an unbalanced call to uclamp_rq_dec_id().
	 *
	 * In this case the uc_se->active flag should be false since no uclamp
	 * accounting was performed at enqueue time and we can just return
	 * here.
	 *
	 * Need to be careful of the following enqueue/dequeue ordering
	 * problem too
	 *
	 *	enqueue(taskA)
	 *	// sched_uclamp_used gets enabled
	 *	enqueue(taskB)
	 *	dequeue(taskA)
	 *	// Must not decrement bucket->tasks here
	 *	dequeue(taskB)
	 *
	 * where we could end up with stale data in uc_se and
	 * bucket[uc_se->bucket_id].
	 *
	 * The following check here eliminates the possibility of such a race.
	 */
	if (unlikely(!uc_se->active))
		return;

	bucket = &uc_rq->bucket[uc_se->bucket_id];

	WARN_ON_ONCE(!bucket->tasks);
	if (likely(bucket->tasks))
		bucket->tasks--;

	uc_se->active = false;

	/*
	 * Keep "local max aggregation" simple and accept (possibly)
	 * overboosting some RUNNABLE tasks in the same bucket.
	 * The rq clamp bucket value is reset to its base value whenever
	 * there are no more RUNNABLE tasks refcounting it.
	 */
	if (likely(bucket->tasks))
		return;

	rq_clamp = uclamp_rq_get(rq, clamp_id);
	/*
	 * Defensive programming: this should never happen. If it happens,
	 * e.g. due to future modification, warn and fix up the expected value.
	 */
	WARN_ON_ONCE(bucket->value > rq_clamp);
	if (bucket->value >= rq_clamp) {
		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
		uclamp_rq_set(rq, clamp_id, bkt_clamp);
	}
}
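
/*
 * Worked example of the overboost accepted above (illustrative numbers):
 * two RUNNABLE tasks land in the same UCLAMP_MIN bucket requesting 300
 * and 350; bucket->value holds the max, 350. When the 350 task is
 * dequeued, bucket->tasks is still non-zero, so the bucket keeps the
 * value 350 and the remaining 300 task is (slightly) overboosted until
 * the bucket empties and its value is reset.
 */
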
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!uclamp_is_used())
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	/* Only account a delayed task when it is actually being woken up. */
	if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_inc_id(rq, p, clamp_id);

	/* Reset clamp idle holding when there is one RUNNABLE task */
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!uclamp_is_used())
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	if (p->se.sched_delayed)
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_dec_id(rq, p, clamp_id);
}

static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
				      enum uclamp_id clamp_id)
{
	if (!p->uclamp[clamp_id].active)
		return;

	uclamp_rq_dec_id(rq, p, clamp_id);
	uclamp_rq_inc_id(rq, p, clamp_id);

	/*
	 * Make sure to clear the idle flag if we've transiently reached 0
	 * active tasks on the rq.
	 */
	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void
uclamp_update_active(struct task_struct *p)
{
	enum uclamp_id clamp_id;
	struct rq_flags rf;
	struct rq *rq;

	/*
	 * Lock the task and the rq where the task is (or was) queued.
	 *
	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
	 * price to pay to safely serialize util_{min,max} updates with
	 * enqueues, dequeues and migration operations.
	 * This is the same locking schema used by __set_cpus_allowed_ptr().
	 */
	rq = task_rq_lock(p, &rf);

	/*
	 * Setting the clamp bucket is serialized by task_rq_lock().
	 * If the task is not yet RUNNABLE and its task_struct is not
	 * affecting a valid clamp bucket, the next time it's enqueued,
	 * it will already see the updated clamp bucket value.
	 */
	for_each_clamp_id(clamp_id)
		uclamp_rq_reinc_id(rq, p, clamp_id);

	task_rq_unlock(rq, p, &rf);
}

1922 */ 1923 for_each_clamp_id(clamp_id) 1924 uclamp_rq_reinc_id(rq, p, clamp_id); 1925 1926 task_rq_unlock(rq, p, &rf); 1927 } 1928 1929 #ifdef CONFIG_UCLAMP_TASK_GROUP 1930 static inline void 1931 uclamp_update_active_tasks(struct cgroup_subsys_state *css) 1932 { 1933 struct css_task_iter it; 1934 struct task_struct *p; 1935 1936 css_task_iter_start(css, 0, &it); 1937 while ((p = css_task_iter_next(&it))) 1938 uclamp_update_active(p); 1939 css_task_iter_end(&it); 1940 } 1941 1942 static void cpu_util_update_eff(struct cgroup_subsys_state *css); 1943 #endif 1944 1945 #ifdef CONFIG_SYSCTL 1946 #ifdef CONFIG_UCLAMP_TASK_GROUP 1947 static void uclamp_update_root_tg(void) 1948 { 1949 struct task_group *tg = &root_task_group; 1950 1951 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], 1952 sysctl_sched_uclamp_util_min, false); 1953 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], 1954 sysctl_sched_uclamp_util_max, false); 1955 1956 guard(rcu)(); 1957 cpu_util_update_eff(&root_task_group.css); 1958 } 1959 #else 1960 static void uclamp_update_root_tg(void) { } 1961 #endif 1962 1963 static void uclamp_sync_util_min_rt_default(void) 1964 { 1965 struct task_struct *g, *p; 1966 1967 /* 1968 * copy_process() sysctl_uclamp 1969 * uclamp_min_rt = X; 1970 * write_lock(&tasklist_lock) read_lock(&tasklist_lock) 1971 * // link thread smp_mb__after_spinlock() 1972 * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); 1973 * sched_post_fork() for_each_process_thread() 1974 * __uclamp_sync_rt() __uclamp_sync_rt() 1975 * 1976 * Ensures that either sched_post_fork() will observe the new 1977 * uclamp_min_rt or for_each_process_thread() will observe the new 1978 * task. 1979 */ 1980 read_lock(&tasklist_lock); 1981 smp_mb__after_spinlock(); 1982 read_unlock(&tasklist_lock); 1983 1984 guard(rcu)(); 1985 for_each_process_thread(g, p) 1986 uclamp_update_util_min_rt_default(p); 1987 } 1988 1989 static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, 1990 void *buffer, size_t *lenp, loff_t *ppos) 1991 { 1992 bool update_root_tg = false; 1993 int old_min, old_max, old_min_rt; 1994 int result; 1995 1996 guard(mutex)(&uclamp_mutex); 1997 1998 old_min = sysctl_sched_uclamp_util_min; 1999 old_max = sysctl_sched_uclamp_util_max; 2000 old_min_rt = sysctl_sched_uclamp_util_min_rt_default; 2001 2002 result = proc_dointvec(table, write, buffer, lenp, ppos); 2003 if (result) 2004 goto undo; 2005 if (!write) 2006 return 0; 2007 2008 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || 2009 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || 2010 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { 2011 2012 result = -EINVAL; 2013 goto undo; 2014 } 2015 2016 if (old_min != sysctl_sched_uclamp_util_min) { 2017 uclamp_se_set(&uclamp_default[UCLAMP_MIN], 2018 sysctl_sched_uclamp_util_min, false); 2019 update_root_tg = true; 2020 } 2021 if (old_max != sysctl_sched_uclamp_util_max) { 2022 uclamp_se_set(&uclamp_default[UCLAMP_MAX], 2023 sysctl_sched_uclamp_util_max, false); 2024 update_root_tg = true; 2025 } 2026 2027 if (update_root_tg) { 2028 sched_uclamp_enable(); 2029 uclamp_update_root_tg(); 2030 } 2031 2032 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { 2033 sched_uclamp_enable(); 2034 uclamp_sync_util_min_rt_default(); 2035 } 2036 2037 /* 2038 * We update all RUNNABLE tasks only when task groups are in use. 2039 * Otherwise, keep it simple and do just a lazy update at each next 2040 * task enqueue time. 
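 *
 * Illustrative usage from userspace, assuming the standard sysctl
 * names under /proc/sys/kernel/:
 *
 *	# echo 0   > /proc/sys/kernel/sched_util_clamp_min
 *	# echo 512 > /proc/sys/kernel/sched_util_clamp_max
 *
 * A write is rejected with -EINVAL when util_min > util_max or when
 * either value exceeds SCHED_CAPACITY_SCALE (1024).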
2041 */ 2042 return 0; 2043 2044 undo: 2045 sysctl_sched_uclamp_util_min = old_min; 2046 sysctl_sched_uclamp_util_max = old_max; 2047 sysctl_sched_uclamp_util_min_rt_default = old_min_rt; 2048 return result; 2049 } 2050 #endif /* CONFIG_SYSCTL */ 2051 2052 static void uclamp_fork(struct task_struct *p) 2053 { 2054 enum uclamp_id clamp_id; 2055 2056 /* 2057 * We don't need to hold task_rq_lock() when updating p->uclamp_* here 2058 * as the task is still at its early fork stages. 2059 */ 2060 for_each_clamp_id(clamp_id) 2061 p->uclamp[clamp_id].active = false; 2062 2063 if (likely(!p->sched_reset_on_fork)) 2064 return; 2065 2066 for_each_clamp_id(clamp_id) { 2067 uclamp_se_set(&p->uclamp_req[clamp_id], 2068 uclamp_none(clamp_id), false); 2069 } 2070 } 2071 2072 static void uclamp_post_fork(struct task_struct *p) 2073 { 2074 uclamp_update_util_min_rt_default(p); 2075 } 2076 2077 static void __init init_uclamp_rq(struct rq *rq) 2078 { 2079 enum uclamp_id clamp_id; 2080 struct uclamp_rq *uc_rq = rq->uclamp; 2081 2082 for_each_clamp_id(clamp_id) { 2083 uc_rq[clamp_id] = (struct uclamp_rq) { 2084 .value = uclamp_none(clamp_id) 2085 }; 2086 } 2087 2088 rq->uclamp_flags = UCLAMP_FLAG_IDLE; 2089 } 2090 2091 static void __init init_uclamp(void) 2092 { 2093 struct uclamp_se uc_max = {}; 2094 enum uclamp_id clamp_id; 2095 int cpu; 2096 2097 for_each_possible_cpu(cpu) 2098 init_uclamp_rq(cpu_rq(cpu)); 2099 2100 for_each_clamp_id(clamp_id) { 2101 uclamp_se_set(&init_task.uclamp_req[clamp_id], 2102 uclamp_none(clamp_id), false); 2103 } 2104 2105 /* System defaults allow max clamp values for both indexes */ 2106 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); 2107 for_each_clamp_id(clamp_id) { 2108 uclamp_default[clamp_id] = uc_max; 2109 #ifdef CONFIG_UCLAMP_TASK_GROUP 2110 root_task_group.uclamp_req[clamp_id] = uc_max; 2111 root_task_group.uclamp[clamp_id] = uc_max; 2112 #endif 2113 } 2114 } 2115 2116 #else /* !CONFIG_UCLAMP_TASK: */ 2117 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } 2118 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } 2119 static inline void uclamp_fork(struct task_struct *p) { } 2120 static inline void uclamp_post_fork(struct task_struct *p) { } 2121 static inline void init_uclamp(void) { } 2122 #endif /* !CONFIG_UCLAMP_TASK */ 2123 2124 bool sched_task_on_rq(struct task_struct *p) 2125 { 2126 return task_on_rq_queued(p); 2127 } 2128 2129 unsigned long get_wchan(struct task_struct *p) 2130 { 2131 unsigned long ip = 0; 2132 unsigned int state; 2133 2134 if (!p || p == current) 2135 return 0; 2136 2137 /* Only get wchan if task is blocked and we can keep it that way. */ 2138 raw_spin_lock_irq(&p->pi_lock); 2139 state = READ_ONCE(p->__state); 2140 smp_rmb(); /* see try_to_wake_up() */ 2141 if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) 2142 ip = __get_wchan(p); 2143 raw_spin_unlock_irq(&p->pi_lock); 2144 2145 return ip; 2146 } 2147 2148 void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 2149 { 2150 if (!(flags & ENQUEUE_NOCLOCK)) 2151 update_rq_clock(rq); 2152 2153 /* 2154 * Can be before ->enqueue_task() because uclamp considers the 2155 * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared 2156 * in ->enqueue_task(). 
2157 */ 2158 uclamp_rq_inc(rq, p, flags); 2159 2160 p->sched_class->enqueue_task(rq, p, flags); 2161 2162 psi_enqueue(p, flags); 2163 2164 if (!(flags & ENQUEUE_RESTORE)) 2165 sched_info_enqueue(rq, p); 2166 2167 if (sched_core_enabled(rq)) 2168 sched_core_enqueue(rq, p); 2169 } 2170 2171 /* 2172 * Must only return false when DEQUEUE_SLEEP. 2173 */ 2174 inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) 2175 { 2176 if (sched_core_enabled(rq)) 2177 sched_core_dequeue(rq, p, flags); 2178 2179 if (!(flags & DEQUEUE_NOCLOCK)) 2180 update_rq_clock(rq); 2181 2182 if (!(flags & DEQUEUE_SAVE)) 2183 sched_info_dequeue(rq, p); 2184 2185 psi_dequeue(p, flags); 2186 2187 /* 2188 * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' 2189 * and mark the task ->sched_delayed. 2190 */ 2191 uclamp_rq_dec(rq, p); 2192 return p->sched_class->dequeue_task(rq, p, flags); 2193 } 2194 2195 void activate_task(struct rq *rq, struct task_struct *p, int flags) 2196 { 2197 if (task_on_rq_migrating(p)) 2198 flags |= ENQUEUE_MIGRATED; 2199 2200 enqueue_task(rq, p, flags); 2201 2202 WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED); 2203 ASSERT_EXCLUSIVE_WRITER(p->on_rq); 2204 } 2205 2206 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 2207 { 2208 WARN_ON_ONCE(flags & DEQUEUE_SLEEP); 2209 2210 WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); 2211 ASSERT_EXCLUSIVE_WRITER(p->on_rq); 2212 2213 /* 2214 * Code explicitly relies on TASK_ON_RQ_MIGRATING being set *before* 2215 * dequeue_task() and cleared *after* enqueue_task(). 2216 */ 2217 2218 dequeue_task(rq, p, flags); 2219 } 2220 2221 static void block_task(struct rq *rq, struct task_struct *p, int flags) 2222 { 2223 if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) 2224 __block_task(rq, p); 2225 } 2226 2227 /** 2228 * task_curr - is this task currently executing on a CPU? 2229 * @p: the task in question. 2230 * 2231 * Return: 1 if the task is currently executing. 0 otherwise. 2232 */ 2233 inline int task_curr(const struct task_struct *p) 2234 { 2235 return cpu_curr(task_cpu(p)) == p; 2236 } 2237 2238 void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) 2239 { 2240 struct task_struct *donor = rq->donor; 2241 2242 if (p->sched_class == rq->next_class) { 2243 rq->next_class->wakeup_preempt(rq, p, flags); 2244 2245 } else if (sched_class_above(p->sched_class, rq->next_class)) { 2246 rq->next_class->wakeup_preempt(rq, p, flags); 2247 resched_curr(rq); 2248 rq->next_class = p->sched_class; 2249 } 2250 2251 /* 2252 * A queue event has occurred, and we're going to schedule. In 2253 * this case, we can save a useless back-to-back clock update. 2254 */ 2255 if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) 2256 rq_clock_skip_update(rq); 2257 } 2258 2259 static __always_inline 2260 int __task_state_match(struct task_struct *p, unsigned int state) 2261 { 2262 if (READ_ONCE(p->__state) & state) 2263 return 1; 2264 2265 if (READ_ONCE(p->saved_state) & state) 2266 return -1; 2267 2268 return 0; 2269 } 2270 2271 static __always_inline 2272 int task_state_match(struct task_struct *p, unsigned int state) 2273 { 2274 /* 2275 * Serialize against current_save_and_set_rtlock_wait_state(), 2276 * current_restore_rtlock_saved_state(), and __refrigerator(). 2277 */ 2278 guard(raw_spinlock_irq)(&p->pi_lock); 2279 return __task_state_match(p, state); 2280 } 2281 2282 /* 2283 * wait_task_inactive - wait for a thread to unschedule. 2284 * 2285 * Wait for the thread to block in any of the states set in @match_state.
2286 * If it changes, i.e. @p might have woken up, then return zero. When we 2287 * succeed in waiting for @p to be off its CPU, we return a positive number 2288 * (its total switch count). If a second call a short while later returns the 2289 * same number, the caller can be sure that @p has remained unscheduled the 2290 * whole time. 2291 * 2292 * The caller must ensure that the task *will* unschedule sometime soon, 2293 * else this function might spin for a *long* time. This function can't 2294 * be called with interrupts off, or it may introduce deadlock with 2295 * smp_call_function() if an IPI is sent by the same process we are 2296 * waiting to become inactive. 2297 */ 2298 unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) 2299 { 2300 int running, queued, match; 2301 struct rq_flags rf; 2302 unsigned long ncsw; 2303 struct rq *rq; 2304 2305 for (;;) { 2306 /* 2307 * We do the initial early heuristics without holding 2308 * any task-queue locks at all. We'll only try to get 2309 * the runqueue lock when things look like they will 2310 * work out! 2311 */ 2312 rq = task_rq(p); 2313 2314 /* 2315 * If the task is actively running on another CPU 2316 * still, just relax and busy-wait without holding 2317 * any locks. 2318 * 2319 * NOTE! Since we don't hold any locks, it's not 2320 * even sure that "rq" stays as the right runqueue! 2321 * But we don't care, since "task_on_cpu()" will 2322 * return false if the runqueue has changed and p 2323 * is actually now running somewhere else! 2324 */ 2325 while (task_on_cpu(rq, p)) { 2326 if (!task_state_match(p, match_state)) 2327 return 0; 2328 cpu_relax(); 2329 } 2330 2331 /* 2332 * Ok, time to look more closely! We need the rq 2333 * lock now, to be *sure*. If we're wrong, we'll 2334 * just go back and repeat. 2335 */ 2336 rq = task_rq_lock(p, &rf); 2337 /* 2338 * If task is sched_delayed, force dequeue it, to avoid always 2339 * hitting the tick timeout in the queued case 2340 */ 2341 if (p->se.sched_delayed) 2342 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 2343 trace_sched_wait_task(p); 2344 running = task_on_cpu(rq, p); 2345 queued = task_on_rq_queued(p); 2346 ncsw = 0; 2347 if ((match = __task_state_match(p, match_state))) { 2348 /* 2349 * When matching on p->saved_state, consider this task 2350 * still queued so it will wait. 2351 */ 2352 if (match < 0) 2353 queued = 1; 2354 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2355 } 2356 task_rq_unlock(rq, p, &rf); 2357 2358 /* 2359 * If it changed from the expected state, bail out now. 2360 */ 2361 if (unlikely(!ncsw)) 2362 break; 2363 2364 /* 2365 * Was it really running after all now that we 2366 * checked with the proper locks actually held? 2367 * 2368 * Oops. Go back and try again.. 2369 */ 2370 if (unlikely(running)) { 2371 cpu_relax(); 2372 continue; 2373 } 2374 2375 /* 2376 * It's not enough that it's not actively running, 2377 * it must be off the runqueue _entirely_, and not 2378 * preempted! 2379 * 2380 * So if it was still runnable (but just not actively 2381 * running right now), it's preempted, and we should 2382 * yield - it could be a while. 2383 */ 2384 if (unlikely(queued)) { 2385 ktime_t to = NSEC_PER_SEC / HZ; 2386 2387 set_current_state(TASK_UNINTERRUPTIBLE); 2388 schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); 2389 continue; 2390 } 2391 2392 /* 2393 * Ahh, all good. It wasn't running, and it wasn't 2394 * runnable, which means that it will never become 2395 * running in the future either. We're all done! 
2396 */ 2397 break; 2398 } 2399 2400 return ncsw; 2401 } 2402 2403 static void 2404 do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); 2405 2406 static void migrate_disable_switch(struct rq *rq, struct task_struct *p) 2407 { 2408 struct affinity_context ac = { 2409 .new_mask = cpumask_of(rq->cpu), 2410 .flags = SCA_MIGRATE_DISABLE, 2411 }; 2412 2413 if (likely(!p->migration_disabled)) 2414 return; 2415 2416 if (p->cpus_ptr != &p->cpus_mask) 2417 return; 2418 2419 scoped_guard (task_rq_lock, p) 2420 do_set_cpus_allowed(p, &ac); 2421 } 2422 2423 void ___migrate_enable(void) 2424 { 2425 struct task_struct *p = current; 2426 struct affinity_context ac = { 2427 .new_mask = &p->cpus_mask, 2428 .flags = SCA_MIGRATE_ENABLE, 2429 }; 2430 2431 __set_cpus_allowed_ptr(p, &ac); 2432 } 2433 EXPORT_SYMBOL_GPL(___migrate_enable); 2434 2435 void migrate_disable(void) 2436 { 2437 __migrate_disable(); 2438 } 2439 EXPORT_SYMBOL_GPL(migrate_disable); 2440 2441 void migrate_enable(void) 2442 { 2443 __migrate_enable(); 2444 } 2445 EXPORT_SYMBOL_GPL(migrate_enable); 2446 2447 static inline bool rq_has_pinned_tasks(struct rq *rq) 2448 { 2449 return rq->nr_pinned; 2450 } 2451 2452 /* 2453 * Per-CPU kthreads are allowed to run on !active && online CPUs, see 2454 * __set_cpus_allowed_ptr() and select_fallback_rq(). 2455 */ 2456 static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 2457 { 2458 /* When not in the task's cpumask, no point in looking further. */ 2459 if (!task_allowed_on_cpu(p, cpu)) 2460 return false; 2461 2462 /* migrate_disabled() must be allowed to finish. */ 2463 if (is_migration_disabled(p)) 2464 return cpu_online(cpu); 2465 2466 /* Non kernel threads are not allowed during either online or offline. */ 2467 if (!(p->flags & PF_KTHREAD)) 2468 return cpu_active(cpu); 2469 2470 /* KTHREAD_IS_PER_CPU is always allowed. */ 2471 if (kthread_is_per_cpu(p)) 2472 return cpu_online(cpu); 2473 2474 /* Regular kernel threads don't get to stay during offline. */ 2475 if (cpu_dying(cpu)) 2476 return false; 2477 2478 /* But are allowed during online. */ 2479 return cpu_online(cpu); 2480 } 2481 2482 /* 2483 * This is how migration works: 2484 * 2485 * 1) we invoke migration_cpu_stop() on the target CPU using 2486 * stop_one_cpu(). 2487 * 2) stopper starts to run (implicitly forcing the migrated thread 2488 * off the CPU) 2489 * 3) it checks whether the migrated task is still in the wrong runqueue. 2490 * 4) if it's in the wrong runqueue then the migration thread removes 2491 * it and puts it into the right queue. 2492 * 5) stopper completes and stop_one_cpu() returns and the migration 2493 * is done. 2494 */ 2495 2496 /* 2497 * move_queued_task - move a queued task to new rq. 2498 * 2499 * Returns (locked) new rq. Old rq's lock is released. 
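 *
 * Caller-side sketch (illustrative) of the locking contract:
 *
 *	rq = task_rq_lock(p, &rf);		// old rq locked
 *	rq = move_queued_task(rq, &rf, p, new_cpu);
 *	task_rq_unlock(rq, p, &rf);		// unlocks the *new* rq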
2500 */ 2501 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, 2502 struct task_struct *p, int new_cpu) 2503 __must_hold(__rq_lockp(rq)) 2504 { 2505 lockdep_assert_rq_held(rq); 2506 2507 deactivate_task(rq, p, DEQUEUE_NOCLOCK); 2508 set_task_cpu(p, new_cpu); 2509 rq_unlock(rq, rf); 2510 2511 rq = cpu_rq(new_cpu); 2512 2513 rq_lock(rq, rf); 2514 WARN_ON_ONCE(task_cpu(p) != new_cpu); 2515 activate_task(rq, p, 0); 2516 wakeup_preempt(rq, p, 0); 2517 2518 return rq; 2519 } 2520 2521 struct migration_arg { 2522 struct task_struct *task; 2523 int dest_cpu; 2524 struct set_affinity_pending *pending; 2525 }; 2526 2527 /* 2528 * @refs: number of wait_for_completion() 2529 * @stop_pending: is @stop_work in use 2530 */ 2531 struct set_affinity_pending { 2532 refcount_t refs; 2533 unsigned int stop_pending; 2534 struct completion done; 2535 struct cpu_stop_work stop_work; 2536 struct migration_arg arg; 2537 }; 2538 2539 /* 2540 * Move (not current) task off this CPU, onto the destination CPU. We're doing 2541 * this because either it can't run here any more (set_cpus_allowed() 2542 * away from this CPU, or CPU going down), or because we're 2543 * attempting to rebalance this task on exec (sched_exec). 2544 * 2545 * So we race with normal scheduler movements, but that's OK, as long 2546 * as the task is no longer on this CPU. 2547 */ 2548 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, 2549 struct task_struct *p, int dest_cpu) 2550 __must_hold(__rq_lockp(rq)) 2551 { 2552 /* Affinity changed (again). */ 2553 if (!is_cpu_allowed(p, dest_cpu)) 2554 return rq; 2555 2556 rq = move_queued_task(rq, rf, p, dest_cpu); 2557 2558 return rq; 2559 } 2560 2561 /* 2562 * migration_cpu_stop - this will be executed by a high-prio stopper thread 2563 * and performs thread migration by bumping thread off CPU then 2564 * 'pushing' onto another runqueue. 2565 */ 2566 static int migration_cpu_stop(void *data) 2567 { 2568 struct migration_arg *arg = data; 2569 struct set_affinity_pending *pending = arg->pending; 2570 struct task_struct *p = arg->task; 2571 struct rq *rq = this_rq(); 2572 bool complete = false; 2573 struct rq_flags rf; 2574 2575 /* 2576 * The original target CPU might have gone down and we might 2577 * be on another CPU but it doesn't matter. 2578 */ 2579 local_irq_save(rf.flags); 2580 /* 2581 * We need to explicitly wake pending tasks before running 2582 * __migrate_task() such that we will not miss enforcing cpus_ptr 2583 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 2584 */ 2585 flush_smp_call_function_queue(); 2586 2587 /* 2588 * We may change the underlying rq, but the locks held will 2589 * appropriately be "transferred" when switching. 2590 */ 2591 context_unsafe_alias(rq); 2592 2593 raw_spin_lock(&p->pi_lock); 2594 rq_lock(rq, &rf); 2595 2596 /* 2597 * If we were passed a pending, then ->stop_pending was set, thus 2598 * p->migration_pending must have remained stable. 2599 */ 2600 WARN_ON_ONCE(pending && pending != p->migration_pending); 2601 2602 /* 2603 * If task_rq(p) != rq, it cannot be migrated here, because we're 2604 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 2605 * we're holding p->pi_lock. 
2606 */ 2607 if (task_rq(p) == rq) { 2608 if (is_migration_disabled(p)) 2609 goto out; 2610 2611 if (pending) { 2612 p->migration_pending = NULL; 2613 complete = true; 2614 2615 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) 2616 goto out; 2617 } 2618 2619 if (task_on_rq_queued(p)) { 2620 update_rq_clock(rq); 2621 rq = __migrate_task(rq, &rf, p, arg->dest_cpu); 2622 } else { 2623 p->wake_cpu = arg->dest_cpu; 2624 } 2625 2626 /* 2627 * XXX __migrate_task() can fail, at which point we might end 2628 * up running on a dodgy CPU, AFAICT this can only happen 2629 * during CPU hotplug, at which point we'll get pushed out 2630 * anyway, so it's probably not a big deal. 2631 */ 2632 2633 } else if (pending) { 2634 /* 2635 * This happens when we get migrated between migrate_enable()'s 2636 * preempt_enable() and scheduling the stopper task. At that 2637 * point we're a regular task again and not current anymore. 2638 * 2639 * A !PREEMPT kernel has a giant hole here, which makes it far 2640 * more likely. 2641 */ 2642 2643 /* 2644 * The task moved before the stopper got to run. We're holding 2645 * ->pi_lock, so the allowed mask is stable - if it got 2646 * somewhere allowed, we're done. 2647 */ 2648 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { 2649 p->migration_pending = NULL; 2650 complete = true; 2651 goto out; 2652 } 2653 2654 /* 2655 * When migrate_enable() hits a rq mis-match we can't reliably 2656 * determine is_migration_disabled() and so have to chase after 2657 * it. 2658 */ 2659 WARN_ON_ONCE(!pending->stop_pending); 2660 preempt_disable(); 2661 rq_unlock(rq, &rf); 2662 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2663 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, 2664 &pending->arg, &pending->stop_work); 2665 preempt_enable(); 2666 return 0; 2667 } 2668 out: 2669 if (pending) 2670 pending->stop_pending = false; 2671 rq_unlock(rq, &rf); 2672 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2673 2674 if (complete) 2675 complete_all(&pending->done); 2676 2677 return 0; 2678 } 2679 2680 int push_cpu_stop(void *arg) 2681 { 2682 struct rq *lowest_rq = NULL, *rq = this_rq(); 2683 struct task_struct *p = arg; 2684 2685 raw_spin_lock_irq(&p->pi_lock); 2686 raw_spin_rq_lock(rq); 2687 2688 if (task_rq(p) != rq) 2689 goto out_unlock; 2690 2691 if (is_migration_disabled(p)) { 2692 p->migration_flags |= MDF_PUSH; 2693 goto out_unlock; 2694 } 2695 2696 p->migration_flags &= ~MDF_PUSH; 2697 2698 if (p->sched_class->find_lock_rq) 2699 lowest_rq = p->sched_class->find_lock_rq(p, rq); 2700 2701 if (!lowest_rq) 2702 goto out_unlock; 2703 2704 lockdep_assert_rq_held(lowest_rq); 2705 2706 // XXX validate p is still the highest prio task 2707 if (task_rq(p) == rq) { 2708 move_queued_task_locked(rq, lowest_rq, p); 2709 resched_curr(lowest_rq); 2710 } 2711 2712 double_unlock_balance(rq, lowest_rq); 2713 2714 out_unlock: 2715 rq->push_busy = false; 2716 raw_spin_rq_unlock(rq); 2717 raw_spin_unlock_irq(&p->pi_lock); 2718 2719 put_task_struct(p); 2720 return 0; 2721 } 2722 2723 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); 2724 2725 /* 2726 * sched_class::set_cpus_allowed must do the below, but is not required to 2727 * actually call this function. 
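 *
 * A sched_class needing extra bookkeeping would wrap it along these
 * lines (sketch; my_set_cpus_allowed() and the bookkeeping helper are
 * hypothetical):
 *
 *	static void my_set_cpus_allowed(struct task_struct *p,
 *					struct affinity_context *ctx)
 *	{
 *		my_update_pushable_state(p, ctx);	// hypothetical
 *		set_cpus_allowed_common(p, ctx);
 *	}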
2728 */ 2729 void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx) 2730 { 2731 if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { 2732 p->cpus_ptr = ctx->new_mask; 2733 return; 2734 } 2735 2736 cpumask_copy(&p->cpus_mask, ctx->new_mask); 2737 p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); 2738 mm_update_cpus_allowed(p->mm, ctx->new_mask); 2739 2740 /* 2741 * Swap in a new user_cpus_ptr if SCA_USER flag set 2742 */ 2743 if (ctx->flags & SCA_USER) 2744 swap(p->user_cpus_ptr, ctx->user_mask); 2745 } 2746 2747 static void 2748 do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2749 { 2750 scoped_guard (sched_change, p, DEQUEUE_SAVE) 2751 p->sched_class->set_cpus_allowed(p, ctx); 2752 } 2753 2754 /* 2755 * Used for kthread_bind() and select_fallback_rq(), in both cases the user 2756 * affinity (if any) should be destroyed too. 2757 */ 2758 void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) 2759 { 2760 struct affinity_context ac = { 2761 .new_mask = new_mask, 2762 .user_mask = NULL, 2763 .flags = SCA_USER, /* clear the user requested mask */ 2764 }; 2765 union cpumask_rcuhead { 2766 cpumask_t cpumask; 2767 struct rcu_head rcu; 2768 }; 2769 2770 scoped_guard (__task_rq_lock, p) 2771 do_set_cpus_allowed(p, &ac); 2772 2773 /* 2774 * Because this is called with p->pi_lock held, it is not possible 2775 * to use kfree() here (when PREEMPT_RT=y), therefore punt to using 2776 * kfree_rcu(). 2777 */ 2778 kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); 2779 } 2780 2781 int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, 2782 int node) 2783 { 2784 cpumask_t *user_mask; 2785 unsigned long flags; 2786 2787 /* 2788 * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's 2789 * may differ by now due to racing. 2790 */ 2791 dst->user_cpus_ptr = NULL; 2792 2793 /* 2794 * This check is racy and losing the race is a valid situation. 2795 * It is not worth the extra overhead of taking the pi_lock on 2796 * every fork/clone. 2797 */ 2798 if (data_race(!src->user_cpus_ptr)) 2799 return 0; 2800 2801 user_mask = alloc_user_cpus_ptr(node); 2802 if (!user_mask) 2803 return -ENOMEM; 2804 2805 /* 2806 * Use pi_lock to protect content of user_cpus_ptr 2807 * 2808 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent 2809 * set_cpus_allowed_force(). 2810 */ 2811 raw_spin_lock_irqsave(&src->pi_lock, flags); 2812 if (src->user_cpus_ptr) { 2813 swap(dst->user_cpus_ptr, user_mask); 2814 cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); 2815 } 2816 raw_spin_unlock_irqrestore(&src->pi_lock, flags); 2817 2818 if (unlikely(user_mask)) 2819 kfree(user_mask); 2820 2821 return 0; 2822 } 2823 2824 static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) 2825 { 2826 struct cpumask *user_mask = NULL; 2827 2828 swap(p->user_cpus_ptr, user_mask); 2829 2830 return user_mask; 2831 } 2832 2833 void release_user_cpus_ptr(struct task_struct *p) 2834 { 2835 kfree(clear_user_cpus_ptr(p)); 2836 } 2837 2838 /* 2839 * This function is wildly self concurrent; here be dragons. 2840 * 2841 * 2842 * When given a valid mask, __set_cpus_allowed_ptr() must block until the 2843 * designated task is enqueued on an allowed CPU. If that task is currently 2844 * running, we have to kick it out using the CPU stopper. 2845 * 2846 * Migrate-Disable comes along and tramples all over our nice sandcastle. 
2847 * Consider: 2848 * 2849 * Initial conditions: P0->cpus_mask = [0, 1] 2850 * 2851 * P0@CPU0 P1 2852 * 2853 * migrate_disable(); 2854 * <preempted> 2855 * set_cpus_allowed_ptr(P0, [1]); 2856 * 2857 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes 2858 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). 2859 * This means we need the following scheme: 2860 * 2861 * P0@CPU0 P1 2862 * 2863 * migrate_disable(); 2864 * <preempted> 2865 * set_cpus_allowed_ptr(P0, [1]); 2866 * <blocks> 2867 * <resumes> 2868 * migrate_enable(); 2869 * __set_cpus_allowed_ptr(); 2870 * <wakes local stopper> 2871 * `--> <woken on migration completion> 2872 * 2873 * Now the fun stuff: there may be several P1-like tasks, i.e. multiple 2874 * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any 2875 * task p are serialized by p->pi_lock, which we can leverage: the one that 2876 * should come into effect at the end of the Migrate-Disable region is the last 2877 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), 2878 * but we still need to properly signal those waiting tasks at the appropriate 2879 * moment. 2880 * 2881 * This is implemented using struct set_affinity_pending. The first 2882 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will 2883 * setup an instance of that struct and install it on the targeted task_struct. 2884 * Any and all further callers will reuse that instance. Those then wait for 2885 * a completion signaled at the tail of the CPU stopper callback (1), triggered 2886 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). 2887 * 2888 * 2889 * (1) In the cases covered above. There is one more where the completion is 2890 * signaled within affine_move_task() itself: when a subsequent affinity request 2891 * occurs after the stopper bailed out due to the targeted task still being 2892 * Migrate-Disable. Consider: 2893 * 2894 * Initial conditions: P0->cpus_mask = [0, 1] 2895 * 2896 * CPU0 P1 P2 2897 * <P0> 2898 * migrate_disable(); 2899 * <preempted> 2900 * set_cpus_allowed_ptr(P0, [1]); 2901 * <blocks> 2902 * <migration/0> 2903 * migration_cpu_stop() 2904 * is_migration_disabled() 2905 * <bails> 2906 * set_cpus_allowed_ptr(P0, [0, 1]); 2907 * <signal completion> 2908 * <awakes> 2909 * 2910 * Note that the above is safe vs a concurrent migrate_enable(), as any 2911 * pending affinity completion is preceded by an uninstallation of 2912 * p->migration_pending done with p->pi_lock held. 2913 */ 2914 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, 2915 int dest_cpu, unsigned int flags) 2916 __releases(__rq_lockp(rq), &p->pi_lock) 2917 { 2918 struct set_affinity_pending my_pending = { }, *pending = NULL; 2919 bool stop_pending, complete = false; 2920 2921 /* 2922 * Can the task run on the task's current CPU? 
If so, we're done 2923 * 2924 * We are also done if the task is the current donor, boosting a lock- 2925 * holding proxy, (and potentially has been migrated outside its 2926 * current or previous affinity mask) 2927 */ 2928 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || 2929 (task_current_donor(rq, p) && !task_current(rq, p))) { 2930 struct task_struct *push_task = NULL; 2931 2932 if ((flags & SCA_MIGRATE_ENABLE) && 2933 (p->migration_flags & MDF_PUSH) && !rq->push_busy) { 2934 rq->push_busy = true; 2935 push_task = get_task_struct(p); 2936 } 2937 2938 /* 2939 * If there are pending waiters, but no pending stop_work, 2940 * then complete now. 2941 */ 2942 pending = p->migration_pending; 2943 if (pending && !pending->stop_pending) { 2944 p->migration_pending = NULL; 2945 complete = true; 2946 } 2947 2948 preempt_disable(); 2949 task_rq_unlock(rq, p, rf); 2950 if (push_task) { 2951 stop_one_cpu_nowait(rq->cpu, push_cpu_stop, 2952 p, &rq->push_work); 2953 } 2954 preempt_enable(); 2955 2956 if (complete) 2957 complete_all(&pending->done); 2958 2959 return 0; 2960 } 2961 2962 if (!(flags & SCA_MIGRATE_ENABLE)) { 2963 /* serialized by p->pi_lock */ 2964 if (!p->migration_pending) { 2965 /* Install the request */ 2966 refcount_set(&my_pending.refs, 1); 2967 init_completion(&my_pending.done); 2968 my_pending.arg = (struct migration_arg) { 2969 .task = p, 2970 .dest_cpu = dest_cpu, 2971 .pending = &my_pending, 2972 }; 2973 2974 p->migration_pending = &my_pending; 2975 } else { 2976 pending = p->migration_pending; 2977 refcount_inc(&pending->refs); 2978 /* 2979 * Affinity has changed, but we've already installed a 2980 * pending. migration_cpu_stop() *must* see this, else 2981 * we risk a completion of the pending despite having a 2982 * task on a disallowed CPU. 2983 * 2984 * Serialized by p->pi_lock, so this is safe. 2985 */ 2986 pending->arg.dest_cpu = dest_cpu; 2987 } 2988 } 2989 pending = p->migration_pending; 2990 /* 2991 * - !MIGRATE_ENABLE: 2992 * we'll have installed a pending if there wasn't one already. 2993 * 2994 * - MIGRATE_ENABLE: 2995 * we're here because the current CPU isn't matching anymore, 2996 * the only way that can happen is because of a concurrent 2997 * set_cpus_allowed_ptr() call, which should then still be 2998 * pending completion. 2999 * 3000 * Either way, we really should have a @pending here. 3001 */ 3002 if (WARN_ON_ONCE(!pending)) { 3003 task_rq_unlock(rq, p, rf); 3004 return -EINVAL; 3005 } 3006 3007 if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { 3008 /* 3009 * MIGRATE_ENABLE gets here because 'p == current', but for 3010 * anything else we cannot do is_migration_disabled(), punt 3011 * and have the stopper function handle it all race-free. 
3012 */ 3013 stop_pending = pending->stop_pending; 3014 if (!stop_pending) 3015 pending->stop_pending = true; 3016 3017 if (flags & SCA_MIGRATE_ENABLE) 3018 p->migration_flags &= ~MDF_PUSH; 3019 3020 preempt_disable(); 3021 task_rq_unlock(rq, p, rf); 3022 if (!stop_pending) { 3023 stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, 3024 &pending->arg, &pending->stop_work); 3025 } 3026 preempt_enable(); 3027 3028 if (flags & SCA_MIGRATE_ENABLE) 3029 return 0; 3030 } else { 3031 3032 if (!is_migration_disabled(p)) { 3033 if (task_on_rq_queued(p)) 3034 rq = move_queued_task(rq, rf, p, dest_cpu); 3035 3036 if (!pending->stop_pending) { 3037 p->migration_pending = NULL; 3038 complete = true; 3039 } 3040 } 3041 task_rq_unlock(rq, p, rf); 3042 3043 if (complete) 3044 complete_all(&pending->done); 3045 } 3046 3047 wait_for_completion(&pending->done); 3048 3049 if (refcount_dec_and_test(&pending->refs)) 3050 wake_up_var(&pending->refs); /* No UaF, just an address */ 3051 3052 /* 3053 * Block the original owner of &pending until all subsequent callers 3054 * have seen the completion and decremented the refcount 3055 */ 3056 wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); 3057 3058 /* ARGH */ 3059 WARN_ON_ONCE(my_pending.stop_pending); 3060 3061 return 0; 3062 } 3063 3064 /* 3065 * Called with both p->pi_lock and rq->lock held; drops both before returning. 3066 */ 3067 static int __set_cpus_allowed_ptr_locked(struct task_struct *p, 3068 struct affinity_context *ctx, 3069 struct rq *rq, 3070 struct rq_flags *rf) 3071 __releases(__rq_lockp(rq), &p->pi_lock) 3072 { 3073 const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); 3074 const struct cpumask *cpu_valid_mask = cpu_active_mask; 3075 bool kthread = p->flags & PF_KTHREAD; 3076 unsigned int dest_cpu; 3077 int ret = 0; 3078 3079 if (kthread || is_migration_disabled(p)) { 3080 /* 3081 * Kernel threads are allowed on online && !active CPUs, 3082 * however, during cpu-hot-unplug, even these might get pushed 3083 * away if not KTHREAD_IS_PER_CPU. 3084 * 3085 * Specifically, migration_disabled() tasks must not fail the 3086 * cpumask_any_and_distribute() pick below, esp. so on 3087 * SCA_MIGRATE_ENABLE, otherwise we'll not call 3088 * set_cpus_allowed_common() and actually reset p->cpus_ptr. 3089 */ 3090 cpu_valid_mask = cpu_online_mask; 3091 } 3092 3093 if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) { 3094 ret = -EINVAL; 3095 goto out; 3096 } 3097 3098 /* 3099 * Must re-check here, to close a race against __kthread_bind(), 3100 * sched_setaffinity() is not guaranteed to observe the flag. 3101 */ 3102 if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { 3103 ret = -EINVAL; 3104 goto out; 3105 } 3106 3107 if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { 3108 if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) { 3109 if (ctx->flags & SCA_USER) 3110 swap(p->user_cpus_ptr, ctx->user_mask); 3111 goto out; 3112 } 3113 3114 if (WARN_ON_ONCE(p == current && 3115 is_migration_disabled(p) && 3116 !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) { 3117 ret = -EBUSY; 3118 goto out; 3119 } 3120 } 3121 3122 /* 3123 * Picking a ~random cpu helps in cases where we are changing affinity 3124 * for groups of tasks (ie. cpuset), so that load balancing is not 3125 * immediately required to distribute the tasks within their new mask. 
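 *
 * E.g. (illustrative): when a cpuset moves eight tasks from CPUs 0-3
 * to CPUs 4-7, successive cpumask_any_and_distribute() calls spread
 * the picks over 4,5,6,7,4,... instead of dumping every task on CPU 4
 * and leaving the cleanup to the load balancer.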
3126 */ 3127 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask); 3128 if (dest_cpu >= nr_cpu_ids) { 3129 ret = -EINVAL; 3130 goto out; 3131 } 3132 3133 do_set_cpus_allowed(p, ctx); 3134 3135 return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); 3136 3137 out: 3138 task_rq_unlock(rq, p, rf); 3139 3140 return ret; 3141 } 3142 3143 /* 3144 * Change a given task's CPU affinity. Migrate the thread to a 3145 * proper CPU and schedule it away if the CPU it's executing on 3146 * is removed from the allowed bitmask. 3147 * 3148 * NOTE: the caller must have a valid reference to the task, the 3149 * task must not exit() & deallocate itself prematurely. The 3150 * call is not atomic; no spinlocks may be held. 3151 */ 3152 int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx) 3153 { 3154 struct rq_flags rf; 3155 struct rq *rq; 3156 3157 rq = task_rq_lock(p, &rf); 3158 /* 3159 * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_* 3160 * flags are set. 3161 */ 3162 if (p->user_cpus_ptr && 3163 !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) && 3164 cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr)) 3165 ctx->new_mask = rq->scratch_mask; 3166 3167 return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf); 3168 } 3169 3170 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 3171 { 3172 struct affinity_context ac = { 3173 .new_mask = new_mask, 3174 .flags = 0, 3175 }; 3176 3177 return __set_cpus_allowed_ptr(p, &ac); 3178 } 3179 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 3180 3181 /* 3182 * Change a given task's CPU affinity to the intersection of its current 3183 * affinity mask and @subset_mask, writing the resulting mask to @new_mask. 3184 * If user_cpus_ptr is defined, use it as the basis for restricting CPU 3185 * affinity or use cpu_online_mask instead. 3186 * 3187 * If the resulting mask is empty, leave the affinity unchanged and return 3188 * -EINVAL. 3189 */ 3190 static int restrict_cpus_allowed_ptr(struct task_struct *p, 3191 struct cpumask *new_mask, 3192 const struct cpumask *subset_mask) 3193 { 3194 struct affinity_context ac = { 3195 .new_mask = new_mask, 3196 .flags = 0, 3197 }; 3198 struct rq_flags rf; 3199 struct rq *rq; 3200 int err; 3201 3202 rq = task_rq_lock(p, &rf); 3203 3204 /* 3205 * Forcefully restricting the affinity of a deadline task is 3206 * likely to cause problems, so fail and noisily override the 3207 * mask entirely. 3208 */ 3209 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 3210 err = -EPERM; 3211 goto err_unlock; 3212 } 3213 3214 if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) { 3215 err = -EINVAL; 3216 goto err_unlock; 3217 } 3218 3219 return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf); 3220 3221 err_unlock: 3222 task_rq_unlock(rq, p, &rf); 3223 return err; 3224 } 3225 3226 /* 3227 * Restrict the CPU affinity of task @p so that it is a subset of 3228 * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the 3229 * old affinity mask. If the resulting mask is empty, we warn and walk 3230 * up the cpuset hierarchy until we find a suitable mask. 
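 *
 * Sketch of the intended pairing (illustrative; e.g. arm64 systems
 * where only a subset of CPUs can run 32-bit tasks):
 *
 *	force_compatible_cpus_allowed_ptr(p);	// on exec of such a task
 *	...
 *	relax_compatible_cpus_allowed_ptr(p);	// once the restriction ends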
3231 */ 3232 void force_compatible_cpus_allowed_ptr(struct task_struct *p) 3233 { 3234 cpumask_var_t new_mask; 3235 const struct cpumask *override_mask = task_cpu_possible_mask(p); 3236 3237 alloc_cpumask_var(&new_mask, GFP_KERNEL); 3238 3239 /* 3240 * __migrate_task() can fail silently in the face of concurrent 3241 * offlining of the chosen destination CPU, so take the hotplug 3242 * lock to ensure that the migration succeeds. 3243 */ 3244 cpus_read_lock(); 3245 if (!cpumask_available(new_mask)) 3246 goto out_set_mask; 3247 3248 if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) 3249 goto out_free_mask; 3250 3251 /* 3252 * We failed to find a valid subset of the affinity mask for the 3253 * task, so override it based on its cpuset hierarchy. 3254 */ 3255 cpuset_cpus_allowed(p, new_mask); 3256 override_mask = new_mask; 3257 3258 out_set_mask: 3259 if (printk_ratelimit()) { 3260 printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", 3261 task_pid_nr(p), p->comm, 3262 cpumask_pr_args(override_mask)); 3263 } 3264 3265 WARN_ON(set_cpus_allowed_ptr(p, override_mask)); 3266 out_free_mask: 3267 cpus_read_unlock(); 3268 free_cpumask_var(new_mask); 3269 } 3270 3271 /* 3272 * Restore the affinity of a task @p which was previously restricted by a 3273 * call to force_compatible_cpus_allowed_ptr(). 3274 * 3275 * It is the caller's responsibility to serialise this with any calls to 3276 * force_compatible_cpus_allowed_ptr(@p). 3277 */ 3278 void relax_compatible_cpus_allowed_ptr(struct task_struct *p) 3279 { 3280 struct affinity_context ac = { 3281 .new_mask = task_user_cpus(p), 3282 .flags = 0, 3283 }; 3284 int ret; 3285 3286 /* 3287 * Try to restore the old affinity mask with __sched_setaffinity(). 3288 * Cpuset masking will be done there too. 3289 */ 3290 ret = __sched_setaffinity(p, &ac); 3291 WARN_ON_ONCE(ret); 3292 } 3293 3294 #ifdef CONFIG_SMP 3295 3296 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 3297 { 3298 unsigned int state = READ_ONCE(p->__state); 3299 3300 /* 3301 * We should never call set_task_cpu() on a blocked task, 3302 * ttwu() will sort out the placement. 3303 */ 3304 WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); 3305 3306 /* 3307 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, 3308 * because schedstat_wait_{start,end} rebase migrating task's wait_start 3309 * time relying on p->on_rq. 3310 */ 3311 WARN_ON_ONCE(state == TASK_RUNNING && 3312 p->sched_class == &fair_sched_class && 3313 (p->on_rq && !task_on_rq_migrating(p))); 3314 3315 #ifdef CONFIG_LOCKDEP 3316 /* 3317 * The caller should hold either p->pi_lock or rq->lock, when changing 3318 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 3319 * 3320 * sched_move_task() holds both and thus holding either pins the cgroup, 3321 * see task_group(). 3322 * 3323 * Furthermore, all task_rq users should acquire both locks, see 3324 * task_rq_lock(). 3325 */ 3326 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 3327 lockdep_is_held(__rq_lockp(task_rq(p))))); 3328 #endif 3329 /* 3330 * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
3331 */ 3332 WARN_ON_ONCE(!cpu_online(new_cpu)); 3333 3334 WARN_ON_ONCE(is_migration_disabled(p)); 3335 3336 trace_sched_migrate_task(p, new_cpu); 3337 3338 if (task_cpu(p) != new_cpu) { 3339 if (p->sched_class->migrate_task_rq) 3340 p->sched_class->migrate_task_rq(p, new_cpu); 3341 p->se.nr_migrations++; 3342 perf_event_task_migrate(p); 3343 } 3344 3345 __set_task_cpu(p, new_cpu); 3346 } 3347 #endif /* CONFIG_SMP */ 3348 3349 #ifdef CONFIG_NUMA_BALANCING 3350 static void __migrate_swap_task(struct task_struct *p, int cpu) 3351 { 3352 if (task_on_rq_queued(p)) { 3353 struct rq *src_rq, *dst_rq; 3354 struct rq_flags srf, drf; 3355 3356 src_rq = task_rq(p); 3357 dst_rq = cpu_rq(cpu); 3358 3359 rq_pin_lock(src_rq, &srf); 3360 rq_pin_lock(dst_rq, &drf); 3361 3362 move_queued_task_locked(src_rq, dst_rq, p); 3363 wakeup_preempt(dst_rq, p, 0); 3364 3365 rq_unpin_lock(dst_rq, &drf); 3366 rq_unpin_lock(src_rq, &srf); 3367 3368 } else { 3369 /* 3370 * Task isn't running anymore; make it appear like we migrated 3371 * it before it went to sleep. This means on wakeup we make the 3372 * previous CPU our target instead of where it really is. 3373 */ 3374 p->wake_cpu = cpu; 3375 } 3376 } 3377 3378 struct migration_swap_arg { 3379 struct task_struct *src_task, *dst_task; 3380 int src_cpu, dst_cpu; 3381 }; 3382 3383 static int migrate_swap_stop(void *data) 3384 { 3385 struct migration_swap_arg *arg = data; 3386 struct rq *src_rq, *dst_rq; 3387 3388 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) 3389 return -EAGAIN; 3390 3391 src_rq = cpu_rq(arg->src_cpu); 3392 dst_rq = cpu_rq(arg->dst_cpu); 3393 3394 guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); 3395 guard(double_rq_lock)(src_rq, dst_rq); 3396 3397 if (task_cpu(arg->dst_task) != arg->dst_cpu) 3398 return -EAGAIN; 3399 3400 if (task_cpu(arg->src_task) != arg->src_cpu) 3401 return -EAGAIN; 3402 3403 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) 3404 return -EAGAIN; 3405 3406 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) 3407 return -EAGAIN; 3408 3409 __migrate_swap_task(arg->src_task, arg->dst_cpu); 3410 __migrate_swap_task(arg->dst_task, arg->src_cpu); 3411 3412 return 0; 3413 } 3414 3415 /* 3416 * Cross migrate two tasks 3417 */ 3418 int migrate_swap(struct task_struct *cur, struct task_struct *p, 3419 int target_cpu, int curr_cpu) 3420 { 3421 struct migration_swap_arg arg; 3422 int ret = -EINVAL; 3423 3424 arg = (struct migration_swap_arg){ 3425 .src_task = cur, 3426 .src_cpu = curr_cpu, 3427 .dst_task = p, 3428 .dst_cpu = target_cpu, 3429 }; 3430 3431 if (arg.src_cpu == arg.dst_cpu) 3432 goto out; 3433 3434 /* 3435 * These three tests are all lockless; this is OK since all of them 3436 * will be re-checked with proper locks held further down the line. 3437 */ 3438 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 3439 goto out; 3440 3441 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) 3442 goto out; 3443 3444 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) 3445 goto out; 3446 3447 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 3448 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 3449 3450 out: 3451 return ret; 3452 } 3453 #endif /* CONFIG_NUMA_BALANCING */ 3454 3455 /*** 3456 * kick_process - kick a running thread to enter/exit the kernel 3457 * @p: the to-be-kicked thread 3458 * 3459 * Cause a process which is running on another CPU to enter 3460 * kernel-mode, without any delay. (to get signals handled.) 
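 *
 * Typical usage, mirroring what signal delivery does (sketch):
 *
 *	set_tsk_thread_flag(p, TIF_SIGPENDING);
 *	kick_process(p);	// IPI makes the remote CPU re-check the flag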
3461 * 3462 * NOTE: this function doesn't have to take the runqueue lock, 3463 * because all it wants to ensure is that the remote task enters 3464 * the kernel. If the IPI races and the task has been migrated 3465 * to another CPU then no harm is done and the purpose has been 3466 * achieved as well. 3467 */ 3468 void kick_process(struct task_struct *p) 3469 { 3470 guard(preempt)(); 3471 int cpu = task_cpu(p); 3472 3473 if ((cpu != smp_processor_id()) && task_curr(p)) 3474 smp_send_reschedule(cpu); 3475 } 3476 EXPORT_SYMBOL_GPL(kick_process); 3477 3478 /* 3479 * ->cpus_ptr is protected by both rq->lock and p->pi_lock 3480 * 3481 * A few notes on cpu_active vs cpu_online: 3482 * 3483 * - cpu_active must be a subset of cpu_online 3484 * 3485 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, 3486 * see __set_cpus_allowed_ptr(). At this point the newly online 3487 * CPU isn't yet part of the sched domains, and balancing will not 3488 * see it. 3489 * 3490 * - on CPU-down we clear cpu_active() to mask the sched domains and 3491 * prevent the load balancer from placing new tasks on the to-be-removed 3492 * CPU. Existing tasks will remain running there and will be taken 3493 * off. 3494 * 3495 * This means that fallback selection must not select !active CPUs, 3496 * and can assume that any active CPU must be online. Conversely, 3497 * select_task_rq() below may allow selection of !active CPUs in order 3498 * to satisfy the above rules. 3499 */ 3500 static int select_fallback_rq(int cpu, struct task_struct *p) 3501 { 3502 int nid = cpu_to_node(cpu); 3503 const struct cpumask *nodemask = NULL; 3504 enum { cpuset, possible, fail } state = cpuset; 3505 int dest_cpu; 3506 3507 /* 3508 * If the node that the CPU is on has been offlined, cpu_to_node() 3509 * will return -1. There is no CPU on the node, so we should 3510 * select a CPU on another node. 3511 */ 3512 if (nid != -1) { 3513 nodemask = cpumask_of_node(nid); 3514 3515 /* Look for allowed, online CPU in same node. */ 3516 for_each_cpu(dest_cpu, nodemask) { 3517 if (is_cpu_allowed(p, dest_cpu)) 3518 return dest_cpu; 3519 } 3520 } 3521 3522 for (;;) { 3523 /* Any allowed, online CPU? */ 3524 for_each_cpu(dest_cpu, p->cpus_ptr) { 3525 if (!is_cpu_allowed(p, dest_cpu)) 3526 continue; 3527 3528 goto out; 3529 } 3530 3531 /* No more Mr. Nice Guy. */ 3532 switch (state) { 3533 case cpuset: 3534 if (cpuset_cpus_allowed_fallback(p)) { 3535 state = possible; 3536 break; 3537 } 3538 fallthrough; 3539 case possible: 3540 set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); 3541 state = fail; 3542 break; 3543 case fail: 3544 BUG(); 3545 break; 3546 } 3547 } 3548 3549 out: 3550 if (state != cpuset) { 3551 /* 3552 * Don't tell them about moving exiting tasks or 3553 * kernel threads (both mm NULL), since they never 3554 * leave the kernel. 3555 */ 3556 if (p->mm && printk_ratelimit()) { 3557 printk_deferred("process %d (%s) no longer affine to cpu%d\n", 3558 task_pid_nr(p), p->comm, cpu); 3559 } 3560 } 3561 3562 return dest_cpu; 3563 } 3564 3565 /* 3566 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3567 */ 3568 static inline 3569 int select_task_rq(struct task_struct *p, int cpu, int *wake_flags) 3570 { 3571 lockdep_assert_held(&p->pi_lock); 3572 3573 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) { 3574 cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags); 3575 *wake_flags |= WF_RQ_SELECTED; 3576 } else { 3577 cpu = cpumask_any(p->cpus_ptr); 3578 } 3579 3580 /* 3581 * In order not to call set_task_cpu() on a blocking task we need 3582 * to rely on ttwu() to place the task on a valid ->cpus_ptr 3583 * CPU. 3584 * 3585 * Since this is common to all placement strategies, this lives here. 3586 * 3587 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 3588 * not worry about this generic constraint ] 3589 */ 3590 if (unlikely(!is_cpu_allowed(p, cpu))) 3591 cpu = select_fallback_rq(task_cpu(p), p); 3592 3593 return cpu; 3594 } 3595 3596 void sched_set_stop_task(int cpu, struct task_struct *stop) 3597 { 3598 static struct lock_class_key stop_pi_lock; 3599 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 3600 struct task_struct *old_stop = cpu_rq(cpu)->stop; 3601 3602 if (stop) { 3603 /* 3604 * Make it appear like a SCHED_FIFO task, it's something 3605 * userspace knows about and won't get confused about. 3606 * 3607 * Also, it will make PI more or less work without too 3608 * much confusion -- but then, stop work should not 3609 * rely on PI working anyway. 3610 */ 3611 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 3612 3613 stop->sched_class = &stop_sched_class; 3614 3615 /* 3616 * The PI code calls rt_mutex_setprio() with ->pi_lock held to 3617 * adjust the effective priority of a task. As a result, 3618 * rt_mutex_setprio() can trigger (RT) balancing operations, 3619 * which can then trigger wakeups of the stop thread to push 3620 * around the current task. 3621 * 3622 * The stop task itself will never be part of the PI-chain, it 3623 * never blocks, therefore that ->pi_lock recursion is safe. 3624 * Tell lockdep about this by placing the stop->pi_lock in its 3625 * own class. 3626 */ 3627 lockdep_set_class(&stop->pi_lock, &stop_pi_lock); 3628 } 3629 3630 cpu_rq(cpu)->stop = stop; 3631 3632 if (old_stop) { 3633 /* 3634 * Reset it back to a normal scheduling class so that 3635 * it can die in pieces. 3636 */ 3637 old_stop->sched_class = &rt_sched_class; 3638 } 3639 } 3640 3641 static void 3642 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 3643 { 3644 struct rq *rq; 3645 3646 if (!schedstat_enabled()) 3647 return; 3648 3649 rq = this_rq(); 3650 3651 if (cpu == rq->cpu) { 3652 __schedstat_inc(rq->ttwu_local); 3653 __schedstat_inc(p->stats.nr_wakeups_local); 3654 } else { 3655 struct sched_domain *sd; 3656 3657 __schedstat_inc(p->stats.nr_wakeups_remote); 3658 3659 guard(rcu)(); 3660 for_each_domain(rq->cpu, sd) { 3661 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 3662 __schedstat_inc(sd->ttwu_wake_remote); 3663 break; 3664 } 3665 } 3666 } 3667 3668 if (wake_flags & WF_MIGRATED) 3669 __schedstat_inc(p->stats.nr_wakeups_migrate); 3670 3671 __schedstat_inc(rq->ttwu_count); 3672 __schedstat_inc(p->stats.nr_wakeups); 3673 3674 if (wake_flags & WF_SYNC) 3675 __schedstat_inc(p->stats.nr_wakeups_sync); 3676 } 3677 3678 /* 3679 * Mark the task runnable.
3680 */ 3681 static inline void ttwu_do_wakeup(struct task_struct *p) 3682 { 3683 WRITE_ONCE(p->__state, TASK_RUNNING); 3684 trace_sched_wakeup(p); 3685 } 3686 3687 void update_rq_avg_idle(struct rq *rq) 3688 { 3689 u64 delta = rq_clock(rq) - rq->idle_stamp; 3690 u64 max = 2*rq->max_idle_balance_cost; 3691 3692 update_avg(&rq->avg_idle, delta); 3693 3694 if (rq->avg_idle > max) 3695 rq->avg_idle = max; 3696 rq->idle_stamp = 0; 3697 } 3698 3699 static void 3700 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, 3701 struct rq_flags *rf) 3702 { 3703 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; 3704 3705 lockdep_assert_rq_held(rq); 3706 3707 if (p->sched_contributes_to_load) 3708 rq->nr_uninterruptible--; 3709 3710 if (wake_flags & WF_RQ_SELECTED) 3711 en_flags |= ENQUEUE_RQ_SELECTED; 3712 if (wake_flags & WF_MIGRATED) 3713 en_flags |= ENQUEUE_MIGRATED; 3714 else 3715 if (p->in_iowait) { 3716 delayacct_blkio_end(p); 3717 atomic_dec(&task_rq(p)->nr_iowait); 3718 } 3719 3720 activate_task(rq, p, en_flags); 3721 wakeup_preempt(rq, p, wake_flags); 3722 3723 ttwu_do_wakeup(p); 3724 3725 if (p->sched_class->task_woken) { 3726 /* 3727 * Our task @p is fully woken up and running; so it's safe to 3728 * drop the rq->lock, hereafter rq is only used for statistics. 3729 */ 3730 rq_unpin_lock(rq, rf); 3731 p->sched_class->task_woken(rq, p); 3732 rq_repin_lock(rq, rf); 3733 } 3734 } 3735 3736 /* 3737 * Consider @p being inside a wait loop: 3738 * 3739 * for (;;) { 3740 * set_current_state(TASK_UNINTERRUPTIBLE); 3741 * 3742 * if (CONDITION) 3743 * break; 3744 * 3745 * schedule(); 3746 * } 3747 * __set_current_state(TASK_RUNNING); 3748 * 3749 * between set_current_state() and schedule(). In this case @p is still 3750 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in 3751 * an atomic manner. 3752 * 3753 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq 3754 * then schedule() must still happen and p->state can be changed to 3755 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we 3756 * need to do a full wakeup with enqueue. 3757 * 3758 * Returns: %true when the wakeup is done, 3759 * %false otherwise. 3760 */ 3761 static int ttwu_runnable(struct task_struct *p, int wake_flags) 3762 { 3763 struct rq_flags rf; 3764 struct rq *rq; 3765 int ret = 0; 3766 3767 rq = __task_rq_lock(p, &rf); 3768 if (task_on_rq_queued(p)) { 3769 update_rq_clock(rq); 3770 if (p->se.sched_delayed) 3771 enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); 3772 if (!task_on_cpu(rq, p)) { 3773 /* 3774 * When on_rq && !on_cpu the task is preempted, see if 3775 * it should preempt the task that is current now. 3776 */ 3777 wakeup_preempt(rq, p, wake_flags); 3778 } 3779 ttwu_do_wakeup(p); 3780 ret = 1; 3781 } 3782 __task_rq_unlock(rq, p, &rf); 3783 3784 return ret; 3785 } 3786 3787 void sched_ttwu_pending(void *arg) 3788 { 3789 struct llist_node *llist = arg; 3790 struct rq *rq = this_rq(); 3791 struct task_struct *p, *t; 3792 struct rq_flags rf; 3793 3794 if (!llist) 3795 return; 3796 3797 rq_lock_irqsave(rq, &rf); 3798 update_rq_clock(rq); 3799 3800 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { 3801 if (WARN_ON_ONCE(p->on_cpu)) 3802 smp_cond_load_acquire(&p->on_cpu, !VAL); 3803 3804 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) 3805 set_task_cpu(p, cpu_of(rq)); 3806 3807 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? 
WF_MIGRATED : 0, &rf); 3808 } 3809 3810 /* 3811 * Must be after enqueueing at least one task such that 3812 * idle_cpu() does not observe a false-negative -- if it does, 3813 * it is possible for select_idle_sibling() to stack a number 3814 * of tasks on this CPU during that window. 3815 * 3816 * It is OK to clear ttwu_pending when another task is pending. 3817 * We will receive an IPI after local IRQs are enabled and then enqueue it. 3818 * Since now nr_running > 0, idle_cpu() will always get the correct result. 3819 */ 3820 WRITE_ONCE(rq->ttwu_pending, 0); 3821 rq_unlock_irqrestore(rq, &rf); 3822 } 3823 3824 /* 3825 * Prepare the scene for sending an IPI for a remote smp_call 3826 * 3827 * Returns true if the caller can proceed with sending the IPI. 3828 * Returns false otherwise. 3829 */ 3830 bool call_function_single_prep_ipi(int cpu) 3831 { 3832 if (set_nr_if_polling(cpu_rq(cpu)->idle)) { 3833 trace_sched_wake_idle_without_ipi(cpu); 3834 return false; 3835 } 3836 3837 return true; 3838 } 3839 3840 /* 3841 * Queue a task on the target CPU's wake_list and wake the CPU via IPI if 3842 * necessary. The wakee CPU on receipt of the IPI will queue the task 3843 * via sched_ttwu_pending() for activation so the wakee incurs the cost 3844 * of the wakeup instead of the waker. 3845 */ 3846 static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) 3847 { 3848 struct rq *rq = cpu_rq(cpu); 3849 3850 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); 3851 3852 WRITE_ONCE(rq->ttwu_pending, 1); 3853 #ifdef CONFIG_SMP 3854 __smp_call_single_queue(cpu, &p->wake_entry.llist); 3855 #endif 3856 } 3857 3858 void wake_up_if_idle(int cpu) 3859 { 3860 struct rq *rq = cpu_rq(cpu); 3861 3862 guard(rcu)(); 3863 if (is_idle_task(rcu_dereference(rq->curr))) { 3864 guard(rq_lock_irqsave)(rq); 3865 if (is_idle_task(rq->curr)) 3866 resched_curr(rq); 3867 } 3868 } 3869 3870 bool cpus_equal_capacity(int this_cpu, int that_cpu) 3871 { 3872 if (!sched_asym_cpucap_active()) 3873 return true; 3874 3875 if (this_cpu == that_cpu) 3876 return true; 3877 3878 return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu); 3879 } 3880 3881 bool cpus_share_cache(int this_cpu, int that_cpu) 3882 { 3883 if (this_cpu == that_cpu) 3884 return true; 3885 3886 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 3887 } 3888 3889 /* 3890 * Whether CPUs share cache resources, i.e. the LLC on non-cluster 3891 * machines, and the LLC tag or L2 on machines with clusters. 3892 */ 3893 bool cpus_share_resources(int this_cpu, int that_cpu) 3894 { 3895 if (this_cpu == that_cpu) 3896 return true; 3897 3898 return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); 3899 } 3900 3901 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) 3902 { 3903 int this_cpu = smp_processor_id(); 3904 3905 /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ 3906 if (!scx_allow_ttwu_queue(p)) 3907 return false; 3908 3909 #ifdef CONFIG_SMP 3910 if (p->sched_class == &stop_sched_class) 3911 return false; 3912 #endif 3913 3914 /* 3915 * Do not complicate things with the async wake_list while the CPU is 3916 * in hotplug state. 3917 */ 3918 if (!cpu_active(cpu)) 3919 return false; 3920 3921 /* Ensure the task will still be allowed to run on the CPU. */ 3922 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 3923 return false; 3924 3925 /* 3926 * If the CPU does not share cache, then queue the task on the 3927 * remote rq's wakelist to avoid accessing remote data.
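 *
 * E.g. (illustrative): on a two-socket machine a wakeup that crosses
 * the socket boundary uses the wakelist plus an IPI, so the waker
 * never touches the remote rq's cachelines directly.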
3928 */ 3929 if (!cpus_share_cache(this_cpu, cpu)) 3930 return true; 3931 3932 if (cpu == this_cpu) 3933 return false; 3934 3935 /* 3936 * If the wakee CPU is idle, or the task is descheduling and is the 3937 * only running task on the CPU, then use the wakelist to offload 3938 * the task activation to the idle (or soon-to-be-idle) CPU as 3939 * the current CPU is likely busy. nr_running is checked to 3940 * avoid unnecessary task stacking. 3941 * 3942 * Note that we can only get here with (wakee) p->on_rq=0; 3943 * p->on_cpu can be whatever, but since we've done the dequeue, 3944 * the wakee has already been accounted out of ->nr_running. 3945 */ 3946 if (!cpu_rq(cpu)->nr_running) 3947 return true; 3948 3949 return false; 3950 } 3951 3952 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) 3953 { 3954 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) { 3955 sched_clock_cpu(cpu); /* Sync clocks across CPUs */ 3956 __ttwu_queue_wakelist(p, cpu, wake_flags); 3957 return true; 3958 } 3959 3960 return false; 3961 } 3962 3963 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) 3964 { 3965 struct rq *rq = cpu_rq(cpu); 3966 struct rq_flags rf; 3967 3968 if (ttwu_queue_wakelist(p, cpu, wake_flags)) 3969 return; 3970 3971 rq_lock(rq, &rf); 3972 update_rq_clock(rq); 3973 ttwu_do_activate(rq, p, wake_flags, &rf); 3974 rq_unlock(rq, &rf); 3975 } 3976 3977 /* 3978 * Invoked from try_to_wake_up() to check whether the task can be woken up. 3979 * 3980 * The caller holds p::pi_lock if p != current or has preemption 3981 * disabled when p == current. 3982 * 3983 * The rules of saved_state: 3984 * 3985 * The related locking code always holds p::pi_lock when updating 3986 * p::saved_state, which means the code is fully serialized in both cases. 3987 * 3988 * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. 3989 * No other bits set. This allows us to distinguish all wakeup scenarios. 3990 * 3991 * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This 3992 * allows us to prevent early wakeup of tasks before they can be run on 3993 * asymmetric ISA architectures (e.g. ARMv9). 3994 */ 3995 static __always_inline 3996 bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) 3997 { 3998 int match; 3999 4000 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { 4001 WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && 4002 state != TASK_RTLOCK_WAIT); 4003 } 4004 4005 *success = !!(match = __task_state_match(p, state)); 4006 4007 /* 4008 * Saved state preserves the task state across blocking on 4009 * an RT lock or TASK_FREEZABLE tasks. If the state matches, 4010 * set p::saved_state to TASK_RUNNING, but do not wake the task 4011 * because it waits for a lock wakeup or __thaw_task(). Also 4012 * indicate success because from the regular waker's point of 4013 * view this has succeeded. 4014 * 4015 * After acquiring the lock the task will restore p::__state 4016 * from p::saved_state which ensures that the regular 4017 * wakeup is not lost. The restore will also set 4018 * p::saved_state to TASK_RUNNING so any further tests will 4019 * not result in false positives vs. @success. 4020 */ 4021 if (match < 0) 4022 p->saved_state = TASK_RUNNING; 4023 4024 return match > 0; 4025 } 4026 4027 /* 4028 * Notes on Program-Order guarantees on SMP systems.
4029 * 4030 * MIGRATION 4031 * 4032 * The basic program-order guarantee on SMP systems is that when a task [t] 4033 * migrates, all its activity on its old CPU [c0] happens-before any subsequent 4034 * execution on its new CPU [c1]. 4035 * 4036 * For migration (of runnable tasks) this is provided by the following means: 4037 * 4038 * A) UNLOCK of the rq(c0)->lock scheduling out task t 4039 * B) migration for t is required to synchronize *both* rq(c0)->lock and 4040 * rq(c1)->lock (if not at the same time, then in that order). 4041 * C) LOCK of the rq(c1)->lock scheduling in task 4042 * 4043 * Release/acquire chaining guarantees that B happens after A and C after B. 4044 * Note: the CPU doing B need not be c0 or c1 4045 * 4046 * Example: 4047 * 4048 * CPU0 CPU1 CPU2 4049 * 4050 * LOCK rq(0)->lock 4051 * sched-out X 4052 * sched-in Y 4053 * UNLOCK rq(0)->lock 4054 * 4055 * LOCK rq(0)->lock // orders against CPU0 4056 * dequeue X 4057 * UNLOCK rq(0)->lock 4058 * 4059 * LOCK rq(1)->lock 4060 * enqueue X 4061 * UNLOCK rq(1)->lock 4062 * 4063 * LOCK rq(1)->lock // orders against CPU2 4064 * sched-out Z 4065 * sched-in X 4066 * UNLOCK rq(1)->lock 4067 * 4068 * 4069 * BLOCKING -- aka. SLEEP + WAKEUP 4070 * 4071 * For blocking we (obviously) need to provide the same guarantee as for 4072 * migration. However the means are completely different as there is no lock 4073 * chain to provide order. Instead we do: 4074 * 4075 * 1) smp_store_release(X->on_cpu, 0) -- finish_task() 4076 * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() 4077 * 4078 * Example: 4079 * 4080 * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) 4081 * 4082 * LOCK rq(0)->lock LOCK X->pi_lock 4083 * dequeue X 4084 * sched-out X 4085 * smp_store_release(X->on_cpu, 0); 4086 * 4087 * smp_cond_load_acquire(&X->on_cpu, !VAL); 4088 * X->state = WAKING 4089 * set_task_cpu(X,2) 4090 * 4091 * LOCK rq(2)->lock 4092 * enqueue X 4093 * X->state = RUNNING 4094 * UNLOCK rq(2)->lock 4095 * 4096 * LOCK rq(2)->lock // orders against CPU1 4097 * sched-out Z 4098 * sched-in X 4099 * UNLOCK rq(2)->lock 4100 * 4101 * UNLOCK X->pi_lock 4102 * UNLOCK rq(0)->lock 4103 * 4104 * 4105 * However, for wakeups there is a second guarantee we must provide, namely we 4106 * must ensure that CONDITION=1 done by the caller can not be reordered with 4107 * accesses to the task state; see try_to_wake_up() and set_current_state(). 4108 */ 4109 4110 /** 4111 * try_to_wake_up - wake up a thread 4112 * @p: the thread to be awakened 4113 * @state: the mask of task states that can be woken 4114 * @wake_flags: wake modifier flags (WF_*) 4115 * 4116 * Conceptually does: 4117 * 4118 * If (@state & @p->state) @p->state = TASK_RUNNING. 4119 * 4120 * If the task was not queued/runnable, also place it back on a runqueue. 4121 * 4122 * This function is atomic against schedule() which would dequeue the task. 4123 * 4124 * It issues a full memory barrier before accessing @p->state, see the comment 4125 * with set_current_state(). 4126 * 4127 * Uses p->pi_lock to serialize against concurrent wake-ups. 4128 * 4129 * Relies on p->pi_lock stabilizing: 4130 * - p->sched_class 4131 * - p->cpus_ptr 4132 * - p->sched_task_group 4133 * in order to do migration, see its use of select_task_rq()/set_task_cpu(). 4134 * 4135 * Tries really hard to only take one task_rq(p)->lock for performance. 
4136 * Takes rq->lock in: 4137 * - ttwu_runnable() -- old rq, unavoidable, see comment there; 4138 * - ttwu_queue() -- new rq, for enqueue of the task; 4139 * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. 4140 * 4141 * As a consequence we race really badly with just about everything. See the 4142 * many memory barriers and their comments for details. 4143 * 4144 * Return: %true if @p->state changes (an actual wakeup was done), 4145 * %false otherwise. 4146 */ 4147 int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 4148 { 4149 guard(preempt)(); 4150 int cpu, success = 0; 4151 4152 wake_flags |= WF_TTWU; 4153 4154 if (p == current) { 4155 /* 4156 * We're waking current, this means 'p->on_rq' and 'task_cpu(p) 4157 * == smp_processor_id()'. Together this means we can special 4158 * case the whole 'p->on_rq && ttwu_runnable()' case below 4159 * without taking any locks. 4160 * 4161 * Specifically, given current runs ttwu() we must be before 4162 * schedule()'s block_task(), as such this must not observe 4163 * sched_delayed. 4164 * 4165 * In particular: 4166 * - we rely on Program-Order guarantees for all the ordering, 4167 * - we're serialized against set_special_state() by virtue of 4168 * it disabling IRQs (this allows not taking ->pi_lock). 4169 */ 4170 WARN_ON_ONCE(p->se.sched_delayed); 4171 if (!ttwu_state_match(p, state, &success)) 4172 goto out; 4173 4174 trace_sched_waking(p); 4175 ttwu_do_wakeup(p); 4176 goto out; 4177 } 4178 4179 /* 4180 * If we are going to wake up a thread waiting for CONDITION we 4181 * need to ensure that CONDITION=1 done by the caller can not be 4182 * reordered with p->state check below. This pairs with smp_store_mb() 4183 * in set_current_state() that the waiting thread does. 4184 */ 4185 scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { 4186 smp_mb__after_spinlock(); 4187 if (!ttwu_state_match(p, state, &success)) 4188 break; 4189 4190 trace_sched_waking(p); 4191 4192 /* 4193 * Ensure we load p->on_rq _after_ p->state, otherwise it would 4194 * be possible to, falsely, observe p->on_rq == 0 and get stuck 4195 * in smp_cond_load_acquire() below. 4196 * 4197 * sched_ttwu_pending() try_to_wake_up() 4198 * STORE p->on_rq = 1 LOAD p->state 4199 * UNLOCK rq->lock 4200 * 4201 * __schedule() (switch to task 'p') 4202 * LOCK rq->lock smp_rmb(); 4203 * smp_mb__after_spinlock(); 4204 * UNLOCK rq->lock 4205 * 4206 * [task p] 4207 * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq 4208 * 4209 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in 4210 * __schedule(). See the comment for smp_mb__after_spinlock(). 4211 * 4212 * A similar smp_rmb() lives in __task_needs_rq_lock(). 4213 */ 4214 smp_rmb(); 4215 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) 4216 break; 4217 4218 /* 4219 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be 4220 * possible to, falsely, observe p->on_cpu == 0. 4221 * 4222 * One must be running (->on_cpu == 1) in order to remove oneself 4223 * from the runqueue. 4224 * 4225 * __schedule() (switch to task 'p') try_to_wake_up() 4226 * STORE p->on_cpu = 1 LOAD p->on_rq 4227 * UNLOCK rq->lock 4228 * 4229 * __schedule() (put 'p' to sleep) 4230 * LOCK rq->lock smp_rmb(); 4231 * smp_mb__after_spinlock(); 4232 * STORE p->on_rq = 0 LOAD p->on_cpu 4233 * 4234 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in 4235 * __schedule(). See the comment for smp_mb__after_spinlock(). 
4236 * 4237 * Form a control-dep-acquire with p->on_rq == 0 above, to ensure 4238 * schedule()'s block_task() has 'happened' and p will no longer 4239 * care about its own p->state. See the comment in __schedule(). 4240 */ 4241 smp_acquire__after_ctrl_dep(); 4242 4243 /* 4244 * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq 4245 * == 0), which means we need to do an enqueue, so change p->state to 4246 * TASK_WAKING such that we can unlock p->pi_lock before doing the 4247 * enqueue, as in ttwu_queue_wakelist(). 4248 */ 4249 WRITE_ONCE(p->__state, TASK_WAKING); 4250 4251 /* 4252 * If the owning (remote) CPU is still in the middle of schedule() with 4253 * this task as prev, consider queueing p on the remote CPU's wake_list 4254 * which potentially sends an IPI instead of spinning on p->on_cpu to 4255 * let the waker make forward progress. This is safe because IRQs are 4256 * disabled and the IPI will deliver after on_cpu is cleared. 4257 * 4258 * Ensure we load task_cpu(p) after p->on_cpu: 4259 * 4260 * set_task_cpu(p, cpu); 4261 * STORE p->cpu = @cpu 4262 * __schedule() (switch to task 'p') 4263 * LOCK rq->lock 4264 * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) 4265 * STORE p->on_cpu = 1 LOAD p->cpu 4266 * 4267 * to ensure we observe the correct CPU on which the task is currently 4268 * scheduling. 4269 */ 4270 if (smp_load_acquire(&p->on_cpu) && 4271 ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) 4272 break; 4273 4274 /* 4275 * If the owning (remote) CPU is still in the middle of schedule() with 4276 * this task as prev, wait until it's done referencing the task. 4277 * 4278 * Pairs with the smp_store_release() in finish_task(). 4279 * 4280 * This ensures that tasks getting woken will be fully ordered against 4281 * their previous state and preserve Program Order. 4282 */ 4283 smp_cond_load_acquire(&p->on_cpu, !VAL); 4284 4285 cpu = select_task_rq(p, p->wake_cpu, &wake_flags); 4286 if (task_cpu(p) != cpu) { 4287 if (p->in_iowait) { 4288 delayacct_blkio_end(p); 4289 atomic_dec(&task_rq(p)->nr_iowait); 4290 } 4291 4292 wake_flags |= WF_MIGRATED; 4293 psi_ttwu_dequeue(p); 4294 set_task_cpu(p, cpu); 4295 } 4296 4297 ttwu_queue(p, cpu, wake_flags); 4298 } 4299 out: 4300 if (success) 4301 ttwu_stat(p, task_cpu(p), wake_flags); 4302 4303 return success; 4304 } 4305 4306 static bool __task_needs_rq_lock(struct task_struct *p) 4307 { 4308 unsigned int state = READ_ONCE(p->__state); 4309 4310 /* 4311 * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when 4312 * the task is blocked. Make sure to check @state since ttwu() can drop 4313 * locks at the end, see ttwu_queue_wakelist(). 4314 */ 4315 if (state == TASK_RUNNING || state == TASK_WAKING) 4316 return true; 4317 4318 /* 4319 * Ensure we load p->on_rq after p->__state, otherwise it would be 4320 * possible to, falsely, observe p->on_rq == 0. 4321 * 4322 * See try_to_wake_up() for a longer comment. 4323 */ 4324 smp_rmb(); 4325 if (p->on_rq) 4326 return true; 4327 4328 /* 4329 * Ensure the task has finished __schedule() and will not be referenced 4330 * anymore. Again, see try_to_wake_up() for a longer comment. 4331 */ 4332 smp_rmb(); 4333 smp_cond_load_acquire(&p->on_cpu, !VAL); 4334 4335 return false; 4336 } 4337 4338 /** 4339 * task_call_func - Invoke a function on task in fixed state 4340 * @p: Process for which the function is to be invoked, can be @current. 4341 * @func: Function to invoke. 4342 * @arg: Argument to function.
4343 * 4344 * Fix the task in its current state by avoiding wakeups and/or rq operations, 4345 * and call @func(@arg) on it. This function can use task_is_runnable() and 4346 * task_curr() to work out what the state is, if required. Given that @func 4347 * can be invoked with a runqueue lock held, it had better be quite 4348 * lightweight. 4349 * 4350 * Returns: 4351 * Whatever @func returns 4352 */ 4353 int task_call_func(struct task_struct *p, task_call_f func, void *arg) 4354 { 4355 struct rq_flags rf; 4356 int ret; 4357 4358 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 4359 4360 if (__task_needs_rq_lock(p)) { 4361 struct rq *rq = __task_rq_lock(p, &rf); 4362 4363 /* 4364 * At this point the task is pinned; either: 4365 * - blocked and we're holding off wakeups (pi->lock) 4366 * - woken, and we're holding off enqueue (rq->lock) 4367 * - queued, and we're holding off schedule (rq->lock) 4368 * - running, and we're holding off de-schedule (rq->lock) 4369 * 4370 * The called function (@func) can use: task_curr(), p->on_rq and 4371 * p->__state to differentiate between these states. 4372 */ 4373 ret = func(p, arg); 4374 4375 __task_rq_unlock(rq, p, &rf); 4376 } else { 4377 ret = func(p, arg); 4378 } 4379 4380 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 4381 return ret; 4382 } 4383 4384 /** 4385 * cpu_curr_snapshot - Return a snapshot of the currently running task 4386 * @cpu: The CPU on which to snapshot the task. 4387 * 4388 * Returns the task_struct pointer of the task "currently" running on 4389 * the specified CPU. 4390 * 4391 * If the specified CPU was offline, the return value is whatever it 4392 * is, perhaps a pointer to the task_struct structure of that CPU's idle 4393 * task, but there is no guarantee. Callers wishing a useful return 4394 * value must take some action to ensure that the specified CPU remains 4395 * online throughout. 4396 * 4397 * This function executes full memory barriers before and after fetching 4398 * the pointer, which permits the caller to confine this function's fetch 4399 * with respect to the caller's accesses to other shared variables. 4400 */ 4401 struct task_struct *cpu_curr_snapshot(int cpu) 4402 { 4403 struct rq *rq = cpu_rq(cpu); 4404 struct task_struct *t; 4405 struct rq_flags rf; 4406 4407 rq_lock_irqsave(rq, &rf); 4408 smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */ 4409 t = rcu_dereference(cpu_curr(cpu)); 4410 rq_unlock_irqrestore(rq, &rf); 4411 smp_mb(); /* Pairing determined by caller's synchronization design. */ 4412 4413 return t; 4414 } 4415 4416 /** 4417 * wake_up_process - Wake up a specific process 4418 * @p: The process to be woken up. 4419 * 4420 * Attempt to wake up the nominated process and move it to the set of runnable 4421 * processes. 4422 * 4423 * Return: 1 if the process was woken up, 0 if it was already running. 4424 * 4425 * This function executes a full memory barrier before accessing the task state. 4426 */ 4427 int wake_up_process(struct task_struct *p) 4428 { 4429 return try_to_wake_up(p, TASK_NORMAL, 0); 4430 } 4431 EXPORT_SYMBOL(wake_up_process); 4432 4433 int wake_up_state(struct task_struct *p, unsigned int state) 4434 { 4435 return try_to_wake_up(p, state, 0); 4436 } 4437 4438 /* 4439 * Perform scheduler-related setup for a newly forked process p. 4440 * p is forked by current. 4441 * 4442 * __sched_fork() is basic setup which is also used by sched_init() to 4443 * initialize the boot CPU's idle task.
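* sched_fork() below then completes the scheduler setup for tasks created via fork()/clone().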
4444 */ 4445 static void __sched_fork(u64 clone_flags, struct task_struct *p) 4446 { 4447 p->on_rq = 0; 4448 4449 p->se.on_rq = 0; 4450 p->se.exec_start = 0; 4451 p->se.sum_exec_runtime = 0; 4452 p->se.prev_sum_exec_runtime = 0; 4453 p->se.nr_migrations = 0; 4454 p->se.vruntime = 0; 4455 p->se.vlag = 0; 4456 INIT_LIST_HEAD(&p->se.group_node); 4457 4458 /* A delayed task cannot be in clone(). */ 4459 WARN_ON_ONCE(p->se.sched_delayed); 4460 4461 #ifdef CONFIG_FAIR_GROUP_SCHED 4462 p->se.cfs_rq = NULL; 4463 #ifdef CONFIG_CFS_BANDWIDTH 4464 init_cfs_throttle_work(p); 4465 #endif 4466 #endif 4467 4468 #ifdef CONFIG_SCHEDSTATS 4469 /* Even if schedstat is disabled, there should not be garbage */ 4470 memset(&p->stats, 0, sizeof(p->stats)); 4471 #endif 4472 4473 init_dl_entity(&p->dl); 4474 4475 INIT_LIST_HEAD(&p->rt.run_list); 4476 p->rt.timeout = 0; 4477 p->rt.time_slice = sched_rr_timeslice; 4478 p->rt.on_rq = 0; 4479 p->rt.on_list = 0; 4480 4481 #ifdef CONFIG_SCHED_CLASS_EXT 4482 init_scx_entity(&p->scx); 4483 #endif 4484 4485 #ifdef CONFIG_PREEMPT_NOTIFIERS 4486 INIT_HLIST_HEAD(&p->preempt_notifiers); 4487 #endif 4488 4489 #ifdef CONFIG_COMPACTION 4490 p->capture_control = NULL; 4491 #endif 4492 init_numa_balancing(clone_flags, p); 4493 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4494 p->migration_pending = NULL; 4495 } 4496 4497 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); 4498 4499 #ifdef CONFIG_NUMA_BALANCING 4500 4501 int sysctl_numa_balancing_mode; 4502 4503 static void __set_numabalancing_state(bool enabled) 4504 { 4505 if (enabled) 4506 static_branch_enable(&sched_numa_balancing); 4507 else 4508 static_branch_disable(&sched_numa_balancing); 4509 } 4510 4511 void set_numabalancing_state(bool enabled) 4512 { 4513 if (enabled) 4514 sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; 4515 else 4516 sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; 4517 __set_numabalancing_state(enabled); 4518 } 4519 4520 #ifdef CONFIG_PROC_SYSCTL 4521 static void reset_memory_tiering(void) 4522 { 4523 struct pglist_data *pgdat; 4524 4525 for_each_online_pgdat(pgdat) { 4526 pgdat->nbp_threshold = 0; 4527 pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); 4528 pgdat->nbp_th_start = jiffies_to_msecs(jiffies); 4529 } 4530 } 4531 4532 static int sysctl_numa_balancing(const struct ctl_table *table, int write, 4533 void *buffer, size_t *lenp, loff_t *ppos) 4534 { 4535 struct ctl_table t; 4536 int err; 4537 int state = sysctl_numa_balancing_mode; 4538 4539 if (write && !capable(CAP_SYS_ADMIN)) 4540 return -EPERM; 4541 4542 t = *table; 4543 t.data = &state; 4544 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 4545 if (err < 0) 4546 return err; 4547 if (write) { 4548 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && 4549 (state & NUMA_BALANCING_MEMORY_TIERING)) 4550 reset_memory_tiering(); 4551 sysctl_numa_balancing_mode = state; 4552 __set_numabalancing_state(state); 4553 } 4554 return err; 4555 } 4556 #endif /* CONFIG_PROC_SYSCTL */ 4557 #endif /* CONFIG_NUMA_BALANCING */ 4558 4559 #ifdef CONFIG_SCHEDSTATS 4560 4561 DEFINE_STATIC_KEY_FALSE(sched_schedstats); 4562 4563 static void set_schedstats(bool enabled) 4564 { 4565 if (enabled) 4566 static_branch_enable(&sched_schedstats); 4567 else 4568 static_branch_disable(&sched_schedstats); 4569 } 4570 4571 void force_schedstat_enabled(void) 4572 { 4573 if (!schedstat_enabled()) { 4574 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); 4575 static_branch_enable(&sched_schedstats); 4576 
} 4577 } 4578 4579 static int __init setup_schedstats(char *str) 4580 { 4581 int ret = 0; 4582 if (!str) 4583 goto out; 4584 4585 if (!strcmp(str, "enable")) { 4586 set_schedstats(true); 4587 ret = 1; 4588 } else if (!strcmp(str, "disable")) { 4589 set_schedstats(false); 4590 ret = 1; 4591 } 4592 out: 4593 if (!ret) 4594 pr_warn("Unable to parse schedstats=\n"); 4595 4596 return ret; 4597 } 4598 __setup("schedstats=", setup_schedstats); 4599 4600 #ifdef CONFIG_PROC_SYSCTL 4601 static int sysctl_schedstats(const struct ctl_table *table, int write, void *buffer, 4602 size_t *lenp, loff_t *ppos) 4603 { 4604 struct ctl_table t; 4605 int err; 4606 int state = static_branch_likely(&sched_schedstats); 4607 4608 if (write && !capable(CAP_SYS_ADMIN)) 4609 return -EPERM; 4610 4611 t = *table; 4612 t.data = &state; 4613 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 4614 if (err < 0) 4615 return err; 4616 if (write) 4617 set_schedstats(state); 4618 return err; 4619 } 4620 #endif /* CONFIG_PROC_SYSCTL */ 4621 #endif /* CONFIG_SCHEDSTATS */ 4622 4623 #ifdef CONFIG_SYSCTL 4624 static const struct ctl_table sched_core_sysctls[] = { 4625 #ifdef CONFIG_SCHEDSTATS 4626 { 4627 .procname = "sched_schedstats", 4628 .data = NULL, 4629 .maxlen = sizeof(unsigned int), 4630 .mode = 0644, 4631 .proc_handler = sysctl_schedstats, 4632 .extra1 = SYSCTL_ZERO, 4633 .extra2 = SYSCTL_ONE, 4634 }, 4635 #endif /* CONFIG_SCHEDSTATS */ 4636 #ifdef CONFIG_UCLAMP_TASK 4637 { 4638 .procname = "sched_util_clamp_min", 4639 .data = &sysctl_sched_uclamp_util_min, 4640 .maxlen = sizeof(unsigned int), 4641 .mode = 0644, 4642 .proc_handler = sysctl_sched_uclamp_handler, 4643 }, 4644 { 4645 .procname = "sched_util_clamp_max", 4646 .data = &sysctl_sched_uclamp_util_max, 4647 .maxlen = sizeof(unsigned int), 4648 .mode = 0644, 4649 .proc_handler = sysctl_sched_uclamp_handler, 4650 }, 4651 { 4652 .procname = "sched_util_clamp_min_rt_default", 4653 .data = &sysctl_sched_uclamp_util_min_rt_default, 4654 .maxlen = sizeof(unsigned int), 4655 .mode = 0644, 4656 .proc_handler = sysctl_sched_uclamp_handler, 4657 }, 4658 #endif /* CONFIG_UCLAMP_TASK */ 4659 #ifdef CONFIG_NUMA_BALANCING 4660 { 4661 .procname = "numa_balancing", 4662 .data = NULL, /* filled in by handler */ 4663 .maxlen = sizeof(unsigned int), 4664 .mode = 0644, 4665 .proc_handler = sysctl_numa_balancing, 4666 .extra1 = SYSCTL_ZERO, 4667 .extra2 = SYSCTL_FOUR, 4668 }, 4669 #endif /* CONFIG_NUMA_BALANCING */ 4670 }; 4671 static int __init sched_core_sysctl_init(void) 4672 { 4673 register_sysctl_init("kernel", sched_core_sysctls); 4674 return 0; 4675 } 4676 late_initcall(sched_core_sysctl_init); 4677 #endif /* CONFIG_SYSCTL */ 4678 4679 /* 4680 * fork()/clone()-time setup: 4681 */ 4682 int sched_fork(u64 clone_flags, struct task_struct *p) 4683 { 4684 __sched_fork(clone_flags, p); 4685 /* 4686 * We mark the process as NEW here. This guarantees that 4687 * nobody will actually run it, and a signal or other external 4688 * event cannot wake it up and insert it on the runqueue either. 4689 */ 4690 p->__state = TASK_NEW; 4691 4692 /* 4693 * Make sure we do not leak PI boosting priority to the child. 4694 */ 4695 p->prio = current->normal_prio; 4696 4697 uclamp_fork(p); 4698 4699 /* 4700 * Revert to default priority/policy on fork if requested. 
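* For example, a SCHED_FIFO task that was set up from userspace with sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &param) forks children that start out as plain SCHED_NORMAL at nice 0, instead of inheriting the RT policy.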
4701 */ 4702 if (unlikely(p->sched_reset_on_fork)) { 4703 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 4704 p->policy = SCHED_NORMAL; 4705 p->static_prio = NICE_TO_PRIO(0); 4706 p->rt_priority = 0; 4707 } else if (PRIO_TO_NICE(p->static_prio) < 0) 4708 p->static_prio = NICE_TO_PRIO(0); 4709 4710 p->prio = p->normal_prio = p->static_prio; 4711 set_load_weight(p, false); 4712 p->se.custom_slice = 0; 4713 p->se.slice = sysctl_sched_base_slice; 4714 4715 /* 4716 * We don't need the reset flag anymore after the fork. It has 4717 * fulfilled its duty: 4718 */ 4719 p->sched_reset_on_fork = 0; 4720 } 4721 4722 if (dl_prio(p->prio)) 4723 return -EAGAIN; 4724 4725 scx_pre_fork(p); 4726 4727 if (rt_prio(p->prio)) { 4728 p->sched_class = &rt_sched_class; 4729 #ifdef CONFIG_SCHED_CLASS_EXT 4730 } else if (task_should_scx(p->policy)) { 4731 p->sched_class = &ext_sched_class; 4732 #endif 4733 } else { 4734 p->sched_class = &fair_sched_class; 4735 } 4736 4737 init_entity_runnable_average(&p->se); 4738 4739 4740 #ifdef CONFIG_SCHED_INFO 4741 if (likely(sched_info_on())) 4742 memset(&p->sched_info, 0, sizeof(p->sched_info)); 4743 #endif 4744 p->on_cpu = 0; 4745 init_task_preempt_count(p); 4746 plist_node_init(&p->pushable_tasks, MAX_PRIO); 4747 RB_CLEAR_NODE(&p->pushable_dl_tasks); 4748 4749 return 0; 4750 } 4751 4752 int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) 4753 { 4754 unsigned long flags; 4755 4756 /* 4757 * Because we're not yet on the pid-hash, p->pi_lock isn't strictly 4758 * required yet, but lockdep gets upset if rules are violated. 4759 */ 4760 raw_spin_lock_irqsave(&p->pi_lock, flags); 4761 #ifdef CONFIG_CGROUP_SCHED 4762 if (1) { 4763 struct task_group *tg; 4764 tg = container_of(kargs->cset->subsys[cpu_cgrp_id], 4765 struct task_group, css); 4766 tg = autogroup_task_group(p, tg); 4767 p->sched_task_group = tg; 4768 } 4769 #endif 4770 /* 4771 * We're setting the CPU for the first time, we don't migrate, 4772 * so use __set_task_cpu(). 4773 */ 4774 __set_task_cpu(p, smp_processor_id()); 4775 if (p->sched_class->task_fork) 4776 p->sched_class->task_fork(p); 4777 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4778 4779 return scx_fork(p); 4780 } 4781 4782 void sched_cancel_fork(struct task_struct *p) 4783 { 4784 scx_cancel_fork(p); 4785 } 4786 4787 static void sched_mm_cid_fork(struct task_struct *t); 4788 4789 void sched_post_fork(struct task_struct *p) 4790 { 4791 sched_mm_cid_fork(p); 4792 uclamp_post_fork(p); 4793 scx_post_fork(p); 4794 } 4795 4796 u64 to_ratio(u64 period, u64 runtime) 4797 { 4798 if (runtime == RUNTIME_INF) 4799 return BW_UNIT; 4800 4801 /* 4802 * Doing this here saves a lot of checks in all 4803 * the calling paths, and returning zero seems 4804 * safe for them anyway. 4805 */ 4806 if (period == 0) 4807 return 0; 4808 4809 return div64_u64(runtime << BW_SHIFT, period); 4810 } 4811 4812 /* 4813 * wake_up_new_task - wake up a newly created task for the first time. 4814 * 4815 * This function will do some initial scheduler statistics housekeeping 4816 * that must be done for every newly created context, then puts the task 4817 * on the runqueue and wakes it. 
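* Called once per new task from the fork path; see kernel_clone().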
*/ 4819 void wake_up_new_task(struct task_struct *p) 4820 { 4821 struct rq_flags rf; 4822 struct rq *rq; 4823 int wake_flags = WF_FORK; 4824 4825 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 4826 WRITE_ONCE(p->__state, TASK_RUNNING); 4827 /* 4828 * Fork balancing, do it here and not earlier because: 4829 * - cpus_ptr can change in the fork path 4830 * - any previously selected CPU might disappear through hotplug 4831 * 4832 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 4833 * as we're not fully set-up yet. 4834 */ 4835 p->recent_used_cpu = task_cpu(p); 4836 __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); 4837 rq = __task_rq_lock(p, &rf); 4838 update_rq_clock(rq); 4839 post_init_entity_util_avg(p); 4840 4841 activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); 4842 trace_sched_wakeup_new(p); 4843 wakeup_preempt(rq, p, wake_flags); 4844 if (p->sched_class->task_woken) { 4845 /* 4846 * Nothing relies on rq->lock after this, so it's fine to 4847 * drop it. 4848 */ 4849 rq_unpin_lock(rq, &rf); 4850 p->sched_class->task_woken(rq, p); 4851 rq_repin_lock(rq, &rf); 4852 } 4853 task_rq_unlock(rq, p, &rf); 4854 } 4855 4856 #ifdef CONFIG_PREEMPT_NOTIFIERS 4857 4858 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); 4859 4860 void preempt_notifier_inc(void) 4861 { 4862 static_branch_inc(&preempt_notifier_key); 4863 } 4864 EXPORT_SYMBOL_GPL(preempt_notifier_inc); 4865 4866 void preempt_notifier_dec(void) 4867 { 4868 static_branch_dec(&preempt_notifier_key); 4869 } 4870 EXPORT_SYMBOL_GPL(preempt_notifier_dec); 4871 4872 /** 4873 * preempt_notifier_register - tell me when current is being preempted & rescheduled 4874 * @notifier: notifier struct to register 4875 */ 4876 void preempt_notifier_register(struct preempt_notifier *notifier) 4877 { 4878 if (!static_branch_unlikely(&preempt_notifier_key)) 4879 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 4880 4881 hlist_add_head(&notifier->link, &current->preempt_notifiers); 4882 } 4883 EXPORT_SYMBOL_GPL(preempt_notifier_register); 4884 4885 /** 4886 * preempt_notifier_unregister - no longer interested in preemption notifications 4887 * @notifier: notifier struct to unregister 4888 * 4889 * This is *not* safe to call from within a preemption notifier.
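*
* A minimal, illustrative usage sketch from a module follows; my_pn, my_ops
* and the callbacks are made-up names, only the preempt_notifier_*() helpers
* and struct preempt_ops are real API:
*
*	static void my_sched_in(struct preempt_notifier *pn, int cpu)
*	{ }	// we were just scheduled in on @cpu
*
*	static void my_sched_out(struct preempt_notifier *pn,
*				 struct task_struct *next)
*	{ }	// we are being scheduled out in favour of @next
*
*	static struct preempt_ops my_ops = {
*		.sched_in  = my_sched_in,
*		.sched_out = my_sched_out,
*	};
*	static struct preempt_notifier my_pn;
*
*	preempt_notifier_inc();
*	preempt_notifier_init(&my_pn, &my_ops);
*	preempt_notifier_register(&my_pn);	// registers on current
*	...
*	preempt_notifier_unregister(&my_pn);
*	preempt_notifier_dec();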
*/ 4891 void preempt_notifier_unregister(struct preempt_notifier *notifier) 4892 { 4893 hlist_del(&notifier->link); 4894 } 4895 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 4896 4897 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) 4898 { 4899 struct preempt_notifier *notifier; 4900 4901 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 4902 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 4903 } 4904 4905 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 4906 { 4907 if (static_branch_unlikely(&preempt_notifier_key)) 4908 __fire_sched_in_preempt_notifiers(curr); 4909 } 4910 4911 static void 4912 __fire_sched_out_preempt_notifiers(struct task_struct *curr, 4913 struct task_struct *next) 4914 { 4915 struct preempt_notifier *notifier; 4916 4917 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 4918 notifier->ops->sched_out(notifier, next); 4919 } 4920 4921 static __always_inline void 4922 fire_sched_out_preempt_notifiers(struct task_struct *curr, 4923 struct task_struct *next) 4924 { 4925 if (static_branch_unlikely(&preempt_notifier_key)) 4926 __fire_sched_out_preempt_notifiers(curr, next); 4927 } 4928 4929 #else /* !CONFIG_PREEMPT_NOTIFIERS: */ 4930 4931 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 4932 { 4933 } 4934 4935 static inline void 4936 fire_sched_out_preempt_notifiers(struct task_struct *curr, 4937 struct task_struct *next) 4938 { 4939 } 4940 4941 #endif /* !CONFIG_PREEMPT_NOTIFIERS */ 4942 4943 static inline void prepare_task(struct task_struct *next) 4944 { 4945 /* 4946 * Claim the task as running, we do this before switching to it 4947 * such that any running task will have this set. 4948 * 4949 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and 4950 * its ordering comment. 4951 */ 4952 WRITE_ONCE(next->on_cpu, 1); 4953 } 4954 4955 static inline void finish_task(struct task_struct *prev) 4956 { 4957 /* 4958 * This must be the very last reference to @prev from this CPU. After 4959 * p->on_cpu is cleared, the task can be moved to a different CPU. We 4960 * must ensure this doesn't happen until the switch is completely 4961 * finished. 4962 * 4963 * In particular, the load of prev->state in finish_task_switch() must 4964 * happen before this. 4965 * 4966 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 4967 */ 4968 smp_store_release(&prev->on_cpu, 0); 4969 } 4970 4971 /* 4972 * Only called from __schedule() context. 4973 * 4974 * There are some cases where we are going to re-do the action 4975 * that added the balance callbacks. We may not be in a state 4976 * where we can run them, so just zap them so they can be 4977 * properly re-added the next time around. This is handled 4978 * similarly to actually running the callbacks, except we just 4979 * don't call them. 4980 */ 4981 static void zap_balance_callbacks(struct rq *rq) 4982 { 4983 struct balance_callback *next, *head; 4984 bool found = false; 4985 4986 lockdep_assert_rq_held(rq); 4987 4988 head = rq->balance_callback; 4989 while (head) { 4990 if (head == &balance_push_callback) 4991 found = true; 4992 next = head->next; 4993 head->next = NULL; 4994 head = next; 4995 } 4996 rq->balance_callback = found ?
&balance_push_callback : NULL; 4997 } 4998 4999 static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) 5000 { 5001 void (*func)(struct rq *rq); 5002 struct balance_callback *next; 5003 5004 lockdep_assert_rq_held(rq); 5005 5006 while (head) { 5007 func = (void (*)(struct rq *))head->func; 5008 next = head->next; 5009 head->next = NULL; 5010 head = next; 5011 5012 func(rq); 5013 } 5014 } 5015 5016 static void balance_push(struct rq *rq); 5017 5018 /* 5019 * balance_push_callback is a right abuse of the callback interface and plays 5020 * by significantly different rules. 5021 * 5022 * Where the normal balance_callback's purpose is to be run in the same context 5023 * that queued it (only later, when it's safe to drop rq->lock again), 5024 * balance_push_callback is specifically targeted at __schedule(). 5025 * 5026 * This abuse is tolerated because it places all the unlikely/odd cases behind 5027 * a single test, namely: rq->balance_callback == NULL. 5028 */ 5029 struct balance_callback balance_push_callback = { 5030 .next = NULL, 5031 .func = balance_push, 5032 }; 5033 5034 static inline struct balance_callback * 5035 __splice_balance_callbacks(struct rq *rq, bool split) 5036 { 5037 struct balance_callback *head = rq->balance_callback; 5038 5039 if (likely(!head)) 5040 return NULL; 5041 5042 lockdep_assert_rq_held(rq); 5043 /* 5044 * Must not take balance_push_callback off the list when 5045 * splice_balance_callbacks() and balance_callbacks() are not 5046 * in the same rq->lock section. 5047 * 5048 * In that case it would be possible for __schedule() to interleave 5049 * and observe the list empty. 5050 */ 5051 if (split && head == &balance_push_callback) 5052 head = NULL; 5053 else 5054 rq->balance_callback = NULL; 5055 5056 return head; 5057 } 5058 5059 struct balance_callback *splice_balance_callbacks(struct rq *rq) 5060 { 5061 return __splice_balance_callbacks(rq, true); 5062 } 5063 5064 void __balance_callbacks(struct rq *rq, struct rq_flags *rf) 5065 { 5066 if (rf) 5067 rq_unpin_lock(rq, rf); 5068 do_balance_callbacks(rq, __splice_balance_callbacks(rq, false)); 5069 if (rf) 5070 rq_repin_lock(rq, rf); 5071 } 5072 5073 void balance_callbacks(struct rq *rq, struct balance_callback *head) 5074 { 5075 unsigned long flags; 5076 5077 if (unlikely(head)) { 5078 raw_spin_rq_lock_irqsave(rq, flags); 5079 do_balance_callbacks(rq, head); 5080 raw_spin_rq_unlock_irqrestore(rq, flags); 5081 } 5082 } 5083 5084 static inline void 5085 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) 5086 __releases(__rq_lockp(rq)) 5087 __acquires(__rq_lockp(this_rq())) 5088 { 5089 /* 5090 * The runqueue lock will be released by the next 5091 * task (which is an invalid locking op, but in the case 5092 * of the scheduler it's an obvious special case), so we 5093 * do an early lockdep release here: 5094 */ 5095 rq_unpin_lock(rq, rf); 5096 spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); 5097 #ifdef CONFIG_DEBUG_SPINLOCK 5098 /* this is a valid case when another task releases the spinlock */ 5099 rq_lockp(rq)->owner = next; 5100 #endif 5101 /* 5102 * Model the rq reference switcheroo.
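* (This is a sparse-only annotation: we pretend to release this rq's lock and to acquire the lock of the rq we will find ourselves on after switch_to(); no actual locking happens here.)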
5103 */ 5104 __release(__rq_lockp(rq)); 5105 __acquire(__rq_lockp(this_rq())); 5106 } 5107 5108 static inline void finish_lock_switch(struct rq *rq) 5109 __releases(__rq_lockp(rq)) 5110 { 5111 /* 5112 * If we are tracking spinlock dependencies then we have to 5113 * fix up the runqueue lock - which gets 'carried over' from 5114 * prev into current: 5115 */ 5116 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); 5117 __balance_callbacks(rq, NULL); 5118 hrtick_schedule_exit(rq); 5119 raw_spin_rq_unlock_irq(rq); 5120 } 5121 5122 /* 5123 * NOP if the arch has not defined these: 5124 */ 5125 5126 #ifndef prepare_arch_switch 5127 # define prepare_arch_switch(next) do { } while (0) 5128 #endif 5129 5130 #ifndef finish_arch_post_lock_switch 5131 # define finish_arch_post_lock_switch() do { } while (0) 5132 #endif 5133 5134 static inline void kmap_local_sched_out(void) 5135 { 5136 #ifdef CONFIG_KMAP_LOCAL 5137 if (unlikely(current->kmap_ctrl.idx)) 5138 __kmap_local_sched_out(); 5139 #endif 5140 } 5141 5142 static inline void kmap_local_sched_in(void) 5143 { 5144 #ifdef CONFIG_KMAP_LOCAL 5145 if (unlikely(current->kmap_ctrl.idx)) 5146 __kmap_local_sched_in(); 5147 #endif 5148 } 5149 5150 /** 5151 * prepare_task_switch - prepare to switch tasks 5152 * @rq: the runqueue preparing to switch 5153 * @prev: the current task that is being switched out 5154 * @next: the task we are going to switch to. 5155 * 5156 * This is called with the rq lock held and interrupts off. It must 5157 * be paired with a subsequent finish_task_switch() after the context 5158 * switch. 5159 * 5160 * prepare_task_switch sets up locking and calls architecture-specific 5161 * hooks. 5162 */ 5163 static inline void 5164 prepare_task_switch(struct rq *rq, struct task_struct *prev, 5165 struct task_struct *next) 5166 __must_hold(__rq_lockp(rq)) 5167 { 5168 kcov_prepare_switch(prev); 5169 sched_info_switch(rq, prev, next); 5170 perf_event_task_sched_out(prev, next); 5171 fire_sched_out_preempt_notifiers(prev, next); 5172 kmap_local_sched_out(); 5173 prepare_task(next); 5174 prepare_arch_switch(next); 5175 } 5176 5177 /** 5178 * finish_task_switch - clean up after a task-switch 5179 * @prev: the thread we just switched away from. 5180 * 5181 * finish_task_switch must be called after the context switch, paired 5182 * with a prepare_task_switch call before the context switch. 5183 * finish_task_switch will reconcile locking set up by prepare_task_switch, 5184 * and do any other architecture-specific cleanup actions. 5185 * 5186 * Note that we may have delayed dropping an mm in context_switch(). If 5187 * so, we finish that here outside of the runqueue lock. (Doing it 5188 * with the lock held can cause deadlocks; see schedule() for 5189 * details.) 5190 * 5191 * The context switch has flipped the stack from under us and restored the 5192 * local variables which were saved when this task called schedule() in the 5193 * past. 'prev == current' is still correct but we need to recalculate this_rq 5194 * because prev may have moved to another CPU.
5195 */ 5196 static struct rq *finish_task_switch(struct task_struct *prev) 5197 __releases(__rq_lockp(this_rq())) 5198 { 5199 struct rq *rq = this_rq(); 5200 struct mm_struct *mm = rq->prev_mm; 5201 unsigned int prev_state; 5202 5203 /* 5204 * The previous task will have left us with a preempt_count of 2 5205 * because it left us after: 5206 * 5207 * schedule() 5208 * preempt_disable(); // 1 5209 * __schedule() 5210 * raw_spin_lock_irq(&rq->lock) // 2 5211 * 5212 * Also, see FORK_PREEMPT_COUNT. 5213 */ 5214 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, 5215 "corrupted preempt_count: %s/%d/0x%x\n", 5216 current->comm, current->pid, preempt_count())) 5217 preempt_count_set(FORK_PREEMPT_COUNT); 5218 5219 rq->prev_mm = NULL; 5220 5221 /* 5222 * A task struct has one reference for the use as "current". 5223 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 5224 * schedule() one last time. The schedule() call will never return, and 5225 * the scheduled task must drop that reference. 5226 * 5227 * We must observe prev->state before clearing prev->on_cpu (in 5228 * finish_task), otherwise a concurrent wakeup can get prev 5229 * running on another CPU and we could race with its RUNNING -> DEAD 5230 * transition, resulting in a double drop. 5231 */ 5232 prev_state = READ_ONCE(prev->__state); 5233 vtime_task_switch(prev); 5234 perf_event_task_sched_in(prev, current); 5235 finish_task(prev); 5236 tick_nohz_task_switch(); 5237 finish_lock_switch(rq); 5238 finish_arch_post_lock_switch(); 5239 kcov_finish_switch(current); 5240 /* 5241 * kmap_local_sched_out() is invoked with rq::lock held and 5242 * interrupts disabled. There is no requirement for that, but the 5243 * sched out code does not have an interrupt enabled section. 5244 * Restoring the maps on sched in does not require interrupts being 5245 * disabled either. 5246 */ 5247 kmap_local_sched_in(); 5248 5249 fire_sched_in_preempt_notifiers(current); 5250 /* 5251 * When switching through a kernel thread, the loop in 5252 * membarrier_{private,global}_expedited() may have observed that 5253 * kernel thread and not issued an IPI. It is therefore possible to 5254 * schedule between user->kernel->user threads without passing through 5255 * switch_mm(). Membarrier requires a barrier after storing to 5256 * rq->curr, before returning to userspace, so provide them here: 5257 * 5258 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly 5259 * provided by mmdrop_lazy_tlb(), 5260 * - a sync_core for SYNC_CORE. 5261 */ 5262 if (mm) { 5263 membarrier_mm_sync_core_before_usermode(mm); 5264 mmdrop_lazy_tlb_sched(mm); 5265 } 5266 5267 if (unlikely(prev_state == TASK_DEAD)) { 5268 if (prev->sched_class->task_dead) 5269 prev->sched_class->task_dead(prev); 5270 5271 /* 5272 * sched_ext_dead() must come before cgroup_task_dead() to 5273 * prevent cgroups from being removed while their member tasks are 5274 * still visible to SCX schedulers. 5275 */ 5276 sched_ext_dead(prev); 5277 cgroup_task_dead(prev); 5278 5279 /* Task is done with its stack. */ 5280 put_task_stack(prev); 5281 5282 put_task_struct_rcu_user(prev); 5283 } 5284 5285 return rq; 5286 } 5287 5288 /** 5289 * schedule_tail - first thing a freshly forked thread must call. 5290 * @prev: the thread we just switched away from. 5291 */ 5292 asmlinkage __visible void schedule_tail(struct task_struct *prev) 5293 __releases(__rq_lockp(this_rq())) 5294 { 5295 /* 5296 * New tasks start with FORK_PREEMPT_COUNT, see there and 5297 * finish_task_switch() for details.
5298 * 5299 * finish_task_switch() will drop rq->lock and lower preempt_count, 5300 * and the preempt_enable() will end up enabling preemption (on 5301 * PREEMPT_COUNT kernels). 5302 */ 5303 5304 finish_task_switch(prev); 5305 /* 5306 * This is a special case: the newly created task has just 5307 * switched the context for the first time. It is returning from 5308 * schedule() for the first time in this path. 5309 */ 5310 trace_sched_exit_tp(true); 5311 preempt_enable(); 5312 5313 if (current->set_child_tid) 5314 put_user(task_pid_vnr(current), current->set_child_tid); 5315 5316 calculate_sigpending(); 5317 } 5318 5319 /* 5320 * context_switch - switch to the new MM and the new thread's register state. 5321 */ 5322 static __always_inline struct rq * 5323 context_switch(struct rq *rq, struct task_struct *prev, 5324 struct task_struct *next, struct rq_flags *rf) 5325 __releases(__rq_lockp(rq)) 5326 { 5327 prepare_task_switch(rq, prev, next); 5328 5329 /* 5330 * For paravirt, this is coupled with an exit in switch_to to 5331 * combine the page table reload and the switch backend into 5332 * one hypercall. 5333 */ 5334 arch_start_context_switch(prev); 5335 5336 /* 5337 * kernel -> kernel lazy + transfer active 5338 * user -> kernel lazy + mmgrab_lazy_tlb() active 5339 * 5340 * kernel -> user switch + mmdrop_lazy_tlb() active 5341 * user -> user switch 5342 */ 5343 if (!next->mm) { // to kernel 5344 enter_lazy_tlb(prev->active_mm, next); 5345 5346 next->active_mm = prev->active_mm; 5347 if (prev->mm) // from user 5348 mmgrab_lazy_tlb(prev->active_mm); 5349 else 5350 prev->active_mm = NULL; 5351 } else { // to user 5352 membarrier_switch_mm(rq, prev->active_mm, next->mm); 5353 /* 5354 * sys_membarrier() requires an smp_mb() between setting 5355 * rq->curr / membarrier_switch_mm() and returning to userspace. 5356 * 5357 * The below provides this either through switch_mm(), or in 5358 * case 'prev->active_mm == next->mm' through 5359 * finish_task_switch()'s mmdrop(). 5360 */ 5361 switch_mm_irqs_off(prev->active_mm, next->mm, next); 5362 lru_gen_use_mm(next->mm); 5363 5364 if (!prev->mm) { // from kernel 5365 /* will mmdrop_lazy_tlb() in finish_task_switch(). */ 5366 rq->prev_mm = prev->active_mm; 5367 prev->active_mm = NULL; 5368 } 5369 } 5370 5371 mm_cid_switch_to(prev, next); 5372 5373 /* 5374 * Tell rseq that the task was scheduled in. Must be after 5375 * mm_cid_switch_to() to get the TIF flag set. 5376 */ 5377 rseq_sched_switch_event(next); 5378 5379 prepare_lock_switch(rq, next, rf); 5380 5381 /* Here we just switch the register state and the stack. */ 5382 switch_to(prev, next, prev); 5383 barrier(); 5384 5385 return finish_task_switch(prev); 5386 } 5387 5388 /* 5389 * nr_running and nr_context_switches: 5390 * 5391 * externally visible scheduler statistics: current number of runnable 5392 * threads, total number of context switches performed since bootup. 5393 */ 5394 unsigned int nr_running(void) 5395 { 5396 unsigned int i, sum = 0; 5397 5398 for_each_online_cpu(i) 5399 sum += cpu_rq(i)->nr_running; 5400 5401 return sum; 5402 } 5403 5404 /* 5405 * Check if only the current task is running on the CPU. 5406 * 5407 * Caution: this function does not check that the caller has disabled 5408 * preemption, thus the result might have a time-of-check-to-time-of-use 5409 * race.
The caller is responsible for using it correctly, for example: 5410 * 5411 * - from a non-preemptible section (of course) 5412 * 5413 * - from a thread that is bound to a single CPU 5414 * 5415 * - in a loop with very short iterations (e.g. a polling loop) 5416 */ 5417 bool single_task_running(void) 5418 { 5419 return raw_rq()->nr_running == 1; 5420 } 5421 EXPORT_SYMBOL(single_task_running); 5422 5423 unsigned long long nr_context_switches_cpu(int cpu) 5424 { 5425 return cpu_rq(cpu)->nr_switches; 5426 } 5427 5428 unsigned long long nr_context_switches(void) 5429 { 5430 int i; 5431 unsigned long long sum = 0; 5432 5433 for_each_possible_cpu(i) 5434 sum += cpu_rq(i)->nr_switches; 5435 5436 return sum; 5437 } 5438 5439 /* 5440 * Consumers of these two interfaces, like for example the cpuidle menu 5441 * governor, are using nonsensical data: they prefer shallow idle state 5442 * selection for a CPU that has IO-wait, even though the blocked task might 5443 * not even end up running there when it does become runnable. 5444 */ 5445 5446 unsigned int nr_iowait_cpu(int cpu) 5447 { 5448 return atomic_read(&cpu_rq(cpu)->nr_iowait); 5449 } 5450 5451 /* 5452 * IO-wait accounting, and how it's mostly bollocks (on SMP). 5453 * 5454 * The idea behind IO-wait accounting is to account the idle time that we could 5455 * have spent running if it were not for IO. That is, if we were to improve the 5456 * storage performance, we'd have a proportional reduction in IO-wait time. 5457 * 5458 * This all works nicely on UP, where, when a task blocks on IO, we account 5459 * idle time as IO-wait, because if the storage were faster, it could've been 5460 * running and we'd not be idle. 5461 * 5462 * This has been extended to SMP, by doing the same for each CPU. This however 5463 * is broken. 5464 * 5465 * Imagine for instance the case where two tasks block on one CPU: only that 5466 * one CPU will have IO-wait accounted, while the other has regular idle time. 5467 * Even though, if the storage were faster, both could've run at the same 5468 * time, utilising both CPUs. 5469 * 5470 * This means that, when looking globally, the current IO-wait accounting on 5471 * SMP is a lower bound, due to under-accounting. 5472 * 5473 * Worse, since the numbers are provided per CPU, they are sometimes 5474 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly 5475 * associated with any one particular CPU; it can wake up on a different CPU 5476 * than the one it blocked on. This means the per-CPU IO-wait number is 5477 * meaningless. 5478 * 5479 * Task CPU affinities can make all that even more 'interesting'. 5480 */ 5481 unsigned int nr_iowait(void) 5482 { 5483 unsigned int i, sum = 0; 5484 5485 for_each_possible_cpu(i) 5486 sum += nr_iowait_cpu(i); 5487 5488 return sum; 5489 } 5490 5491 /* 5492 * sched_exec - execve() is a valuable balancing opportunity, because at 5493 * this point the task has the smallest effective memory and cache footprint.
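* If the chosen destination CPU differs from the current one, the task migrates itself via stop_one_cpu()/migration_cpu_stop() below, before the new program's address space is populated.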
5494 */ 5495 void sched_exec(void) 5496 { 5497 struct task_struct *p = current; 5498 struct migration_arg arg; 5499 int dest_cpu; 5500 5501 scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { 5502 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); 5503 if (dest_cpu == smp_processor_id()) 5504 return; 5505 5506 if (unlikely(!cpu_active(dest_cpu))) 5507 return; 5508 5509 arg = (struct migration_arg){ p, dest_cpu }; 5510 } 5511 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 5512 } 5513 5514 DEFINE_PER_CPU(struct kernel_stat, kstat); 5515 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 5516 5517 EXPORT_PER_CPU_SYMBOL(kstat); 5518 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 5519 5520 /* 5521 * The function fair_sched_class.update_curr accesses the struct curr 5522 * and its field curr->exec_start; when called from task_sched_runtime(), 5523 * we observe a high rate of cache misses in practice. 5524 * Prefetching this data results in improved performance. 5525 */ 5526 static inline void prefetch_curr_exec_start(struct task_struct *p) 5527 { 5528 #ifdef CONFIG_FAIR_GROUP_SCHED 5529 struct sched_entity *curr = p->se.cfs_rq->curr; 5530 #else 5531 struct sched_entity *curr = task_rq(p)->cfs.curr; 5532 #endif 5533 prefetch(curr); 5534 prefetch(&curr->exec_start); 5535 } 5536 5537 /* 5538 * Return accounted runtime for the task. 5539 * In case the task is currently running, return the runtime plus the 5540 * currently pending runtime that has not been accounted yet. 5541 */ 5542 unsigned long long task_sched_runtime(struct task_struct *p) 5543 { 5544 struct rq_flags rf; 5545 struct rq *rq; 5546 u64 ns; 5547 5548 #ifdef CONFIG_64BIT 5549 /* 5550 * 64-bit doesn't need locks to atomically read a 64-bit value. 5551 * So we have an optimization chance when the task's delta_exec is 0. 5552 * Reading ->on_cpu is racy, but this is OK. 5553 * 5554 * If we race with it leaving the CPU, we'll take a lock. So we're correct. 5555 * If we race with it entering the CPU, unaccounted time is 0. This is 5556 * indistinguishable from the read occurring a few cycles earlier. 5557 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 5558 * been accounted, so we're correct here as well. 5559 */ 5560 if (!p->on_cpu || !task_on_rq_queued(p)) 5561 return p->se.sum_exec_runtime; 5562 #endif 5563 5564 rq = task_rq_lock(p, &rf); 5565 /* 5566 * Must be ->curr _and_ ->on_rq. If dequeued, we would 5567 * project cycles that may never be accounted to this 5568 * thread, breaking clock_gettime().
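* (task_sched_runtime() sits, for instance, underneath the posix-cpu-timers code backing clock_gettime(CLOCK_THREAD_CPUTIME_ID), so over-projecting runtime here would be directly visible to userspace.)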
5569 */ 5570 if (task_current_donor(rq, p) && task_on_rq_queued(p)) { 5571 prefetch_curr_exec_start(p); 5572 update_rq_clock(rq); 5573 p->sched_class->update_curr(rq); 5574 } 5575 ns = p->se.sum_exec_runtime; 5576 task_rq_unlock(rq, p, &rf); 5577 5578 return ns; 5579 } 5580 5581 static u64 cpu_resched_latency(struct rq *rq) 5582 { 5583 int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); 5584 u64 resched_latency, now = rq_clock(rq); 5585 static bool warned_once; 5586 5587 if (sysctl_resched_latency_warn_once && warned_once) 5588 return 0; 5589 5590 if (!need_resched() || !latency_warn_ms) 5591 return 0; 5592 5593 if (system_state == SYSTEM_BOOTING) 5594 return 0; 5595 5596 if (!rq->last_seen_need_resched_ns) { 5597 rq->last_seen_need_resched_ns = now; 5598 rq->ticks_without_resched = 0; 5599 return 0; 5600 } 5601 5602 rq->ticks_without_resched++; 5603 resched_latency = now - rq->last_seen_need_resched_ns; 5604 if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC) 5605 return 0; 5606 5607 warned_once = true; 5608 5609 return resched_latency; 5610 } 5611 5612 static int __init setup_resched_latency_warn_ms(char *str) 5613 { 5614 long val; 5615 5616 if ((kstrtol(str, 0, &val))) { 5617 pr_warn("Unable to set resched_latency_warn_ms\n"); 5618 return 1; 5619 } 5620 5621 sysctl_resched_latency_warn_ms = val; 5622 return 1; 5623 } 5624 __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); 5625 5626 /* 5627 * This function gets called by the timer code, with HZ frequency. 5628 * We call it with interrupts disabled. 5629 */ 5630 void sched_tick(void) 5631 { 5632 int cpu = smp_processor_id(); 5633 struct rq *rq = cpu_rq(cpu); 5634 /* accounting goes to the donor task */ 5635 struct task_struct *donor; 5636 struct rq_flags rf; 5637 unsigned long hw_pressure; 5638 u64 resched_latency; 5639 5640 if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) 5641 arch_scale_freq_tick(); 5642 5643 sched_clock_tick(); 5644 5645 rq_lock(rq, &rf); 5646 donor = rq->donor; 5647 5648 psi_account_irqtime(rq, donor, NULL); 5649 5650 update_rq_clock(rq); 5651 hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); 5652 update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); 5653 5654 if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) 5655 resched_curr(rq); 5656 5657 donor->sched_class->task_tick(rq, donor, 0); 5658 if (sched_feat(LATENCY_WARN)) 5659 resched_latency = cpu_resched_latency(rq); 5660 calc_global_load_tick(rq); 5661 sched_core_tick(rq); 5662 scx_tick(rq); 5663 5664 rq_unlock(rq, &rf); 5665 5666 if (sched_feat(LATENCY_WARN) && resched_latency) 5667 resched_latency_warn(cpu, resched_latency); 5668 5669 perf_event_task_tick(); 5670 5671 if (donor->flags & PF_WQ_WORKER) 5672 wq_worker_tick(donor); 5673 5674 if (!scx_switched_all()) { 5675 rq->idle_balance = idle_cpu(cpu); 5676 sched_balance_trigger(rq); 5677 } 5678 } 5679 5680 #ifdef CONFIG_NO_HZ_FULL 5681 5682 struct tick_work { 5683 int cpu; 5684 atomic_t state; 5685 struct delayed_work work; 5686 }; 5687 /* Values for ->state, see diagram below. 
*/ 5688 #define TICK_SCHED_REMOTE_OFFLINE 0 5689 #define TICK_SCHED_REMOTE_OFFLINING 1 5690 #define TICK_SCHED_REMOTE_RUNNING 2 5691 5692 /* 5693 * State diagram for ->state: 5694 * 5695 * 5696 * TICK_SCHED_REMOTE_OFFLINE 5697 * | ^ 5698 * | | 5699 * | | sched_tick_remote() 5700 * | | 5701 * | | 5702 * +--TICK_SCHED_REMOTE_OFFLINING 5703 * | ^ 5704 * | | 5705 * sched_tick_start() | | sched_tick_stop() 5706 * | | 5707 * V | 5708 * TICK_SCHED_REMOTE_RUNNING 5709 * 5710 * 5711 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() 5712 * and sched_tick_start() are happy to leave the state in RUNNING. 5713 */ 5714 5715 static struct tick_work __percpu *tick_work_cpu; 5716 5717 static void sched_tick_remote(struct work_struct *work) 5718 { 5719 struct delayed_work *dwork = to_delayed_work(work); 5720 struct tick_work *twork = container_of(dwork, struct tick_work, work); 5721 int cpu = twork->cpu; 5722 struct rq *rq = cpu_rq(cpu); 5723 int os; 5724 5725 /* 5726 * Handle the tick only if it appears the remote CPU is running in full 5727 * dynticks mode. The check is racy by nature, but missing a tick or 5728 * having one too many is no big deal because the scheduler tick updates 5729 * statistics and checks timeslices in a time-independent way, regardless 5730 * of when exactly it is running. 5731 */ 5732 if (tick_nohz_tick_stopped_cpu(cpu)) { 5733 guard(rq_lock_irq)(rq); 5734 struct task_struct *curr = rq->curr; 5735 5736 if (cpu_online(cpu)) { 5737 /* 5738 * Since this is a remote tick for full dynticks mode, 5739 * we are always sure that there is no proxy (only a 5740 * single task is running). 5741 */ 5742 WARN_ON_ONCE(rq->curr != rq->donor); 5743 update_rq_clock(rq); 5744 5745 if (!is_idle_task(curr)) { 5746 /* 5747 * Make sure the next tick runs within a 5748 * reasonable amount of time. 5749 */ 5750 u64 delta = rq_clock_task(rq) - curr->se.exec_start; 5751 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); 5752 } 5753 curr->sched_class->task_tick(rq, curr, 0); 5754 5755 calc_load_nohz_remote(rq); 5756 } 5757 } 5758 5759 /* 5760 * Run the remote tick once per second (1Hz). This arbitrary 5761 * frequency is low enough to avoid overload but high enough 5762 * to keep scheduler internal stats reasonably up to date. But 5763 * first update state to reflect hotplug activity if required. 5764 */ 5765 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); 5766 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); 5767 if (os == TICK_SCHED_REMOTE_RUNNING) 5768 queue_delayed_work(system_dfl_wq, dwork, HZ); 5769 } 5770 5771 static void sched_tick_start(int cpu) 5772 { 5773 int os; 5774 struct tick_work *twork; 5775 5776 if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) 5777 return; 5778 5779 WARN_ON_ONCE(!tick_work_cpu); 5780 5781 twork = per_cpu_ptr(tick_work_cpu, cpu); 5782 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); 5783 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); 5784 if (os == TICK_SCHED_REMOTE_OFFLINE) { 5785 twork->cpu = cpu; 5786 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); 5787 queue_delayed_work(system_dfl_wq, &twork->work, HZ); 5788 } 5789 } 5790 5791 #ifdef CONFIG_HOTPLUG_CPU 5792 static void sched_tick_stop(int cpu) 5793 { 5794 struct tick_work *twork; 5795 int os; 5796 5797 if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) 5798 return; 5799 5800 WARN_ON_ONCE(!tick_work_cpu); 5801 5802 twork = per_cpu_ptr(tick_work_cpu, cpu); 5803 /* There cannot be competing actions, but don't rely on stop-machine.
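* Setting OFFLINING rather than OFFLINE lets a still-queued sched_tick_remote() run one final time: its atomic_fetch_add_unless() then steps the state down to OFFLINE and, since the old state was not RUNNING, the work is not re-queued and dies out on its own.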
*/ 5804 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); 5805 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); 5806 /* Don't cancel, as this would mess up the state machine. */ 5807 } 5808 #endif /* CONFIG_HOTPLUG_CPU */ 5809 5810 int __init sched_tick_offload_init(void) 5811 { 5812 tick_work_cpu = alloc_percpu(struct tick_work); 5813 BUG_ON(!tick_work_cpu); 5814 return 0; 5815 } 5816 5817 #else /* !CONFIG_NO_HZ_FULL: */ 5818 static inline void sched_tick_start(int cpu) { } 5819 static inline void sched_tick_stop(int cpu) { } 5820 #endif /* !CONFIG_NO_HZ_FULL */ 5821 5822 #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ 5823 defined(CONFIG_TRACE_PREEMPT_TOGGLE)) 5824 /* 5825 * If the value passed in is equal to the current preempt count 5826 * then we just disabled preemption. Start timing the latency. 5827 */ 5828 static inline void preempt_latency_start(int val) 5829 { 5830 if (preempt_count() == val) { 5831 unsigned long ip = get_lock_parent_ip(); 5832 #ifdef CONFIG_DEBUG_PREEMPT 5833 current->preempt_disable_ip = ip; 5834 #endif 5835 trace_preempt_off(CALLER_ADDR0, ip); 5836 } 5837 } 5838 5839 void preempt_count_add(int val) 5840 { 5841 #ifdef CONFIG_DEBUG_PREEMPT 5842 /* 5843 * Underflow? 5844 */ 5845 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 5846 return; 5847 #endif 5848 __preempt_count_add(val); 5849 #ifdef CONFIG_DEBUG_PREEMPT 5850 /* 5851 * Spinlock count overflowing soon? 5852 */ 5853 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 5854 PREEMPT_MASK - 10); 5855 #endif 5856 preempt_latency_start(val); 5857 } 5858 EXPORT_SYMBOL(preempt_count_add); 5859 NOKPROBE_SYMBOL(preempt_count_add); 5860 5861 /* 5862 * If the value passed in equals to the current preempt count 5863 * then we just enabled preemption. Stop timing the latency. 5864 */ 5865 static inline void preempt_latency_stop(int val) 5866 { 5867 if (preempt_count() == val) 5868 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); 5869 } 5870 5871 void preempt_count_sub(int val) 5872 { 5873 #ifdef CONFIG_DEBUG_PREEMPT 5874 /* 5875 * Underflow? 5876 */ 5877 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 5878 return; 5879 /* 5880 * Is the spinlock portion underflowing? 
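	 * I.e. the preemption/spinlock count occupies the low PREEMPT_MASK
	 * bits of preempt_count(); subtracting a partial value
	 * (val < PREEMPT_MASK) while those bits are already zero would wrap
	 * them.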
5881 */ 5882 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 5883 !(preempt_count() & PREEMPT_MASK))) 5884 return; 5885 #endif 5886 5887 preempt_latency_stop(val); 5888 __preempt_count_sub(val); 5889 } 5890 EXPORT_SYMBOL(preempt_count_sub); 5891 NOKPROBE_SYMBOL(preempt_count_sub); 5892 5893 #else 5894 static inline void preempt_latency_start(int val) { } 5895 static inline void preempt_latency_stop(int val) { } 5896 #endif 5897 5898 static inline unsigned long get_preempt_disable_ip(struct task_struct *p) 5899 { 5900 #ifdef CONFIG_DEBUG_PREEMPT 5901 return p->preempt_disable_ip; 5902 #else 5903 return 0; 5904 #endif 5905 } 5906 5907 /* 5908 * Print scheduling while atomic bug: 5909 */ 5910 static noinline void __schedule_bug(struct task_struct *prev) 5911 { 5912 /* Save this before calling printk(), since that will clobber it */ 5913 unsigned long preempt_disable_ip = get_preempt_disable_ip(current); 5914 5915 if (oops_in_progress) 5916 return; 5917 5918 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 5919 prev->comm, prev->pid, preempt_count()); 5920 5921 debug_show_held_locks(prev); 5922 print_modules(); 5923 if (irqs_disabled()) 5924 print_irqtrace_events(prev); 5925 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { 5926 pr_err("Preemption disabled at:"); 5927 print_ip_sym(KERN_ERR, preempt_disable_ip); 5928 } 5929 check_panic_on_warn("scheduling while atomic"); 5930 5931 dump_stack(); 5932 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 5933 } 5934 5935 /* 5936 * Various schedule()-time debugging checks and statistics: 5937 */ 5938 static inline void schedule_debug(struct task_struct *prev, bool preempt) 5939 { 5940 #ifdef CONFIG_SCHED_STACK_END_CHECK 5941 if (task_stack_end_corrupted(prev)) 5942 panic("corrupted stack end detected inside scheduler\n"); 5943 5944 if (task_scs_end_corrupted(prev)) 5945 panic("corrupted shadow stack detected inside scheduler\n"); 5946 #endif 5947 5948 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 5949 if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { 5950 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", 5951 prev->comm, prev->pid, prev->non_block_count); 5952 dump_stack(); 5953 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 5954 } 5955 #endif 5956 5957 if (unlikely(in_atomic_preempt_off())) { 5958 __schedule_bug(prev); 5959 preempt_count_set(PREEMPT_DISABLED); 5960 } 5961 rcu_sleep_check(); 5962 WARN_ON_ONCE(ct_state() == CT_STATE_USER); 5963 5964 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 5965 5966 schedstat_inc(this_rq()->sched_count); 5967 } 5968 5969 static void prev_balance(struct rq *rq, struct task_struct *prev, 5970 struct rq_flags *rf) 5971 { 5972 const struct sched_class *start_class = prev->sched_class; 5973 const struct sched_class *class; 5974 5975 /* 5976 * We must do the balancing pass before put_prev_task(), such 5977 * that when we release the rq->lock the task is in the same 5978 * state as before we took rq->lock. 5979 * 5980 * We can terminate the balance pass as soon as we know there is 5981 * a runnable task of @class priority or higher. 
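	 *
	 * Illustrative example: if @prev is an RT task and rt's ->balance()
	 * reports a runnable RT task, nothing a lower class could pull would
	 * get to run anyway, so the walk stops before fair and idle.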
5982 */ 5983 for_active_class_range(class, start_class, &idle_sched_class) { 5984 if (class->balance && class->balance(rq, prev, rf)) 5985 break; 5986 } 5987 } 5988 5989 /* 5990 * Pick up the highest-prio task: 5991 */ 5992 static inline struct task_struct * 5993 __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 5994 __must_hold(__rq_lockp(rq)) 5995 { 5996 const struct sched_class *class; 5997 struct task_struct *p; 5998 5999 rq->dl_server = NULL; 6000 6001 if (scx_enabled()) 6002 goto restart; 6003 6004 /* 6005 * Optimization: we know that if all tasks are in the fair class we can 6006 * call that function directly, but only if the @prev task wasn't of a 6007 * higher scheduling class, because otherwise those lose the 6008 * opportunity to pull in more work from other CPUs. 6009 */ 6010 if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && 6011 rq->nr_running == rq->cfs.h_nr_queued)) { 6012 6013 p = pick_next_task_fair(rq, prev, rf); 6014 if (unlikely(p == RETRY_TASK)) 6015 goto restart; 6016 6017 /* Assume the next prioritized class is idle_sched_class */ 6018 if (!p) { 6019 p = pick_task_idle(rq, rf); 6020 put_prev_set_next_task(rq, prev, p); 6021 } 6022 6023 return p; 6024 } 6025 6026 restart: 6027 prev_balance(rq, prev, rf); 6028 6029 for_each_active_class(class) { 6030 if (class->pick_next_task) { 6031 p = class->pick_next_task(rq, prev, rf); 6032 if (unlikely(p == RETRY_TASK)) 6033 goto restart; 6034 if (p) 6035 return p; 6036 } else { 6037 p = class->pick_task(rq, rf); 6038 if (unlikely(p == RETRY_TASK)) 6039 goto restart; 6040 if (p) { 6041 put_prev_set_next_task(rq, prev, p); 6042 return p; 6043 } 6044 } 6045 } 6046 6047 BUG(); /* The idle class should always have a runnable task. */ 6048 } 6049 6050 #ifdef CONFIG_SCHED_CORE 6051 static inline bool is_task_rq_idle(struct task_struct *t) 6052 { 6053 return (task_rq(t)->idle == t); 6054 } 6055 6056 static inline bool cookie_equals(struct task_struct *a, unsigned long cookie) 6057 { 6058 return is_task_rq_idle(a) || (a->core_cookie == cookie); 6059 } 6060 6061 static inline bool cookie_match(struct task_struct *a, struct task_struct *b) 6062 { 6063 if (is_task_rq_idle(a) || is_task_rq_idle(b)) 6064 return true; 6065 6066 return a->core_cookie == b->core_cookie; 6067 } 6068 6069 /* 6070 * Careful; this can return RETRY_TASK, it does not include the retry-loop 6071 * itself due to the whole SMT pick retry thing below. 6072 */ 6073 static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) 6074 { 6075 const struct sched_class *class; 6076 struct task_struct *p; 6077 6078 rq->dl_server = NULL; 6079 6080 for_each_active_class(class) { 6081 p = class->pick_task(rq, rf); 6082 if (p) 6083 return p; 6084 } 6085 6086 BUG(); /* The idle class should always have a runnable task. 
*/ 6087 } 6088 6089 extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); 6090 6091 static void queue_core_balance(struct rq *rq); 6092 6093 static struct task_struct * 6094 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 6095 __must_hold(__rq_lockp(rq)) 6096 { 6097 struct task_struct *next, *p, *max; 6098 const struct cpumask *smt_mask; 6099 bool fi_before = false; 6100 bool core_clock_updated = (rq == rq->core); 6101 unsigned long cookie; 6102 int i, cpu, occ = 0; 6103 struct rq *rq_i; 6104 bool need_sync; 6105 6106 if (!sched_core_enabled(rq)) 6107 return __pick_next_task(rq, prev, rf); 6108 6109 cpu = cpu_of(rq); 6110 6111 /* Stopper task is switching into idle, no need core-wide selection. */ 6112 if (cpu_is_offline(cpu)) { 6113 /* 6114 * Reset core_pick so that we don't enter the fastpath when 6115 * coming online. core_pick would already be migrated to 6116 * another cpu during offline. 6117 */ 6118 rq->core_pick = NULL; 6119 rq->core_dl_server = NULL; 6120 return __pick_next_task(rq, prev, rf); 6121 } 6122 6123 /* 6124 * If there were no {en,de}queues since we picked (IOW, the task 6125 * pointers are all still valid), and we haven't scheduled the last 6126 * pick yet, do so now. 6127 * 6128 * rq->core_pick can be NULL if no selection was made for a CPU because 6129 * it was either offline or went offline during a sibling's core-wide 6130 * selection. In this case, do a core-wide selection. 6131 */ 6132 if (rq->core->core_pick_seq == rq->core->core_task_seq && 6133 rq->core->core_pick_seq != rq->core_sched_seq && 6134 rq->core_pick) { 6135 WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); 6136 6137 next = rq->core_pick; 6138 rq->dl_server = rq->core_dl_server; 6139 rq->core_pick = NULL; 6140 rq->core_dl_server = NULL; 6141 goto out_set_next; 6142 } 6143 6144 prev_balance(rq, prev, rf); 6145 6146 smt_mask = cpu_smt_mask(cpu); 6147 need_sync = !!rq->core->core_cookie; 6148 6149 /* reset state */ 6150 rq->core->core_cookie = 0UL; 6151 if (rq->core->core_forceidle_count) { 6152 if (!core_clock_updated) { 6153 update_rq_clock(rq->core); 6154 core_clock_updated = true; 6155 } 6156 sched_core_account_forceidle(rq); 6157 /* reset after accounting force idle */ 6158 rq->core->core_forceidle_start = 0; 6159 rq->core->core_forceidle_count = 0; 6160 rq->core->core_forceidle_occupation = 0; 6161 need_sync = true; 6162 fi_before = true; 6163 } 6164 6165 /* 6166 * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq 6167 * 6168 * @task_seq guards the task state ({en,de}queues) 6169 * @pick_seq is the @task_seq we did a selection on 6170 * @sched_seq is the @pick_seq we scheduled 6171 * 6172 * However, preemptions can cause multiple picks on the same task set. 6173 * 'Fix' this by also increasing @task_seq for every pick. 6174 */ 6175 rq->core->core_task_seq++; 6176 6177 /* 6178 * Optimize for common case where this CPU has no cookies 6179 * and there are no cookied tasks running on siblings. 6180 */ 6181 if (!need_sync) { 6182 restart_single: 6183 next = pick_task(rq, rf); 6184 if (unlikely(next == RETRY_TASK)) 6185 goto restart_single; 6186 if (!next->core_cookie) { 6187 rq->core_pick = NULL; 6188 rq->core_dl_server = NULL; 6189 /* 6190 * For robustness, update the min_vruntime_fi for 6191 * unconstrained picks as well. 
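			 * (An unconstrained pick: no cookied task was
			 * involved, so no core-wide sync was required.)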
6192 */ 6193 WARN_ON_ONCE(fi_before); 6194 task_vruntime_update(rq, next, false); 6195 goto out_set_next; 6196 } 6197 } 6198 6199 /* 6200 * For each thread: do the regular task pick and find the max prio task 6201 * amongst them. 6202 * 6203 * Tie-break prio towards the current CPU 6204 */ 6205 restart_multi: 6206 max = NULL; 6207 for_each_cpu_wrap(i, smt_mask, cpu) { 6208 rq_i = cpu_rq(i); 6209 6210 /* 6211 * Current cpu always has its clock updated on entrance to 6212 * pick_next_task(). If the current cpu is not the core, 6213 * the core may also have been updated above. 6214 */ 6215 if (i != cpu && (rq_i != rq->core || !core_clock_updated)) 6216 update_rq_clock(rq_i); 6217 6218 p = pick_task(rq_i, rf); 6219 if (unlikely(p == RETRY_TASK)) 6220 goto restart_multi; 6221 6222 rq_i->core_pick = p; 6223 rq_i->core_dl_server = rq_i->dl_server; 6224 6225 if (!max || prio_less(max, p, fi_before)) 6226 max = p; 6227 } 6228 6229 cookie = rq->core->core_cookie = max->core_cookie; 6230 6231 /* 6232 * For each thread: try and find a runnable task that matches @max or 6233 * force idle. 6234 */ 6235 for_each_cpu(i, smt_mask) { 6236 rq_i = cpu_rq(i); 6237 p = rq_i->core_pick; 6238 6239 if (!cookie_equals(p, cookie)) { 6240 p = NULL; 6241 if (cookie) 6242 p = sched_core_find(rq_i, cookie); 6243 if (!p) 6244 p = idle_sched_class.pick_task(rq_i, rf); 6245 } 6246 6247 rq_i->core_pick = p; 6248 rq_i->core_dl_server = NULL; 6249 6250 if (p == rq_i->idle) { 6251 if (rq_i->nr_running) { 6252 rq->core->core_forceidle_count++; 6253 if (!fi_before) 6254 rq->core->core_forceidle_seq++; 6255 } 6256 } else { 6257 occ++; 6258 } 6259 } 6260 6261 if (schedstat_enabled() && rq->core->core_forceidle_count) { 6262 rq->core->core_forceidle_start = rq_clock(rq->core); 6263 rq->core->core_forceidle_occupation = occ; 6264 } 6265 6266 rq->core->core_pick_seq = rq->core->core_task_seq; 6267 next = rq->core_pick; 6268 rq->core_sched_seq = rq->core->core_pick_seq; 6269 6270 /* Something should have been selected for current CPU */ 6271 WARN_ON_ONCE(!next); 6272 6273 /* 6274 * Reschedule siblings 6275 * 6276 * NOTE: L1TF -- at this point we're no longer running the old task and 6277 * sending an IPI (below) ensures the sibling will no longer be running 6278 * their task. This ensures there is no inter-sibling overlap between 6279 * non-matching user state. 6280 */ 6281 for_each_cpu(i, smt_mask) { 6282 rq_i = cpu_rq(i); 6283 6284 /* 6285 * An online sibling might have gone offline before a task 6286 * could be picked for it, or it might be offline but later 6287 * happen to come online, but its too late and nothing was 6288 * picked for it. That's Ok - it will pick tasks for itself, 6289 * so ignore it. 6290 */ 6291 if (!rq_i->core_pick) 6292 continue; 6293 6294 /* 6295 * Update for new !FI->FI transitions, or if continuing to be in !FI: 6296 * fi_before fi update? 6297 * 0 0 1 6298 * 0 1 1 6299 * 1 0 1 6300 * 1 1 0 6301 */ 6302 if (!(fi_before && rq->core->core_forceidle_count)) 6303 task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); 6304 6305 rq_i->core_pick->core_occupation = occ; 6306 6307 if (i == cpu) { 6308 rq_i->core_pick = NULL; 6309 rq_i->core_dl_server = NULL; 6310 continue; 6311 } 6312 6313 /* Did we break L1TF mitigation requirements? 
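		 * I.e. every sibling's pick must share @next's cookie (or be
		 * the idle task); anything else would co-schedule mutually
		 * untrusted tasks on the same core.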
*/ 6314 WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); 6315 6316 if (rq_i->curr == rq_i->core_pick) { 6317 rq_i->core_pick = NULL; 6318 rq_i->core_dl_server = NULL; 6319 continue; 6320 } 6321 6322 resched_curr(rq_i); 6323 } 6324 6325 out_set_next: 6326 put_prev_set_next_task(rq, prev, next); 6327 if (rq->core->core_forceidle_count && next == rq->idle) 6328 queue_core_balance(rq); 6329 6330 return next; 6331 } 6332 6333 static bool try_steal_cookie(int this, int that) 6334 { 6335 struct rq *dst = cpu_rq(this), *src = cpu_rq(that); 6336 struct task_struct *p; 6337 unsigned long cookie; 6338 bool success = false; 6339 6340 guard(irq)(); 6341 guard(double_rq_lock)(dst, src); 6342 6343 cookie = dst->core->core_cookie; 6344 if (!cookie) 6345 return false; 6346 6347 if (dst->curr != dst->idle) 6348 return false; 6349 6350 p = sched_core_find(src, cookie); 6351 if (!p) 6352 return false; 6353 6354 do { 6355 if (p == src->core_pick || p == src->curr) 6356 goto next; 6357 6358 if (!is_cpu_allowed(p, this)) 6359 goto next; 6360 6361 if (p->core_occupation > dst->idle->core_occupation) 6362 goto next; 6363 /* 6364 * sched_core_find() and sched_core_next() will ensure 6365 * that task @p is not throttled now, we also need to 6366 * check whether the runqueue of the destination CPU is 6367 * being throttled. 6368 */ 6369 if (sched_task_is_throttled(p, this)) 6370 goto next; 6371 6372 move_queued_task_locked(src, dst, p); 6373 resched_curr(dst); 6374 6375 success = true; 6376 break; 6377 6378 next: 6379 p = sched_core_next(p, cookie); 6380 } while (p); 6381 6382 return success; 6383 } 6384 6385 static bool steal_cookie_task(int cpu, struct sched_domain *sd) 6386 { 6387 int i; 6388 6389 for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) { 6390 if (i == cpu) 6391 continue; 6392 6393 if (need_resched()) 6394 break; 6395 6396 if (try_steal_cookie(cpu, i)) 6397 return true; 6398 } 6399 6400 return false; 6401 } 6402 6403 static void sched_core_balance(struct rq *rq) 6404 __must_hold(__rq_lockp(rq)) 6405 { 6406 struct sched_domain *sd; 6407 int cpu = cpu_of(rq); 6408 6409 guard(preempt)(); 6410 guard(rcu)(); 6411 6412 raw_spin_rq_unlock_irq(rq); 6413 for_each_domain(cpu, sd) { 6414 if (need_resched()) 6415 break; 6416 6417 if (steal_cookie_task(cpu, sd)) 6418 break; 6419 } 6420 raw_spin_rq_lock_irq(rq); 6421 } 6422 6423 static DEFINE_PER_CPU(struct balance_callback, core_balance_head); 6424 6425 static void queue_core_balance(struct rq *rq) 6426 { 6427 if (!sched_core_enabled(rq)) 6428 return; 6429 6430 if (!rq->core->core_cookie) 6431 return; 6432 6433 if (!rq->nr_running) /* not forced idle */ 6434 return; 6435 6436 queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); 6437 } 6438 6439 DEFINE_LOCK_GUARD_1(core_lock, int, 6440 sched_core_lock(*_T->lock, &_T->flags), 6441 sched_core_unlock(*_T->lock, &_T->flags), 6442 unsigned long flags) 6443 6444 static void sched_core_cpu_starting(unsigned int cpu) 6445 { 6446 const struct cpumask *smt_mask = cpu_smt_mask(cpu); 6447 struct rq *rq = cpu_rq(cpu), *core_rq = NULL; 6448 int t; 6449 6450 guard(core_lock)(&cpu); 6451 6452 WARN_ON_ONCE(rq->core != rq); 6453 6454 /* if we're the first, we'll be our own leader */ 6455 if (cpumask_weight(smt_mask) == 1) 6456 return; 6457 6458 /* find the leader */ 6459 for_each_cpu(t, smt_mask) { 6460 if (t == cpu) 6461 continue; 6462 rq = cpu_rq(t); 6463 if (rq->core == rq) { 6464 core_rq = rq; 6465 break; 6466 } 6467 } 6468 6469 if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ 6470 return; 6471 6472 
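        /*
         * At this point core_rq belongs to an already-online SMT sibling;
         * every sibling's rq->core (including ours) must point at it.
         */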
/* install and validate core_rq */ 6473 for_each_cpu(t, smt_mask) { 6474 rq = cpu_rq(t); 6475 6476 if (t == cpu) 6477 rq->core = core_rq; 6478 6479 WARN_ON_ONCE(rq->core != core_rq); 6480 } 6481 } 6482 6483 static void sched_core_cpu_deactivate(unsigned int cpu) 6484 { 6485 const struct cpumask *smt_mask = cpu_smt_mask(cpu); 6486 struct rq *rq = cpu_rq(cpu), *core_rq = NULL; 6487 int t; 6488 6489 guard(core_lock)(&cpu); 6490 6491 /* if we're the last man standing, nothing to do */ 6492 if (cpumask_weight(smt_mask) == 1) { 6493 WARN_ON_ONCE(rq->core != rq); 6494 return; 6495 } 6496 6497 /* if we're not the leader, nothing to do */ 6498 if (rq->core != rq) 6499 return; 6500 6501 /* find a new leader */ 6502 for_each_cpu(t, smt_mask) { 6503 if (t == cpu) 6504 continue; 6505 core_rq = cpu_rq(t); 6506 break; 6507 } 6508 6509 if (WARN_ON_ONCE(!core_rq)) /* impossible */ 6510 return; 6511 6512 /* copy the shared state to the new leader */ 6513 core_rq->core_task_seq = rq->core_task_seq; 6514 core_rq->core_pick_seq = rq->core_pick_seq; 6515 core_rq->core_cookie = rq->core_cookie; 6516 core_rq->core_forceidle_count = rq->core_forceidle_count; 6517 core_rq->core_forceidle_seq = rq->core_forceidle_seq; 6518 core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; 6519 6520 /* 6521 * Accounting edge for forced idle is handled in pick_next_task(). 6522 * Don't need another one here, since the hotplug thread shouldn't 6523 * have a cookie. 6524 */ 6525 core_rq->core_forceidle_start = 0; 6526 6527 /* install new leader */ 6528 for_each_cpu(t, smt_mask) { 6529 rq = cpu_rq(t); 6530 rq->core = core_rq; 6531 } 6532 } 6533 6534 static inline void sched_core_cpu_dying(unsigned int cpu) 6535 { 6536 struct rq *rq = cpu_rq(cpu); 6537 6538 if (rq->core != rq) 6539 rq->core = rq; 6540 } 6541 6542 #else /* !CONFIG_SCHED_CORE: */ 6543 6544 static inline void sched_core_cpu_starting(unsigned int cpu) {} 6545 static inline void sched_core_cpu_deactivate(unsigned int cpu) {} 6546 static inline void sched_core_cpu_dying(unsigned int cpu) {} 6547 6548 static struct task_struct * 6549 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 6550 __must_hold(__rq_lockp(rq)) 6551 { 6552 return __pick_next_task(rq, prev, rf); 6553 } 6554 6555 #endif /* !CONFIG_SCHED_CORE */ 6556 6557 /* 6558 * Constants for the sched_mode argument of __schedule(). 6559 * 6560 * The mode argument allows RT enabled kernels to differentiate a 6561 * preemption from blocking on an 'sleeping' spin/rwlock. 6562 */ 6563 #define SM_IDLE (-1) 6564 #define SM_NONE 0 6565 #define SM_PREEMPT 1 6566 #define SM_RTLOCK_WAIT 2 6567 6568 /* 6569 * Helper function for __schedule() 6570 * 6571 * Tries to deactivate the task, unless the should_block arg 6572 * is false or if a signal is pending. In the case a signal 6573 * is pending, marks the task's __state as RUNNING (and clear 6574 * blocked_on). 6575 */ 6576 static bool try_to_block_task(struct rq *rq, struct task_struct *p, 6577 unsigned long *task_state_p, bool should_block) 6578 { 6579 unsigned long task_state = *task_state_p; 6580 int flags = DEQUEUE_NOCLOCK; 6581 6582 if (signal_pending_state(task_state, p)) { 6583 WRITE_ONCE(p->__state, TASK_RUNNING); 6584 *task_state_p = TASK_RUNNING; 6585 set_task_blocked_on_waking(p, NULL); 6586 6587 return false; 6588 } 6589 6590 /* 6591 * We check should_block after signal_pending because we 6592 * will want to wake the task in that case. 
But if should_block is false, it's likely due to the task
	 * being blocked on a mutex, and we want to keep it on the runqueue
	 * to be selectable for proxy-execution.
	 */
	if (!should_block)
		return false;

	p->sched_contributes_to_load =
		(task_state & TASK_UNINTERRUPTIBLE) &&
		!(task_state & TASK_NOLOAD) &&
		!(task_state & TASK_FROZEN);

	if (unlikely(is_special_task_state(task_state)))
		flags |= DEQUEUE_SPECIAL;

	/*
	 * __schedule()			ttwu()
	 *   prev_state = prev->state;	  if (p->on_rq && ...)
	 *   if (prev_state)		    goto out;
	 *     p->on_rq = 0;		  smp_acquire__after_ctrl_dep();
	 *				  p->state = TASK_WAKING
	 *
	 * Where __schedule() and ttwu() have matching control dependencies.
	 *
	 * After this, schedule() must not care about p->state any more.
	 */
	block_task(rq, p, flags);
	return true;
}

#ifdef CONFIG_SCHED_PROXY_EXEC
static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
{
	unsigned int wake_cpu;

	/*
	 * Since we are enqueuing a blocked task on a cpu it may
	 * not be able to run on, preserve wake_cpu when we
	 * __set_task_cpu() so we can return the task to where it
	 * was previously runnable.
	 */
	wake_cpu = p->wake_cpu;
	__set_task_cpu(p, cpu);
	p->wake_cpu = wake_cpu;
}

static inline struct task_struct *proxy_resched_idle(struct rq *rq)
{
	put_prev_set_next_task(rq, rq->donor, rq->idle);
	rq_set_donor(rq, rq->idle);
	set_tsk_need_resched(rq->idle);
	return rq->idle;
}

static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
{
	unsigned long state = READ_ONCE(donor->__state);

	/* Don't deactivate if the state has been changed to TASK_RUNNING */
	if (state == TASK_RUNNING)
		return false;
	/*
	 * Because we got donor from pick_next_task(), it is *crucial*
	 * that we call proxy_resched_idle() before we deactivate it.
	 * Once we deactivate donor, donor->on_rq is set to zero,
	 * which allows ttwu() to immediately try to wake the task on
	 * another rq. So we cannot use *any* references to donor
	 * after that point, and things like cfs_rq->curr or rq->donor
	 * need to be changed from next *before* we deactivate.
	 */
	proxy_resched_idle(rq);
	return try_to_block_task(rq, donor, &state, true);
}

static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
	__releases(__rq_lockp(rq))
{
	/*
	 * The class scheduler may have queued a balance callback
	 * from pick_next_task() called earlier.
	 *
	 * So here we have to zap callbacks before unlocking the rq,
	 * as another CPU may jump in and call sched_balance_rq(),
	 * which can trip the warning in rq_pin_lock() if we
	 * leave callbacks set.
	 *
	 * After we later reacquire the rq lock, we will force __schedule()
	 * to pick_again, so the callbacks will get re-established.
	 */
	zap_balance_callbacks(rq);
	rq_unpin_lock(rq, rf);
	raw_spin_rq_unlock(rq);
}

static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
	__acquires(__rq_lockp(rq))
{
	raw_spin_rq_lock(rq);
	rq_repin_lock(rq, rf);
	update_rq_clock(rq);
}

/*
 * If the blocked-on relationship crosses CPUs, migrate @p to the
 * owner's CPU.
 *
 * This is because we must respect the CPU affinity of execution
 * contexts (owner) but we can ignore affinity for scheduling
 * contexts (@p). So we have to move scheduling contexts towards
 * potential execution contexts.
 *
 * Note: the owner can disappear; in that case simply migrate to
 * @target_cpu and leave that CPU to sort things out.
 */
static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
			       struct task_struct *p, int target_cpu)
	__must_hold(__rq_lockp(rq))
{
	struct rq *target_rq = cpu_rq(target_cpu);

	lockdep_assert_rq_held(rq);
	WARN_ON(p == rq->curr);
	/*
	 * Since we are migrating a blocked donor, it could be rq->donor,
	 * and we want to make sure there aren't any references from this
	 * rq to it before we drop the lock. This avoids another cpu
	 * jumping in and grabbing the rq lock and referencing rq->donor
	 * or cfs_rq->curr, etc, after we have migrated it to another cpu,
	 * and before we pick_again in __schedule().
	 *
	 * So call proxy_resched_idle() to drop the rq->donor references
	 * before we release the lock.
	 */
	proxy_resched_idle(rq);

	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
	proxy_set_task_cpu(p, target_cpu);

	proxy_release_rq_lock(rq, rf);

	attach_one_task(target_rq, p);

	proxy_reacquire_rq_lock(rq, rf);
}

static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
			       struct task_struct *p)
	__must_hold(__rq_lockp(rq))
{
	struct rq *task_rq, *target_rq = NULL;
	int cpu, wake_flag = WF_TTWU;

	lockdep_assert_rq_held(rq);
	WARN_ON(p == rq->curr);

	if (p == rq->donor)
		proxy_resched_idle(rq);

	proxy_release_rq_lock(rq, rf);
	/*
	 * We drop the rq lock, and re-grab task_rq_lock() to get
	 * the pi_lock (needed for select_task_rq()) as well.
	 */
	scoped_guard (task_rq_lock, p) {
		task_rq = scope.rq;

		/*
		 * Since we let go of the rq lock, the task may have been
		 * woken or migrated to another rq before we got the
		 * task_rq_lock. So re-check we're on the same rq. If
		 * not, the task has already been migrated and that CPU
		 * will handle any further migrations.
		 */
		if (task_rq != rq)
			break;

		/*
		 * Similarly, if we've been dequeued, someone else will
		 * wake us.
		 */
		if (!task_on_rq_queued(p))
			break;

		/*
		 * Since we should only be calling here from __schedule()
		 * -> find_proxy_task(), no one else should have
		 * assigned current out from under us. But check, warn
		 * if we see this, and then bail.
		 */
		if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
			WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n",
				  __func__, cpu_of(task_rq),
				  p->comm, p->pid, p->on_cpu);
			break;
		}

		update_rq_clock(task_rq);
		deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
		cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
		set_task_cpu(p, cpu);
		target_rq = cpu_rq(cpu);
		clear_task_blocked_on(p, NULL);
	}

	if (target_rq)
		attach_one_task(target_rq, p);

	proxy_reacquire_rq_lock(rq, rf);
}

/*
 * Find runnable lock owner to proxy for a mutex-blocked donor
 *
 * Follow the blocked-on relation:
 *
 *	task->blocked_on -> mutex->owner -> task...
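 *
 * Illustrative example: if the donor blocks on mutex M1 owned by T1,
 * and T1 itself blocks on M2 owned by runnable task T2, the walk below
 * returns T2, which then runs on this CPU using the donor's scheduler
 * context.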
 *
 * Lock order:
 *
 *	p->pi_lock
 *	  rq->lock
 *	    mutex->wait_lock
 *	      p->blocked_lock
 *
 * Returns the task that is going to be used as execution context (the one
 * that is actually going to be run on cpu_of(rq)).
 */
static struct task_struct *
find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
	__must_hold(__rq_lockp(rq))
{
	struct task_struct *owner = NULL;
	bool curr_in_chain = false;
	int this_cpu = cpu_of(rq);
	struct task_struct *p;
	struct mutex *mutex;
	int owner_cpu;

	/* Follow the blocked_on chain. */
	for (p = donor; (mutex = p->blocked_on); p = owner) {
		/* If it's PROXY_WAKING, do return migration, or run it if current */
		if (mutex == PROXY_WAKING) {
			if (task_current(rq, p)) {
				clear_task_blocked_on(p, PROXY_WAKING);
				return p;
			}
			goto force_return;
		}

		/*
		 * By taking mutex->wait_lock we hold off concurrent
		 * mutex_unlock() and ensure @owner sticks around.
		 */
		guard(raw_spinlock)(&mutex->wait_lock);
		guard(raw_spinlock)(&p->blocked_lock);

		/* Check again that p is blocked with blocked_lock held */
		if (mutex != __get_task_blocked_on(p)) {
			/*
			 * Something changed in the blocked_on chain and
			 * we don't know if only at this level. So, let's
			 * just bail out completely and let __schedule()
			 * figure things out (pick_again loop).
			 */
			return NULL;
		}

		if (task_current(rq, p))
			curr_in_chain = true;

		owner = __mutex_owner(mutex);
		if (!owner) {
			/*
			 * If there is no owner, either clear blocked_on
			 * and return p (if it is current and safe to
			 * just run on this rq), or return-migrate the task.
			 */
			if (task_current(rq, p)) {
				__clear_task_blocked_on(p, NULL);
				return p;
			}
			goto force_return;
		}

		if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
			/* XXX Don't handle blocked owners/delayed dequeue yet */
			if (curr_in_chain)
				return proxy_resched_idle(rq);
			goto deactivate;
		}

		owner_cpu = task_cpu(owner);
		if (owner_cpu != this_cpu) {
			/*
			 * @owner can disappear, simply migrate to @owner_cpu
			 * and leave that CPU to sort things out.
			 */
			if (curr_in_chain)
				return proxy_resched_idle(rq);
			goto migrate_task;
		}

		if (task_on_rq_migrating(owner)) {
			/*
			 * One of the chain of mutex owners is currently
			 * migrating to this CPU, but has not yet been
			 * enqueued because we are holding the rq lock.
			 * As a simple solution, just schedule rq->idle to
			 * give the migration a chance to complete. Much
			 * like the migrate_task case we should end up back
			 * in find_proxy_task(), this time hopefully with
			 * all relevant tasks already enqueued.
			 */
			return proxy_resched_idle(rq);
		}

		/*
		 * It's possible to race such that, after we check
		 * owner->on_rq but before we check (owner_cpu != this_cpu),
		 * the task was migrated from another cpu back to this one.
		 * In that case it could slip by our checks. So double check
		 * we are still on this cpu and not migrating. If we get
		 * inconsistent results, try again.
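		 * (Returning NULL sends __schedule() back through its
		 * pick_again loop.)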
6913 */ 6914 if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) 6915 return NULL; 6916 6917 if (owner == p) { 6918 /* 6919 * It's possible we interleave with mutex_unlock like: 6920 * 6921 * lock(&rq->lock); 6922 * find_proxy_task() 6923 * mutex_unlock() 6924 * lock(&wait_lock); 6925 * donor(owner) = current->blocked_donor; 6926 * unlock(&wait_lock); 6927 * 6928 * wake_up_q(); 6929 * ... 6930 * ttwu_runnable() 6931 * __task_rq_lock() 6932 * lock(&wait_lock); 6933 * owner == p 6934 * 6935 * Which leaves us to finish the ttwu_runnable() and make it go. 6936 * 6937 * So schedule rq->idle so that ttwu_runnable() can get the rq 6938 * lock and mark owner as running. 6939 */ 6940 return proxy_resched_idle(rq); 6941 } 6942 /* 6943 * OK, now we're absolutely sure @owner is on this 6944 * rq, therefore holding @rq->lock is sufficient to 6945 * guarantee its existence, as per ttwu_remote(). 6946 */ 6947 } 6948 WARN_ON_ONCE(owner && !owner->on_rq); 6949 return owner; 6950 6951 deactivate: 6952 if (proxy_deactivate(rq, donor)) 6953 return NULL; 6954 /* If deactivate fails, force return */ 6955 p = donor; 6956 force_return: 6957 proxy_force_return(rq, rf, p); 6958 return NULL; 6959 migrate_task: 6960 proxy_migrate_task(rq, rf, p, owner_cpu); 6961 return NULL; 6962 } 6963 #else /* SCHED_PROXY_EXEC */ 6964 static struct task_struct * 6965 find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) 6966 { 6967 WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); 6968 return donor; 6969 } 6970 #endif /* SCHED_PROXY_EXEC */ 6971 6972 /* 6973 * __schedule() is the main scheduler function. 6974 * 6975 * The main means of driving the scheduler and thus entering this function are: 6976 * 6977 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 6978 * 6979 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 6980 * paths. For example, see arch/x86/entry_64.S. 6981 * 6982 * To drive preemption between tasks, the scheduler sets the flag in timer 6983 * interrupt handler sched_tick(). 6984 * 6985 * 3. Wakeups don't really cause entry into schedule(). They add a 6986 * task to the run-queue and that's it. 6987 * 6988 * Now, if the new task added to the run-queue preempts the current 6989 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 6990 * called on the nearest possible occasion: 6991 * 6992 * - If the kernel is preemptible (CONFIG_PREEMPTION=y): 6993 * 6994 * - in syscall or exception context, at the next outmost 6995 * preempt_enable(). (this might be as soon as the wake_up()'s 6996 * spin_unlock()!) 6997 * 6998 * - in IRQ context, return from interrupt-handler to 6999 * preemptible context 7000 * 7001 * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) 7002 * then at the next: 7003 * 7004 * - cond_resched() call 7005 * - explicit schedule() call 7006 * - return from syscall or exception to user-space 7007 * - return from interrupt-handler to user-space 7008 * 7009 * WARNING: must be called with preemption disabled! 7010 */ 7011 static void __sched notrace __schedule(int sched_mode) 7012 { 7013 struct task_struct *prev, *next; 7014 /* 7015 * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted 7016 * as a preemption by schedule_debug() and RCU. 
 */
	bool preempt = sched_mode > SM_NONE;
	bool is_switch = false;
	unsigned long *switch_count;
	unsigned long prev_state;
	struct rq_flags rf;
	struct rq *rq;
	int cpu;

	/* Trace preemptions consistently with task switches */
	trace_sched_entry_tp(sched_mode == SM_PREEMPT);

	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;

	schedule_debug(prev, preempt);

	klp_sched_try_switch(prev);

	local_irq_disable();
	rcu_note_context_switch(preempt);
	migrate_disable_switch(rq, prev);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up():
	 *
	 * __set_current_state(@state)		signal_wake_up()
	 * schedule()				  set_tsk_thread_flag(p, TIF_SIGPENDING)
	 *					  wake_up_state(p, state)
	 *   LOCK rq->lock			    LOCK p->pi_state
	 *   smp_mb__after_spinlock()		    smp_mb__after_spinlock()
	 *   if (signal_pending_state())	    if (p->state & @state)
	 *
	 * Also, the membarrier system call requires a full memory barrier
	 * after coming from user-space, before storing to rq->curr; this
	 * barrier matches a full barrier in the proximity of the membarrier
	 * system call exit.
	 */
	rq_lock(rq, &rf);
	smp_mb__after_spinlock();

	hrtick_schedule_enter(rq);

	/* Promote REQ to ACT */
	rq->clock_update_flags <<= 1;
	update_rq_clock(rq);
	rq->clock_update_flags = RQCF_UPDATED;

	switch_count = &prev->nivcsw;

	/* Task state changes only consider SM_PREEMPT as preemption */
	preempt = sched_mode == SM_PREEMPT;

	/*
	 * We must load prev->state once (task_struct::state is volatile), such
	 * that we form a control dependency vs deactivate_task() below.
	 */
	prev_state = READ_ONCE(prev->__state);
	if (sched_mode == SM_IDLE) {
		/* SCX must consult the BPF scheduler to tell if rq is empty */
		if (!rq->nr_running && !scx_enabled()) {
			next = prev;
			rq->next_class = &idle_sched_class;
			goto picked;
		}
	} else if (!preempt && prev_state) {
		/*
		 * We pass task_is_blocked() as the should_block arg
		 * in order to keep mutex-blocked tasks on the runqueue
		 * for selection with proxy-exec (without proxy-exec,
		 * task_is_blocked() will always be false).
		 */
		try_to_block_task(rq, prev, &prev_state,
				  !task_is_blocked(prev));
		switch_count = &prev->nvcsw;
	}

pick_again:
	assert_balance_callbacks_empty(rq);
	next = pick_next_task(rq, rq->donor, &rf);
	rq->next_class = next->sched_class;
	if (sched_proxy_exec()) {
		struct task_struct *prev_donor = rq->donor;

		rq_set_donor(rq, next);
		if (unlikely(next->blocked_on)) {
			next = find_proxy_task(rq, next, &rf);
			if (!next) {
				zap_balance_callbacks(rq);
				goto pick_again;
			}
			if (next == rq->idle) {
				zap_balance_callbacks(rq);
				goto keep_resched;
			}
		}
		if (rq->donor == prev_donor && prev != next) {
			struct task_struct *donor = rq->donor;
			/*
			 * When transitioning like:
			 *
			 *	         prev	next
			 *	donor:     B	 B
			 *	curr:      A	 B or C
			 *
			 * then put_prev_set_next_task() will not have done
			 * anything, since B == B. However, A might have
			 * missed a RT/DL balance opportunity due to being
			 * on_cpu.
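			 * Re-running put_prev_task()/set_next_task() on the
			 * unchanged donor below gives its class a chance to
			 * queue that balance callback after all.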
7129 */ 7130 donor->sched_class->put_prev_task(rq, donor, donor); 7131 donor->sched_class->set_next_task(rq, donor, true); 7132 } 7133 } else { 7134 rq_set_donor(rq, next); 7135 } 7136 7137 picked: 7138 clear_tsk_need_resched(prev); 7139 clear_preempt_need_resched(); 7140 keep_resched: 7141 rq->last_seen_need_resched_ns = 0; 7142 7143 is_switch = prev != next; 7144 if (likely(is_switch)) { 7145 rq->nr_switches++; 7146 /* 7147 * RCU users of rcu_dereference(rq->curr) may not see 7148 * changes to task_struct made by pick_next_task(). 7149 */ 7150 RCU_INIT_POINTER(rq->curr, next); 7151 7152 /* 7153 * The membarrier system call requires each architecture 7154 * to have a full memory barrier after updating 7155 * rq->curr, before returning to user-space. 7156 * 7157 * Here are the schemes providing that barrier on the 7158 * various architectures: 7159 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC, 7160 * RISC-V. switch_mm() relies on membarrier_arch_switch_mm() 7161 * on PowerPC and on RISC-V. 7162 * - finish_lock_switch() for weakly-ordered 7163 * architectures where spin_unlock is a full barrier, 7164 * - switch_to() for arm64 (weakly-ordered, spin_unlock 7165 * is a RELEASE barrier), 7166 * 7167 * The barrier matches a full barrier in the proximity of 7168 * the membarrier system call entry. 7169 * 7170 * On RISC-V, this barrier pairing is also needed for the 7171 * SYNC_CORE command when switching between processes, cf. 7172 * the inline comments in membarrier_arch_switch_mm(). 7173 */ 7174 ++*switch_count; 7175 7176 psi_account_irqtime(rq, prev, next); 7177 psi_sched_switch(prev, next, !task_on_rq_queued(prev) || 7178 prev->se.sched_delayed); 7179 7180 trace_sched_switch(preempt, prev, next, prev_state); 7181 7182 /* Also unlocks the rq: */ 7183 rq = context_switch(rq, prev, next, &rf); 7184 } else { 7185 rq_unpin_lock(rq, &rf); 7186 __balance_callbacks(rq, NULL); 7187 hrtick_schedule_exit(rq); 7188 raw_spin_rq_unlock_irq(rq); 7189 } 7190 trace_sched_exit_tp(is_switch); 7191 } 7192 7193 void __noreturn do_task_dead(void) 7194 { 7195 /* Causes final put_task_struct in finish_task_switch(): */ 7196 set_special_state(TASK_DEAD); 7197 7198 /* Tell freezer to ignore us: */ 7199 current->flags |= PF_NOFREEZE; 7200 7201 __schedule(SM_NONE); 7202 BUG(); 7203 7204 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ 7205 for (;;) 7206 cpu_relax(); 7207 } 7208 7209 static inline void sched_submit_work(struct task_struct *tsk) 7210 { 7211 static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); 7212 unsigned int task_flags; 7213 7214 /* 7215 * Establish LD_WAIT_CONFIG context to ensure none of the code called 7216 * will use a blocking primitive -- which would lead to recursion. 7217 */ 7218 lock_map_acquire_try(&sched_map); 7219 7220 task_flags = tsk->flags; 7221 /* 7222 * If a worker goes to sleep, notify and ask workqueue whether it 7223 * wants to wake up a task to maintain concurrency. 7224 */ 7225 if (task_flags & PF_WQ_WORKER) 7226 wq_worker_sleeping(tsk); 7227 else if (task_flags & PF_IO_WORKER) 7228 io_wq_worker_sleeping(tsk); 7229 7230 /* 7231 * spinlock and rwlock must not flush block requests. This will 7232 * deadlock if the callback attempts to acquire a lock which is 7233 * already acquired. 7234 */ 7235 WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); 7236 7237 /* 7238 * If we are going to sleep and we have plugged IO queued, 7239 * make sure to submit it to avoid deadlocks. 
7240 */ 7241 blk_flush_plug(tsk->plug, true); 7242 7243 lock_map_release(&sched_map); 7244 } 7245 7246 static void sched_update_worker(struct task_struct *tsk) 7247 { 7248 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { 7249 if (tsk->flags & PF_BLOCK_TS) 7250 blk_plug_invalidate_ts(tsk); 7251 if (tsk->flags & PF_WQ_WORKER) 7252 wq_worker_running(tsk); 7253 else if (tsk->flags & PF_IO_WORKER) 7254 io_wq_worker_running(tsk); 7255 } 7256 } 7257 7258 static __always_inline void __schedule_loop(int sched_mode) 7259 { 7260 do { 7261 preempt_disable(); 7262 __schedule(sched_mode); 7263 sched_preempt_enable_no_resched(); 7264 } while (need_resched()); 7265 } 7266 7267 asmlinkage __visible void __sched schedule(void) 7268 { 7269 struct task_struct *tsk = current; 7270 7271 #ifdef CONFIG_RT_MUTEXES 7272 lockdep_assert(!tsk->sched_rt_mutex); 7273 #endif 7274 7275 if (!task_is_running(tsk)) 7276 sched_submit_work(tsk); 7277 __schedule_loop(SM_NONE); 7278 sched_update_worker(tsk); 7279 } 7280 EXPORT_SYMBOL(schedule); 7281 7282 /* 7283 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted 7284 * state (have scheduled out non-voluntarily) by making sure that all 7285 * tasks have either left the run queue or have gone into user space. 7286 * As idle tasks do not do either, they must not ever be preempted 7287 * (schedule out non-voluntarily). 7288 * 7289 * schedule_idle() is similar to schedule_preempt_disable() except that it 7290 * never enables preemption because it does not call sched_submit_work(). 7291 */ 7292 void __sched schedule_idle(void) 7293 { 7294 /* 7295 * As this skips calling sched_submit_work(), which the idle task does 7296 * regardless because that function is a NOP when the task is in a 7297 * TASK_RUNNING state, make sure this isn't used someplace that the 7298 * current task can be in any other state. Note, idle is always in the 7299 * TASK_RUNNING state. 7300 */ 7301 WARN_ON_ONCE(current->__state); 7302 do { 7303 __schedule(SM_IDLE); 7304 } while (need_resched()); 7305 } 7306 7307 #if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) 7308 asmlinkage __visible void __sched schedule_user(void) 7309 { 7310 /* 7311 * If we come here after a random call to set_need_resched(), 7312 * or we have been woken up remotely but the IPI has not yet arrived, 7313 * we haven't yet exited the RCU idle mode. Do it here manually until 7314 * we find a better solution. 7315 * 7316 * NB: There are buggy callers of this function. Ideally we 7317 * should warn if prev_state != CT_STATE_USER, but that will trigger 7318 * too frequently to make sense yet. 7319 */ 7320 enum ctx_state prev_state = exception_enter(); 7321 schedule(); 7322 exception_exit(prev_state); 7323 } 7324 #endif 7325 7326 /** 7327 * schedule_preempt_disabled - called with preemption disabled 7328 * 7329 * Returns with preemption disabled. 
Note: preempt_count must be 1 7330 */ 7331 void __sched schedule_preempt_disabled(void) 7332 { 7333 sched_preempt_enable_no_resched(); 7334 schedule(); 7335 preempt_disable(); 7336 } 7337 7338 #ifdef CONFIG_PREEMPT_RT 7339 void __sched notrace schedule_rtlock(void) 7340 { 7341 __schedule_loop(SM_RTLOCK_WAIT); 7342 } 7343 NOKPROBE_SYMBOL(schedule_rtlock); 7344 #endif 7345 7346 static void __sched notrace preempt_schedule_common(void) 7347 { 7348 do { 7349 /* 7350 * Because the function tracer can trace preempt_count_sub() 7351 * and it also uses preempt_enable/disable_notrace(), if 7352 * NEED_RESCHED is set, the preempt_enable_notrace() called 7353 * by the function tracer will call this function again and 7354 * cause infinite recursion. 7355 * 7356 * Preemption must be disabled here before the function 7357 * tracer can trace. Break up preempt_disable() into two 7358 * calls. One to disable preemption without fear of being 7359 * traced. The other to still record the preemption latency, 7360 * which can also be traced by the function tracer. 7361 */ 7362 preempt_disable_notrace(); 7363 preempt_latency_start(1); 7364 __schedule(SM_PREEMPT); 7365 preempt_latency_stop(1); 7366 preempt_enable_no_resched_notrace(); 7367 7368 /* 7369 * Check again in case we missed a preemption opportunity 7370 * between schedule and now. 7371 */ 7372 } while (need_resched()); 7373 } 7374 7375 #ifdef CONFIG_PREEMPTION 7376 /* 7377 * This is the entry point to schedule() from in-kernel preemption 7378 * off of preempt_enable. 7379 */ 7380 asmlinkage __visible void __sched notrace preempt_schedule(void) 7381 { 7382 /* 7383 * If there is a non-zero preempt_count or interrupts are disabled, 7384 * we do not want to preempt the current task. Just return.. 7385 */ 7386 if (likely(!preemptible())) 7387 return; 7388 preempt_schedule_common(); 7389 } 7390 NOKPROBE_SYMBOL(preempt_schedule); 7391 EXPORT_SYMBOL(preempt_schedule); 7392 7393 #ifdef CONFIG_PREEMPT_DYNAMIC 7394 # ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL 7395 # ifndef preempt_schedule_dynamic_enabled 7396 # define preempt_schedule_dynamic_enabled preempt_schedule 7397 # define preempt_schedule_dynamic_disabled NULL 7398 # endif 7399 DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); 7400 EXPORT_STATIC_CALL_TRAMP(preempt_schedule); 7401 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7402 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); 7403 void __sched notrace dynamic_preempt_schedule(void) 7404 { 7405 if (!static_branch_unlikely(&sk_dynamic_preempt_schedule)) 7406 return; 7407 preempt_schedule(); 7408 } 7409 NOKPROBE_SYMBOL(dynamic_preempt_schedule); 7410 EXPORT_SYMBOL(dynamic_preempt_schedule); 7411 # endif 7412 #endif /* CONFIG_PREEMPT_DYNAMIC */ 7413 7414 /** 7415 * preempt_schedule_notrace - preempt_schedule called by tracing 7416 * 7417 * The tracing infrastructure uses preempt_enable_notrace to prevent 7418 * recursion and tracing preempt enabling caused by the tracing 7419 * infrastructure itself. But as tracing can happen in areas coming 7420 * from userspace or just about to enter userspace, a preempt enable 7421 * can occur before user_exit() is called. This will cause the scheduler 7422 * to be called when the system is still in usermode. 7423 * 7424 * To prevent this, the preempt_enable_notrace will use this function 7425 * instead of preempt_schedule() to exit user context if needed before 7426 * calling the scheduler. 
7427 */ 7428 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 7429 { 7430 enum ctx_state prev_ctx; 7431 7432 if (likely(!preemptible())) 7433 return; 7434 7435 do { 7436 /* 7437 * Because the function tracer can trace preempt_count_sub() 7438 * and it also uses preempt_enable/disable_notrace(), if 7439 * NEED_RESCHED is set, the preempt_enable_notrace() called 7440 * by the function tracer will call this function again and 7441 * cause infinite recursion. 7442 * 7443 * Preemption must be disabled here before the function 7444 * tracer can trace. Break up preempt_disable() into two 7445 * calls. One to disable preemption without fear of being 7446 * traced. The other to still record the preemption latency, 7447 * which can also be traced by the function tracer. 7448 */ 7449 preempt_disable_notrace(); 7450 preempt_latency_start(1); 7451 /* 7452 * Needs preempt disabled in case user_exit() is traced 7453 * and the tracer calls preempt_enable_notrace() causing 7454 * an infinite recursion. 7455 */ 7456 prev_ctx = exception_enter(); 7457 __schedule(SM_PREEMPT); 7458 exception_exit(prev_ctx); 7459 7460 preempt_latency_stop(1); 7461 preempt_enable_no_resched_notrace(); 7462 } while (need_resched()); 7463 } 7464 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 7465 7466 #ifdef CONFIG_PREEMPT_DYNAMIC 7467 # if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7468 # ifndef preempt_schedule_notrace_dynamic_enabled 7469 # define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace 7470 # define preempt_schedule_notrace_dynamic_disabled NULL 7471 # endif 7472 DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); 7473 EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); 7474 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7475 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); 7476 void __sched notrace dynamic_preempt_schedule_notrace(void) 7477 { 7478 if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace)) 7479 return; 7480 preempt_schedule_notrace(); 7481 } 7482 NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); 7483 EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); 7484 # endif 7485 #endif 7486 7487 #endif /* CONFIG_PREEMPTION */ 7488 7489 /* 7490 * This is the entry point to schedule() from kernel preemption 7491 * off of IRQ context. 7492 * Note, that this is called and return with IRQs disabled. This will 7493 * protect us against recursive calling from IRQ contexts. 
7494 */ 7495 asmlinkage __visible void __sched preempt_schedule_irq(void) 7496 { 7497 enum ctx_state prev_state; 7498 7499 /* Catch callers which need to be fixed */ 7500 BUG_ON(preempt_count() || !irqs_disabled()); 7501 7502 prev_state = exception_enter(); 7503 7504 do { 7505 preempt_disable(); 7506 local_irq_enable(); 7507 __schedule(SM_PREEMPT); 7508 local_irq_disable(); 7509 sched_preempt_enable_no_resched(); 7510 } while (need_resched()); 7511 7512 exception_exit(prev_state); 7513 } 7514 7515 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, 7516 void *key) 7517 { 7518 WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); 7519 return try_to_wake_up(curr->private, mode, wake_flags); 7520 } 7521 EXPORT_SYMBOL(default_wake_function); 7522 7523 const struct sched_class *__setscheduler_class(int policy, int prio) 7524 { 7525 if (dl_prio(prio)) 7526 return &dl_sched_class; 7527 7528 if (rt_prio(prio)) 7529 return &rt_sched_class; 7530 7531 #ifdef CONFIG_SCHED_CLASS_EXT 7532 if (task_should_scx(policy)) 7533 return &ext_sched_class; 7534 #endif 7535 7536 return &fair_sched_class; 7537 } 7538 7539 #ifdef CONFIG_RT_MUTEXES 7540 7541 /* 7542 * Would be more useful with typeof()/auto_type but they don't mix with 7543 * bit-fields. Since it's a local thing, use int. Keep the generic sounding 7544 * name such that if someone were to implement this function we get to compare 7545 * notes. 7546 */ 7547 #define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; }) 7548 7549 void rt_mutex_pre_schedule(void) 7550 { 7551 lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1)); 7552 sched_submit_work(current); 7553 } 7554 7555 void rt_mutex_schedule(void) 7556 { 7557 lockdep_assert(current->sched_rt_mutex); 7558 __schedule_loop(SM_NONE); 7559 } 7560 7561 void rt_mutex_post_schedule(void) 7562 { 7563 sched_update_worker(current); 7564 lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); 7565 } 7566 7567 /* 7568 * rt_mutex_setprio - set the current priority of a task 7569 * @p: task to boost 7570 * @pi_task: donor task 7571 * 7572 * This function changes the 'effective' priority of a task. It does 7573 * not touch ->normal_prio like __setscheduler(). 7574 * 7575 * Used by the rt_mutex code to implement priority inheritance 7576 * logic. Call site only calls if the priority of the task changed. 7577 */ 7578 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) 7579 { 7580 int prio, oldprio, queue_flag = 7581 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7582 const struct sched_class *prev_class, *next_class; 7583 struct rq_flags rf; 7584 struct rq *rq; 7585 7586 /* XXX used to be waiter->prio, not waiter->task->prio */ 7587 prio = __rt_effective_prio(pi_task, p->normal_prio); 7588 7589 /* 7590 * If nothing changed; bail early. 7591 */ 7592 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) 7593 return; 7594 7595 rq = __task_rq_lock(p, &rf); 7596 update_rq_clock(rq); 7597 /* 7598 * Set under pi_lock && rq->lock, such that the value can be used under 7599 * either lock. 7600 * 7601 * Note that there is loads of tricky to make this pointer cache work 7602 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to 7603 * ensure a task is de-boosted (pi_task is set to NULL) before the 7604 * task is allowed to run again (and can exit). This ensures the pointer 7605 * points to a blocked task -- which guarantees the task is present. 
7606 */ 7607 p->pi_top_task = pi_task; 7608 7609 /* 7610 * For FIFO/RR we only need to set prio, if that matches we're done. 7611 */ 7612 if (prio == p->prio && !dl_prio(prio)) 7613 goto out_unlock; 7614 7615 /* 7616 * Idle task boosting is a no-no in general. There is one 7617 * exception, when PREEMPT_RT and NOHZ is active: 7618 * 7619 * The idle task calls get_next_timer_interrupt() and holds 7620 * the timer wheel base->lock on the CPU and another CPU wants 7621 * to access the timer (probably to cancel it). We can safely 7622 * ignore the boosting request, as the idle CPU runs this code 7623 * with interrupts disabled and will complete the lock 7624 * protected section without being interrupted. So there is no 7625 * real need to boost. 7626 */ 7627 if (unlikely(p == rq->idle)) { 7628 WARN_ON(p != rq->curr); 7629 WARN_ON(p->pi_blocked_on); 7630 goto out_unlock; 7631 } 7632 7633 trace_sched_pi_setprio(p, pi_task); 7634 oldprio = p->prio; 7635 7636 if (oldprio == prio && !dl_prio(prio)) 7637 queue_flag &= ~DEQUEUE_MOVE; 7638 7639 prev_class = p->sched_class; 7640 next_class = __setscheduler_class(p->policy, prio); 7641 7642 if (prev_class != next_class) 7643 queue_flag |= DEQUEUE_CLASS; 7644 7645 scoped_guard (sched_change, p, queue_flag) { 7646 /* 7647 * Boosting condition are: 7648 * 1. -rt task is running and holds mutex A 7649 * --> -dl task blocks on mutex A 7650 * 7651 * 2. -dl task is running and holds mutex A 7652 * --> -dl task blocks on mutex A and could preempt the 7653 * running task 7654 */ 7655 if (dl_prio(prio)) { 7656 if (!dl_prio(p->normal_prio) || 7657 (pi_task && dl_prio(pi_task->prio) && 7658 dl_entity_preempt(&pi_task->dl, &p->dl))) { 7659 p->dl.pi_se = pi_task->dl.pi_se; 7660 scope->flags |= ENQUEUE_REPLENISH; 7661 } else { 7662 p->dl.pi_se = &p->dl; 7663 } 7664 } else if (rt_prio(prio)) { 7665 if (dl_prio(oldprio)) 7666 p->dl.pi_se = &p->dl; 7667 if (oldprio < prio) 7668 scope->flags |= ENQUEUE_HEAD; 7669 } else { 7670 if (dl_prio(oldprio)) 7671 p->dl.pi_se = &p->dl; 7672 if (rt_prio(oldprio)) 7673 p->rt.timeout = 0; 7674 } 7675 7676 p->sched_class = next_class; 7677 p->prio = prio; 7678 } 7679 out_unlock: 7680 /* Caller holds task_struct::pi_lock, IRQs are still disabled */ 7681 7682 __balance_callbacks(rq, &rf); 7683 __task_rq_unlock(rq, p, &rf); 7684 } 7685 #endif /* CONFIG_RT_MUTEXES */ 7686 7687 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) 7688 int __sched __cond_resched(void) 7689 { 7690 if (should_resched(0) && !irqs_disabled()) { 7691 preempt_schedule_common(); 7692 return 1; 7693 } 7694 /* 7695 * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick 7696 * whether the current CPU is in an RCU read-side critical section, 7697 * so the tick can report quiescent states even for CPUs looping 7698 * in kernel context. In contrast, in non-preemptible kernels, 7699 * RCU readers leave no in-memory hints, which means that CPU-bound 7700 * processes executing in kernel context might never report an 7701 * RCU quiescent state. Therefore, the following code causes 7702 * cond_resched() to report a quiescent state, but only when RCU 7703 * is in urgent need of one. 7704 * A third case, preemptible, but non-PREEMPT_RCU provides for 7705 * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). 
7706 */ 7707 #ifndef CONFIG_PREEMPT_RCU 7708 rcu_all_qs(); 7709 #endif 7710 return 0; 7711 } 7712 EXPORT_SYMBOL(__cond_resched); 7713 #endif 7714 7715 #ifdef CONFIG_PREEMPT_DYNAMIC 7716 # ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL 7717 # define cond_resched_dynamic_enabled __cond_resched 7718 # define cond_resched_dynamic_disabled ((void *)&__static_call_return0) 7719 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); 7720 EXPORT_STATIC_CALL_TRAMP(cond_resched); 7721 7722 # define might_resched_dynamic_enabled __cond_resched 7723 # define might_resched_dynamic_disabled ((void *)&__static_call_return0) 7724 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); 7725 EXPORT_STATIC_CALL_TRAMP(might_resched); 7726 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7727 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); 7728 int __sched dynamic_cond_resched(void) 7729 { 7730 if (!static_branch_unlikely(&sk_dynamic_cond_resched)) 7731 return 0; 7732 return __cond_resched(); 7733 } 7734 EXPORT_SYMBOL(dynamic_cond_resched); 7735 7736 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched); 7737 int __sched dynamic_might_resched(void) 7738 { 7739 if (!static_branch_unlikely(&sk_dynamic_might_resched)) 7740 return 0; 7741 return __cond_resched(); 7742 } 7743 EXPORT_SYMBOL(dynamic_might_resched); 7744 # endif 7745 #endif /* CONFIG_PREEMPT_DYNAMIC */ 7746 7747 /* 7748 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 7749 * call schedule, and on return reacquire the lock. 7750 * 7751 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level 7752 * operations here to prevent schedule() from being called twice (once via 7753 * spin_unlock(), once by hand). 7754 */ 7755 int __cond_resched_lock(spinlock_t *lock) 7756 { 7757 int resched = should_resched(PREEMPT_LOCK_OFFSET); 7758 int ret = 0; 7759 7760 lockdep_assert_held(lock); 7761 7762 if (spin_needbreak(lock) || resched) { 7763 spin_unlock(lock); 7764 if (!_cond_resched()) 7765 cpu_relax(); 7766 ret = 1; 7767 spin_lock(lock); 7768 } 7769 return ret; 7770 } 7771 EXPORT_SYMBOL(__cond_resched_lock); 7772 7773 int __cond_resched_rwlock_read(rwlock_t *lock) 7774 { 7775 int resched = should_resched(PREEMPT_LOCK_OFFSET); 7776 int ret = 0; 7777 7778 lockdep_assert_held_read(lock); 7779 7780 if (rwlock_needbreak(lock) || resched) { 7781 read_unlock(lock); 7782 if (!_cond_resched()) 7783 cpu_relax(); 7784 ret = 1; 7785 read_lock(lock); 7786 } 7787 return ret; 7788 } 7789 EXPORT_SYMBOL(__cond_resched_rwlock_read); 7790 7791 int __cond_resched_rwlock_write(rwlock_t *lock) 7792 { 7793 int resched = should_resched(PREEMPT_LOCK_OFFSET); 7794 int ret = 0; 7795 7796 lockdep_assert_held_write(lock); 7797 7798 if (rwlock_needbreak(lock) || resched) { 7799 write_unlock(lock); 7800 if (!_cond_resched()) 7801 cpu_relax(); 7802 ret = 1; 7803 write_lock(lock); 7804 } 7805 return ret; 7806 } 7807 EXPORT_SYMBOL(__cond_resched_rwlock_write); 7808 7809 #ifdef CONFIG_PREEMPT_DYNAMIC 7810 7811 # ifdef CONFIG_GENERIC_IRQ_ENTRY 7812 # include <linux/irq-entry-common.h> 7813 # endif 7814 7815 /* 7816 * SC:cond_resched 7817 * SC:might_resched 7818 * SC:preempt_schedule 7819 * SC:preempt_schedule_notrace 7820 * SC:irqentry_exit_cond_resched 7821 * 7822 * 7823 * NONE: 7824 * cond_resched <- __cond_resched 7825 * might_resched <- RET0 7826 * preempt_schedule <- NOP 7827 * preempt_schedule_notrace <- NOP 7828 * irqentry_exit_cond_resched <- NOP 7829 * dynamic_preempt_lazy <- false 7830 * 7831 * VOLUNTARY: 7832 * cond_resched <- 
__cond_resched 7833 * might_resched <- __cond_resched 7834 * preempt_schedule <- NOP 7835 * preempt_schedule_notrace <- NOP 7836 * irqentry_exit_cond_resched <- NOP 7837 * dynamic_preempt_lazy <- false 7838 * 7839 * FULL: 7840 * cond_resched <- RET0 7841 * might_resched <- RET0 7842 * preempt_schedule <- preempt_schedule 7843 * preempt_schedule_notrace <- preempt_schedule_notrace 7844 * irqentry_exit_cond_resched <- irqentry_exit_cond_resched 7845 * dynamic_preempt_lazy <- false 7846 * 7847 * LAZY: 7848 * cond_resched <- RET0 7849 * might_resched <- RET0 7850 * preempt_schedule <- preempt_schedule 7851 * preempt_schedule_notrace <- preempt_schedule_notrace 7852 * irqentry_exit_cond_resched <- irqentry_exit_cond_resched 7853 * dynamic_preempt_lazy <- true 7854 */ 7855 7856 enum { 7857 preempt_dynamic_undefined = -1, 7858 preempt_dynamic_none, 7859 preempt_dynamic_voluntary, 7860 preempt_dynamic_full, 7861 preempt_dynamic_lazy, 7862 }; 7863 7864 int preempt_dynamic_mode = preempt_dynamic_undefined; 7865 7866 int sched_dynamic_mode(const char *str) 7867 { 7868 # if !(defined(CONFIG_PREEMPT_RT) || defined(CONFIG_ARCH_HAS_PREEMPT_LAZY)) 7869 if (!strcmp(str, "none")) 7870 return preempt_dynamic_none; 7871 7872 if (!strcmp(str, "voluntary")) 7873 return preempt_dynamic_voluntary; 7874 # endif 7875 7876 if (!strcmp(str, "full")) 7877 return preempt_dynamic_full; 7878 7879 # ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY 7880 if (!strcmp(str, "lazy")) 7881 return preempt_dynamic_lazy; 7882 # endif 7883 7884 return -EINVAL; 7885 } 7886 7887 # define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) 7888 # define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) 7889 7890 # if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7891 # define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) 7892 # define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) 7893 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7894 # define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) 7895 # define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) 7896 # else 7897 # error "Unsupported PREEMPT_DYNAMIC mechanism" 7898 # endif 7899 7900 static DEFINE_MUTEX(sched_dynamic_mutex); 7901 7902 static void __sched_dynamic_update(int mode) 7903 { 7904 /* 7905 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in 7906 * the ZERO state, which is invalid. 
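         *
         * To that end, first switch every call to its enabled variant and
         * only then selectively disable below: a transient state with
         * everything enabled is harmless, while one with both cond_resched()
         * and preempt_schedule() disabled would leave no preemption points
         * at all.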
7907 */ 7908 preempt_dynamic_enable(cond_resched); 7909 preempt_dynamic_enable(might_resched); 7910 preempt_dynamic_enable(preempt_schedule); 7911 preempt_dynamic_enable(preempt_schedule_notrace); 7912 preempt_dynamic_enable(irqentry_exit_cond_resched); 7913 preempt_dynamic_key_disable(preempt_lazy); 7914 7915 switch (mode) { 7916 case preempt_dynamic_none: 7917 preempt_dynamic_enable(cond_resched); 7918 preempt_dynamic_disable(might_resched); 7919 preempt_dynamic_disable(preempt_schedule); 7920 preempt_dynamic_disable(preempt_schedule_notrace); 7921 preempt_dynamic_disable(irqentry_exit_cond_resched); 7922 preempt_dynamic_key_disable(preempt_lazy); 7923 if (mode != preempt_dynamic_mode) 7924 pr_info("Dynamic Preempt: none\n"); 7925 break; 7926 7927 case preempt_dynamic_voluntary: 7928 preempt_dynamic_enable(cond_resched); 7929 preempt_dynamic_enable(might_resched); 7930 preempt_dynamic_disable(preempt_schedule); 7931 preempt_dynamic_disable(preempt_schedule_notrace); 7932 preempt_dynamic_disable(irqentry_exit_cond_resched); 7933 preempt_dynamic_key_disable(preempt_lazy); 7934 if (mode != preempt_dynamic_mode) 7935 pr_info("Dynamic Preempt: voluntary\n"); 7936 break; 7937 7938 case preempt_dynamic_full: 7939 preempt_dynamic_disable(cond_resched); 7940 preempt_dynamic_disable(might_resched); 7941 preempt_dynamic_enable(preempt_schedule); 7942 preempt_dynamic_enable(preempt_schedule_notrace); 7943 preempt_dynamic_enable(irqentry_exit_cond_resched); 7944 preempt_dynamic_key_disable(preempt_lazy); 7945 if (mode != preempt_dynamic_mode) 7946 pr_info("Dynamic Preempt: full\n"); 7947 break; 7948 7949 case preempt_dynamic_lazy: 7950 preempt_dynamic_disable(cond_resched); 7951 preempt_dynamic_disable(might_resched); 7952 preempt_dynamic_enable(preempt_schedule); 7953 preempt_dynamic_enable(preempt_schedule_notrace); 7954 preempt_dynamic_enable(irqentry_exit_cond_resched); 7955 preempt_dynamic_key_enable(preempt_lazy); 7956 if (mode != preempt_dynamic_mode) 7957 pr_info("Dynamic Preempt: lazy\n"); 7958 break; 7959 } 7960 7961 preempt_dynamic_mode = mode; 7962 } 7963 7964 void sched_dynamic_update(int mode) 7965 { 7966 mutex_lock(&sched_dynamic_mutex); 7967 __sched_dynamic_update(mode); 7968 mutex_unlock(&sched_dynamic_mutex); 7969 } 7970 7971 static int __init setup_preempt_mode(char *str) 7972 { 7973 int mode = sched_dynamic_mode(str); 7974 if (mode < 0) { 7975 pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); 7976 return 0; 7977 } 7978 7979 sched_dynamic_update(mode); 7980 return 1; 7981 } 7982 __setup("preempt=", setup_preempt_mode); 7983 7984 static void __init preempt_dynamic_init(void) 7985 { 7986 if (preempt_dynamic_mode == preempt_dynamic_undefined) { 7987 if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { 7988 sched_dynamic_update(preempt_dynamic_none); 7989 } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { 7990 sched_dynamic_update(preempt_dynamic_voluntary); 7991 } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { 7992 sched_dynamic_update(preempt_dynamic_lazy); 7993 } else { 7994 /* Default static call setting, nothing to do */ 7995 WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); 7996 preempt_dynamic_mode = preempt_dynamic_full; 7997 pr_info("Dynamic Preempt: full\n"); 7998 } 7999 } 8000 } 8001 8002 # define PREEMPT_MODEL_ACCESSOR(mode) \ 8003 bool preempt_model_##mode(void) \ 8004 { \ 8005 WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ 8006 return preempt_dynamic_mode == preempt_dynamic_##mode; \ 8007 } \ 8008 EXPORT_SYMBOL_GPL(preempt_model_##mode) 8009 8010 
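/*
 * For reference, PREEMPT_MODEL_ACCESSOR(none) below expands to roughly:
 *
 *	bool preempt_model_none(void)
 *	{
 *		WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined);
 *		return preempt_dynamic_mode == preempt_dynamic_none;
 *	}
 *	EXPORT_SYMBOL_GPL(preempt_model_none);
 */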
PREEMPT_MODEL_ACCESSOR(none); 8011 PREEMPT_MODEL_ACCESSOR(voluntary); 8012 PREEMPT_MODEL_ACCESSOR(full); 8013 PREEMPT_MODEL_ACCESSOR(lazy); 8014 8015 #else /* !CONFIG_PREEMPT_DYNAMIC: */ 8016 8017 #define preempt_dynamic_mode -1 8018 8019 static inline void preempt_dynamic_init(void) { } 8020 8021 #endif /* CONFIG_PREEMPT_DYNAMIC */ 8022 8023 const char *preempt_modes[] = { 8024 "none", "voluntary", "full", "lazy", NULL, 8025 }; 8026 8027 const char *preempt_model_str(void) 8028 { 8029 bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && 8030 (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || 8031 IS_ENABLED(CONFIG_PREEMPT_LAZY)); 8032 static char buf[128]; 8033 8034 if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { 8035 struct seq_buf s; 8036 8037 seq_buf_init(&s, buf, sizeof(buf)); 8038 seq_buf_puts(&s, "PREEMPT"); 8039 8040 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 8041 seq_buf_printf(&s, "%sRT%s", 8042 brace ? "_{" : "_", 8043 brace ? "," : ""); 8044 8045 if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { 8046 seq_buf_printf(&s, "(%s)%s", 8047 preempt_dynamic_mode >= 0 ? 8048 preempt_modes[preempt_dynamic_mode] : "undef", 8049 brace ? "}" : ""); 8050 return seq_buf_str(&s); 8051 } 8052 8053 if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { 8054 seq_buf_printf(&s, "LAZY%s", 8055 brace ? "}" : ""); 8056 return seq_buf_str(&s); 8057 } 8058 8059 return seq_buf_str(&s); 8060 } 8061 8062 if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) 8063 return "VOLUNTARY"; 8064 8065 return "NONE"; 8066 } 8067 8068 int io_schedule_prepare(void) 8069 { 8070 int old_iowait = current->in_iowait; 8071 8072 current->in_iowait = 1; 8073 blk_flush_plug(current->plug, true); 8074 return old_iowait; 8075 } 8076 8077 void io_schedule_finish(int token) 8078 { 8079 current->in_iowait = token; 8080 } 8081 8082 /* 8083 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 8084 * that process accounting knows that this is a task in IO wait state. 
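 *
 * Note that the prepare/finish pair below nests: io_schedule_prepare()
 * returns the previous in_iowait value and io_schedule_finish() restores
 * it, so a caller that is already in iowait stays marked across an inner
 * io_schedule().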
8085 */ 8086 long __sched io_schedule_timeout(long timeout) 8087 { 8088 int token; 8089 long ret; 8090 8091 token = io_schedule_prepare(); 8092 ret = schedule_timeout(timeout); 8093 io_schedule_finish(token); 8094 8095 return ret; 8096 } 8097 EXPORT_SYMBOL(io_schedule_timeout); 8098 8099 void __sched io_schedule(void) 8100 { 8101 int token; 8102 8103 token = io_schedule_prepare(); 8104 schedule(); 8105 io_schedule_finish(token); 8106 } 8107 EXPORT_SYMBOL(io_schedule); 8108 8109 void sched_show_task(struct task_struct *p) 8110 { 8111 unsigned long free; 8112 int ppid; 8113 8114 if (!try_get_task_stack(p)) 8115 return; 8116 8117 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); 8118 8119 if (task_is_running(p)) 8120 pr_cont(" running task "); 8121 free = stack_not_used(p); 8122 ppid = 0; 8123 rcu_read_lock(); 8124 if (pid_alive(p)) 8125 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 8126 rcu_read_unlock(); 8127 pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n", 8128 free, task_pid_nr(p), task_tgid_nr(p), 8129 ppid, p->flags, read_task_thread_flags(p)); 8130 8131 print_worker_info(KERN_INFO, p); 8132 print_stop_info(KERN_INFO, p); 8133 print_scx_info(KERN_INFO, p); 8134 show_stack(p, NULL, KERN_INFO); 8135 put_task_stack(p); 8136 } 8137 EXPORT_SYMBOL_GPL(sched_show_task); 8138 8139 static inline bool 8140 state_filter_match(unsigned long state_filter, struct task_struct *p) 8141 { 8142 unsigned int state = READ_ONCE(p->__state); 8143 8144 /* no filter, everything matches */ 8145 if (!state_filter) 8146 return true; 8147 8148 /* filter, but doesn't match */ 8149 if (!(state & state_filter)) 8150 return false; 8151 8152 /* 8153 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows 8154 * TASK_KILLABLE). 8155 */ 8156 if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) 8157 return false; 8158 8159 return true; 8160 } 8161 8162 8163 void show_state_filter(unsigned int state_filter) 8164 { 8165 struct task_struct *g, *p; 8166 8167 rcu_read_lock(); 8168 for_each_process_thread(g, p) { 8169 /* 8170 * reset the NMI-timeout, listing all files on a slow 8171 * console might take a lot of time: 8172 * Also, reset softlockup watchdogs on all CPUs, because 8173 * another CPU might be blocked waiting for us to process 8174 * an IPI. 8175 */ 8176 touch_nmi_watchdog(); 8177 touch_all_softlockup_watchdogs(); 8178 if (state_filter_match(state_filter, p)) 8179 sched_show_task(p); 8180 } 8181 8182 if (!state_filter) 8183 sysrq_sched_debug_show(); 8184 8185 rcu_read_unlock(); 8186 /* 8187 * Only show locks if all tasks are dumped: 8188 */ 8189 if (!state_filter) 8190 debug_show_all_locks(); 8191 } 8192 8193 /** 8194 * init_idle - set up an idle thread for a given CPU 8195 * @idle: task in question 8196 * @cpu: CPU the idle task belongs to 8197 * 8198 * NOTE: this function does not set the idle thread's NEED_RESCHED 8199 * flag, to make booting more robust. 8200 */ 8201 void __init init_idle(struct task_struct *idle, int cpu) 8202 { 8203 struct affinity_context ac = (struct affinity_context) { 8204 .new_mask = cpumask_of(cpu), 8205 .flags = 0, 8206 }; 8207 struct rq *rq = cpu_rq(cpu); 8208 unsigned long flags; 8209 8210 raw_spin_lock_irqsave(&idle->pi_lock, flags); 8211 raw_spin_rq_lock(rq); 8212 8213 idle->__state = TASK_RUNNING; 8214 idle->se.exec_start = sched_clock(); 8215 /* 8216 * PF_KTHREAD should already be set at this point; regardless, make it 8217 * look like a proper per-CPU kthread. 
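         *
         * PF_NO_SETAFFINITY additionally makes sched_setaffinity() refuse
         * to move the idle task off its CPU.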
8218 */ 8219 idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; 8220 kthread_set_per_cpu(idle, cpu); 8221 8222 /* 8223 * No validation and serialization required at boot time and for 8224 * setting up the idle tasks of not yet online CPUs. 8225 */ 8226 set_cpus_allowed_common(idle, &ac); 8227 /* 8228 * We're having a chicken and egg problem, even though we are 8229 * holding rq->lock, the CPU isn't yet set to this CPU so the 8230 * lockdep check in task_group() will fail. 8231 * 8232 * Similar case to sched_fork(). / Alternatively we could 8233 * use task_rq_lock() here and obtain the other rq->lock. 8234 * 8235 * Silence PROVE_RCU 8236 */ 8237 rcu_read_lock(); 8238 __set_task_cpu(idle, cpu); 8239 rcu_read_unlock(); 8240 8241 rq->idle = idle; 8242 rq_set_donor(rq, idle); 8243 rcu_assign_pointer(rq->curr, idle); 8244 idle->on_rq = TASK_ON_RQ_QUEUED; 8245 idle->on_cpu = 1; 8246 raw_spin_rq_unlock(rq); 8247 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 8248 8249 /* Set the preempt count _outside_ the spinlocks! */ 8250 init_idle_preempt_count(idle, cpu); 8251 8252 /* 8253 * The idle tasks have their own, simple scheduling class: 8254 */ 8255 idle->sched_class = &idle_sched_class; 8256 ftrace_graph_init_idle_task(idle, cpu); 8257 vtime_init_idle(idle, cpu); 8258 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 8259 } 8260 8261 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 8262 const struct cpumask *trial) 8263 { 8264 int ret = 1; 8265 8266 if (cpumask_empty(cur)) 8267 return ret; 8268 8269 ret = dl_cpuset_cpumask_can_shrink(cur, trial); 8270 8271 return ret; 8272 } 8273 8274 int task_can_attach(struct task_struct *p) 8275 { 8276 int ret = 0; 8277 8278 /* 8279 * Kthreads which disallow setaffinity shouldn't be moved 8280 * to a new cpuset; we don't want to change their CPU 8281 * affinity and isolating such threads by their set of 8282 * allowed nodes is unnecessary. Thus, cpusets are not 8283 * applicable for such threads. This prevents checking for 8284 * success of set_cpus_allowed_ptr() on all attached tasks 8285 * before cpus_mask may be changed. 8286 */ 8287 if (p->flags & PF_NO_SETAFFINITY) 8288 ret = -EINVAL; 8289 8290 return ret; 8291 } 8292 8293 bool sched_smp_initialized __read_mostly; 8294 8295 #ifdef CONFIG_NUMA_BALANCING 8296 /* Migrate current task p to target_cpu */ 8297 int migrate_task_to(struct task_struct *p, int target_cpu) 8298 { 8299 struct migration_arg arg = { p, target_cpu }; 8300 int curr_cpu = task_cpu(p); 8301 8302 if (curr_cpu == target_cpu) 8303 return 0; 8304 8305 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) 8306 return -EINVAL; 8307 8308 /* TODO: This is not properly updating schedstats */ 8309 8310 trace_sched_move_numa(p, curr_cpu, target_cpu); 8311 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 8312 } 8313 8314 /* 8315 * Requeue a task on a given node and accurately track the number of NUMA 8316 * tasks on the runqueues 8317 */ 8318 void sched_setnuma(struct task_struct *p, int nid) 8319 { 8320 guard(task_rq_lock)(p); 8321 scoped_guard (sched_change, p, DEQUEUE_SAVE) 8322 p->numa_preferred_nid = nid; 8323 } 8324 #endif /* CONFIG_NUMA_BALANCING */ 8325 8326 #ifdef CONFIG_HOTPLUG_CPU 8327 /* 8328 * Invoked on the outgoing CPU in context of the CPU hotplug thread 8329 * after ensuring that there are no user space tasks left on the CPU. 8330 * 8331 * If there is a lazy mm in use on the hotplug thread, drop it and 8332 * switch to init_mm. 8333 * 8334 * The reference count on init_mm is dropped in finish_cpu(). 
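 *
 * Roughly: grab a lazy-TLB reference on init_mm, switch the hardware
 * context over with IRQs disabled, then drop the lazy reference that was
 * keeping the outgoing mm alive.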
8335 */ 8336 static void sched_force_init_mm(void) 8337 { 8338 struct mm_struct *mm = current->active_mm; 8339 8340 if (mm != &init_mm) { 8341 mmgrab_lazy_tlb(&init_mm); 8342 local_irq_disable(); 8343 current->active_mm = &init_mm; 8344 switch_mm_irqs_off(mm, &init_mm, current); 8345 local_irq_enable(); 8346 finish_arch_post_lock_switch(); 8347 mmdrop_lazy_tlb(mm); 8348 } 8349 8350 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ 8351 } 8352 8353 static int __balance_push_cpu_stop(void *arg) 8354 { 8355 struct task_struct *p = arg; 8356 struct rq *rq = this_rq(); 8357 struct rq_flags rf; 8358 int cpu; 8359 8360 scoped_guard (raw_spinlock_irq, &p->pi_lock) { 8361 /* 8362 * We may change the underlying rq, but the locks held will 8363 * appropriately be "transferred" when switching. 8364 */ 8365 context_unsafe_alias(rq); 8366 8367 cpu = select_fallback_rq(rq->cpu, p); 8368 8369 rq_lock(rq, &rf); 8370 update_rq_clock(rq); 8371 if (task_rq(p) == rq && task_on_rq_queued(p)) 8372 rq = __migrate_task(rq, &rf, p, cpu); 8373 rq_unlock(rq, &rf); 8374 } 8375 8376 put_task_struct(p); 8377 8378 return 0; 8379 } 8380 8381 static DEFINE_PER_CPU(struct cpu_stop_work, push_work); 8382 8383 /* 8384 * Ensure we only run per-cpu kthreads once the CPU goes !active. 8385 * 8386 * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only 8387 * effective when the hotplug motion is down. 8388 */ 8389 static void balance_push(struct rq *rq) 8390 __must_hold(__rq_lockp(rq)) 8391 { 8392 struct task_struct *push_task = rq->curr; 8393 8394 lockdep_assert_rq_held(rq); 8395 8396 /* 8397 * Ensure the thing is persistent until balance_push_set(.on = false); 8398 */ 8399 rq->balance_callback = &balance_push_callback; 8400 8401 /* 8402 * Only active while going offline and when invoked on the outgoing 8403 * CPU. 8404 */ 8405 if (!cpu_dying(rq->cpu) || rq != this_rq()) 8406 return; 8407 8408 /* 8409 * Both the cpu-hotplug and stop task are in this case and are 8410 * required to complete the hotplug process. 8411 */ 8412 if (kthread_is_per_cpu(push_task) || 8413 is_migration_disabled(push_task)) { 8414 8415 /* 8416 * If this is the idle task on the outgoing CPU try to wake 8417 * up the hotplug control thread which might wait for the 8418 * last task to vanish. The rcuwait_active() check is 8419 * accurate here because the waiter is pinned on this CPU 8420 * and can't obviously be running in parallel. 8421 * 8422 * On RT kernels this also has to check whether there are 8423 * pinned and scheduled out tasks on the runqueue. They 8424 * need to leave the migrate disabled section first. 8425 */ 8426 if (!rq->nr_running && !rq_has_pinned_tasks(rq) && 8427 rcuwait_active(&rq->hotplug_wait)) { 8428 raw_spin_rq_unlock(rq); 8429 rcuwait_wake_up(&rq->hotplug_wait); 8430 raw_spin_rq_lock(rq); 8431 } 8432 return; 8433 } 8434 8435 get_task_struct(push_task); 8436 /* 8437 * Temporarily drop rq->lock such that we can wake-up the stop task. 8438 * Both preemption and IRQs are still disabled. 8439 */ 8440 preempt_disable(); 8441 raw_spin_rq_unlock(rq); 8442 stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, 8443 this_cpu_ptr(&push_work)); 8444 preempt_enable(); 8445 /* 8446 * At this point need_resched() is true and we'll take the loop in 8447 * schedule(). The next pick is obviously going to be the stop task 8448 * which kthread_is_per_cpu() and will push this task away. 
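         *
         * (The stop class sits above every other scheduling class, so once
         * its kthread is runnable it wins that pick unconditionally.)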
8449 */ 8450 raw_spin_rq_lock(rq); 8451 } 8452 8453 static void balance_push_set(int cpu, bool on) 8454 { 8455 struct rq *rq = cpu_rq(cpu); 8456 struct rq_flags rf; 8457 8458 rq_lock_irqsave(rq, &rf); 8459 if (on) { 8460 WARN_ON_ONCE(rq->balance_callback); 8461 rq->balance_callback = &balance_push_callback; 8462 } else if (rq->balance_callback == &balance_push_callback) { 8463 rq->balance_callback = NULL; 8464 } 8465 rq_unlock_irqrestore(rq, &rf); 8466 } 8467 8468 /* 8469 * Invoked from a CPUs hotplug control thread after the CPU has been marked 8470 * inactive. All tasks which are not per CPU kernel threads are either 8471 * pushed off this CPU now via balance_push() or placed on a different CPU 8472 * during wakeup. Wait until the CPU is quiescent. 8473 */ 8474 static void balance_hotplug_wait(void) 8475 { 8476 struct rq *rq = this_rq(); 8477 8478 rcuwait_wait_event(&rq->hotplug_wait, 8479 rq->nr_running == 1 && !rq_has_pinned_tasks(rq), 8480 TASK_UNINTERRUPTIBLE); 8481 } 8482 8483 #else /* !CONFIG_HOTPLUG_CPU: */ 8484 8485 static inline void balance_push(struct rq *rq) 8486 { 8487 } 8488 8489 static inline void balance_push_set(int cpu, bool on) 8490 { 8491 } 8492 8493 static inline void balance_hotplug_wait(void) 8494 { 8495 } 8496 8497 #endif /* !CONFIG_HOTPLUG_CPU */ 8498 8499 void set_rq_online(struct rq *rq) 8500 { 8501 if (!rq->online) { 8502 const struct sched_class *class; 8503 8504 cpumask_set_cpu(rq->cpu, rq->rd->online); 8505 rq->online = 1; 8506 8507 for_each_class(class) { 8508 if (class->rq_online) 8509 class->rq_online(rq); 8510 } 8511 } 8512 } 8513 8514 void set_rq_offline(struct rq *rq) 8515 { 8516 if (rq->online) { 8517 const struct sched_class *class; 8518 8519 update_rq_clock(rq); 8520 for_each_class(class) { 8521 if (class->rq_offline) 8522 class->rq_offline(rq); 8523 } 8524 8525 cpumask_clear_cpu(rq->cpu, rq->rd->online); 8526 rq->online = 0; 8527 } 8528 } 8529 8530 static inline void sched_set_rq_online(struct rq *rq, int cpu) 8531 { 8532 struct rq_flags rf; 8533 8534 rq_lock_irqsave(rq, &rf); 8535 if (rq->rd) { 8536 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 8537 set_rq_online(rq); 8538 } 8539 rq_unlock_irqrestore(rq, &rf); 8540 } 8541 8542 static inline void sched_set_rq_offline(struct rq *rq, int cpu) 8543 { 8544 struct rq_flags rf; 8545 8546 rq_lock_irqsave(rq, &rf); 8547 if (rq->rd) { 8548 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 8549 set_rq_offline(rq); 8550 } 8551 rq_unlock_irqrestore(rq, &rf); 8552 } 8553 8554 /* 8555 * used to mark begin/end of suspend/resume: 8556 */ 8557 static int num_cpus_frozen; 8558 8559 /* 8560 * Update cpusets according to cpu_active mask. If cpusets are 8561 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 8562 * around partition_sched_domains(). 8563 * 8564 * If we come here as part of a suspend/resume, don't touch cpusets because we 8565 * want to restore it back to its original state upon resume anyway. 8566 */ 8567 static void cpuset_cpu_active(void) 8568 { 8569 if (cpuhp_tasks_frozen) { 8570 /* 8571 * num_cpus_frozen tracks how many CPUs are involved in suspend 8572 * resume sequence. As long as this is not the last online 8573 * operation in the resume sequence, just build a single sched 8574 * domain, ignoring cpusets. 8575 */ 8576 cpuset_reset_sched_domains(); 8577 if (--num_cpus_frozen) 8578 return; 8579 /* 8580 * This is the last CPU online operation. So fall through and 8581 * restore the original sched domains by considering the 8582 * cpuset configurations. 
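                 *
                 * E.g. on an 8-CPU system, suspend offlines CPUs 1-7 and
                 * leaves num_cpus_frozen at 7; the first six resume
                 * onlinings decrement it and return early above, and only
                 * the seventh, last one falls through to here.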
8583 */ 8584 cpuset_force_rebuild(); 8585 } 8586 cpuset_update_active_cpus(); 8587 } 8588 8589 static void cpuset_cpu_inactive(unsigned int cpu) 8590 { 8591 if (!cpuhp_tasks_frozen) { 8592 cpuset_update_active_cpus(); 8593 } else { 8594 num_cpus_frozen++; 8595 cpuset_reset_sched_domains(); 8596 } 8597 } 8598 8599 static inline void sched_smt_present_inc(int cpu) 8600 { 8601 #ifdef CONFIG_SCHED_SMT 8602 if (cpumask_weight(cpu_smt_mask(cpu)) == 2) 8603 static_branch_inc_cpuslocked(&sched_smt_present); 8604 #endif 8605 } 8606 8607 static inline void sched_smt_present_dec(int cpu) 8608 { 8609 #ifdef CONFIG_SCHED_SMT 8610 if (cpumask_weight(cpu_smt_mask(cpu)) == 2) 8611 static_branch_dec_cpuslocked(&sched_smt_present); 8612 #endif 8613 } 8614 8615 int sched_cpu_activate(unsigned int cpu) 8616 { 8617 struct rq *rq = cpu_rq(cpu); 8618 8619 /* 8620 * Clear the balance_push callback and prepare to schedule 8621 * regular tasks. 8622 */ 8623 balance_push_set(cpu, false); 8624 8625 /* 8626 * When going up, increment the number of cores with SMT present. 8627 */ 8628 sched_smt_present_inc(cpu); 8629 set_cpu_active(cpu, true); 8630 8631 if (sched_smp_initialized) { 8632 sched_update_numa(cpu, true); 8633 sched_domains_numa_masks_set(cpu); 8634 cpuset_cpu_active(); 8635 } 8636 8637 scx_rq_activate(rq); 8638 8639 /* 8640 * Put the rq online, if not already. This happens: 8641 * 8642 * 1) In the early boot process, because we build the real domains 8643 * after all CPUs have been brought up. 8644 * 8645 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the 8646 * domains. 8647 */ 8648 sched_set_rq_online(rq, cpu); 8649 8650 return 0; 8651 } 8652 8653 int sched_cpu_deactivate(unsigned int cpu) 8654 { 8655 struct rq *rq = cpu_rq(cpu); 8656 int ret; 8657 8658 ret = dl_bw_deactivate(cpu); 8659 8660 if (ret) 8661 return ret; 8662 8663 /* 8664 * Remove CPU from nohz.idle_cpus_mask to prevent participating in 8665 * load balancing when not active 8666 */ 8667 nohz_balance_exit_idle(rq); 8668 8669 set_cpu_active(cpu, false); 8670 8671 /* 8672 * From this point forward, this CPU will refuse to run any task that 8673 * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively 8674 * push those tasks away until this gets cleared, see 8675 * sched_cpu_dying(). 8676 */ 8677 balance_push_set(cpu, true); 8678 8679 /* 8680 * We've cleared cpu_active_mask / set balance_push, wait for all 8681 * preempt-disabled and RCU users of this state to go away such that 8682 * all new such users will observe it. 8683 * 8684 * Specifically, we rely on ttwu to no longer target this CPU, see 8685 * ttwu_queue_cond() and is_cpu_allowed(). 8686 * 8687 * Do sync before park smpboot threads to take care the RCU boost case. 8688 */ 8689 synchronize_rcu(); 8690 8691 sched_set_rq_offline(rq, cpu); 8692 8693 scx_rq_deactivate(rq); 8694 8695 /* 8696 * When going down, decrement the number of cores with SMT present. 
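         *
         * The cpumask_weight() == 2 check in the helper means the static
         * branch only flips on the transition between two and one online
         * SMT siblings, not on every hotplug of a secondary thread.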
         */
        sched_smt_present_dec(cpu);

#ifdef CONFIG_SCHED_SMT
        sched_core_cpu_deactivate(cpu);
#endif

        if (!sched_smp_initialized)
                return 0;

        sched_update_numa(cpu, false);
        cpuset_cpu_inactive(cpu);
        sched_domains_numa_masks_clear(cpu);
        return 0;
}

static void sched_rq_cpu_starting(unsigned int cpu)
{
        struct rq *rq = cpu_rq(cpu);

        rq->calc_load_update = calc_load_update;
        update_max_interval();
}

int sched_cpu_starting(unsigned int cpu)
{
        sched_core_cpu_starting(cpu);
        sched_rq_cpu_starting(cpu);
        sched_tick_start(cpu);
        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Invoked immediately before the stopper thread is invoked to bring the
 * CPU down completely. At this point all per-CPU kthreads except the
 * hotplug thread (current) and the stopper thread (inactive) have either
 * been parked or been unbound from the outgoing CPU. Ensure that any of
 * those which might be on the way out are gone.
 *
 * If after this point a bound task is being woken on this CPU then the
 * responsible hotplug callback has failed to do its job.
 * sched_cpu_dying() will catch it with the appropriate fireworks.
 */
int sched_cpu_wait_empty(unsigned int cpu)
{
        balance_hotplug_wait();
        sched_force_init_mm();
        return 0;
}

/*
 * Since this CPU is going 'away' for a while, fold any nr_active delta we
 * might have. Called from the CPU stopper task after ensuring that the
 * stopper is the last running task on the CPU, so the nr_active count is
 * stable. We need to take the tear-down thread which is calling this into
 * account, so we hand in adjust = 1 to the load calculation.
 *
 * Also see the comment "Global load-average calculations".
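 *
 * I.e. at this point rq->nr_running is 1 (just the stopper itself), and
 * adjust = 1 makes calc_load_fold_active() discount it, so the dying CPU
 * does not contribute a phantom runnable task to the global load average.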
8757 */ 8758 static void calc_load_migrate(struct rq *rq) 8759 { 8760 long delta = calc_load_fold_active(rq, 1); 8761 8762 if (delta) 8763 atomic_long_add(delta, &calc_load_tasks); 8764 } 8765 8766 static void dump_rq_tasks(struct rq *rq, const char *loglvl) 8767 { 8768 struct task_struct *g, *p; 8769 int cpu = cpu_of(rq); 8770 8771 lockdep_assert_rq_held(rq); 8772 8773 printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); 8774 for_each_process_thread(g, p) { 8775 if (task_cpu(p) != cpu) 8776 continue; 8777 8778 if (!task_on_rq_queued(p)) 8779 continue; 8780 8781 printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); 8782 } 8783 } 8784 8785 int sched_cpu_dying(unsigned int cpu) 8786 { 8787 struct rq *rq = cpu_rq(cpu); 8788 struct rq_flags rf; 8789 8790 /* Handle pending wakeups and then migrate everything off */ 8791 sched_tick_stop(cpu); 8792 8793 rq_lock_irqsave(rq, &rf); 8794 update_rq_clock(rq); 8795 if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { 8796 WARN(true, "Dying CPU not properly vacated!"); 8797 dump_rq_tasks(rq, KERN_WARNING); 8798 } 8799 dl_server_stop(&rq->fair_server); 8800 #ifdef CONFIG_SCHED_CLASS_EXT 8801 dl_server_stop(&rq->ext_server); 8802 #endif 8803 rq_unlock_irqrestore(rq, &rf); 8804 8805 calc_load_migrate(rq); 8806 update_max_interval(); 8807 hrtick_clear(rq); 8808 sched_core_cpu_dying(cpu); 8809 return 0; 8810 } 8811 #endif /* CONFIG_HOTPLUG_CPU */ 8812 8813 void __init sched_init_smp(void) 8814 { 8815 sched_init_numa(NUMA_NO_NODE); 8816 8817 prandom_init_once(&sched_rnd_state); 8818 8819 /* 8820 * There's no userspace yet to cause hotplug operations; hence all the 8821 * CPU masks are stable and all blatant races in the below code cannot 8822 * happen. 8823 */ 8824 sched_domains_mutex_lock(); 8825 sched_init_domains(cpu_active_mask); 8826 sched_domains_mutex_unlock(); 8827 8828 /* Move init over to a non-isolated CPU */ 8829 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) 8830 BUG(); 8831 current->flags &= ~PF_NO_SETAFFINITY; 8832 sched_init_granularity(); 8833 8834 init_sched_rt_class(); 8835 init_sched_dl_class(); 8836 8837 sched_init_dl_servers(); 8838 8839 sched_smp_initialized = true; 8840 } 8841 8842 static int __init migration_init(void) 8843 { 8844 sched_cpu_starting(smp_processor_id()); 8845 return 0; 8846 } 8847 early_initcall(migration_init); 8848 8849 int in_sched_functions(unsigned long addr) 8850 { 8851 return in_lock_functions(addr) || 8852 (addr >= (unsigned long)__sched_text_start 8853 && addr < (unsigned long)__sched_text_end); 8854 } 8855 8856 #ifdef CONFIG_CGROUP_SCHED 8857 /* 8858 * Default task group. 8859 * Every task in system belongs to this group at bootup. 
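 *
 * Every other task group, whether created through the cpu cgroup
 * controller or by the autogroup code, ends up as a descendant of this
 * root group.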
8860 */ 8861 struct task_group root_task_group; 8862 LIST_HEAD(task_groups); 8863 8864 /* Cacheline aligned slab cache for task_group */ 8865 static struct kmem_cache *task_group_cache __ro_after_init; 8866 #endif 8867 8868 void __init sched_init(void) 8869 { 8870 unsigned long ptr = 0; 8871 int i; 8872 8873 /* Make sure the linker didn't screw up */ 8874 BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); 8875 BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); 8876 BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); 8877 BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); 8878 #ifdef CONFIG_SCHED_CLASS_EXT 8879 BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); 8880 BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); 8881 #endif 8882 8883 wait_bit_init(); 8884 8885 #ifdef CONFIG_FAIR_GROUP_SCHED 8886 ptr += 2 * nr_cpu_ids * sizeof(void **); 8887 #endif 8888 #ifdef CONFIG_RT_GROUP_SCHED 8889 ptr += 2 * nr_cpu_ids * sizeof(void **); 8890 #endif 8891 if (ptr) { 8892 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); 8893 8894 #ifdef CONFIG_FAIR_GROUP_SCHED 8895 root_task_group.se = (struct sched_entity **)ptr; 8896 ptr += nr_cpu_ids * sizeof(void **); 8897 8898 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8899 ptr += nr_cpu_ids * sizeof(void **); 8900 8901 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 8902 init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); 8903 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8904 #ifdef CONFIG_EXT_GROUP_SCHED 8905 scx_tg_init(&root_task_group); 8906 #endif /* CONFIG_EXT_GROUP_SCHED */ 8907 #ifdef CONFIG_RT_GROUP_SCHED 8908 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 8909 ptr += nr_cpu_ids * sizeof(void **); 8910 8911 root_task_group.rt_rq = (struct rt_rq **)ptr; 8912 ptr += nr_cpu_ids * sizeof(void **); 8913 8914 #endif /* CONFIG_RT_GROUP_SCHED */ 8915 } 8916 8917 init_defrootdomain(); 8918 8919 #ifdef CONFIG_RT_GROUP_SCHED 8920 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8921 global_rt_period(), global_rt_runtime()); 8922 #endif /* CONFIG_RT_GROUP_SCHED */ 8923 8924 #ifdef CONFIG_CGROUP_SCHED 8925 task_group_cache = KMEM_CACHE(task_group, 0); 8926 8927 list_add(&root_task_group.list, &task_groups); 8928 INIT_LIST_HEAD(&root_task_group.children); 8929 INIT_LIST_HEAD(&root_task_group.siblings); 8930 autogroup_init(&init_task); 8931 #endif /* CONFIG_CGROUP_SCHED */ 8932 8933 for_each_possible_cpu(i) { 8934 struct rq *rq; 8935 8936 rq = cpu_rq(i); 8937 raw_spin_lock_init(&rq->__lock); 8938 rq->nr_running = 0; 8939 rq->calc_load_active = 0; 8940 rq->calc_load_update = jiffies + LOAD_FREQ; 8941 init_cfs_rq(&rq->cfs); 8942 init_rt_rq(&rq->rt); 8943 init_dl_rq(&rq->dl); 8944 #ifdef CONFIG_FAIR_GROUP_SCHED 8945 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 8946 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; 8947 /* 8948 * How much CPU bandwidth does root_task_group get? 8949 * 8950 * In case of task-groups formed through the cgroup filesystem, it 8951 * gets 100% of the CPU resources in the system. This overall 8952 * system CPU resource is divided among the tasks of 8953 * root_task_group and its child task-groups in a fair manner, 8954 * based on each entity's (task or task-group's) weight 8955 * (se->load.weight). 
                 *
                 * In other words, if root_task_group has 10 tasks (of weight
                 * 1024) and two child groups A0 and A1 (of weight 1024 each),
                 * then A0's share of the CPU resource is:
                 *
                 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
                 * We achieve this by letting root_task_group's tasks sit
                 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
                 */
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
                /*
                 * This is required for the init CPU because
                 * rt.c:__enable_runtime() starts working after
                 * scheduler_running, which is not the case yet.
                 */
                rq->rt.rt_runtime = global_rt_runtime();
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
                rq->next_class = &idle_sched_class;

                rq->sd = NULL;
                rq->rd = NULL;
                rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                rq->balance_callback = &balance_push_callback;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;

                INIT_LIST_HEAD(&rq->cfs_tasks);

                rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
                rq->last_blocked_load_update_tick = jiffies;
                atomic_set(&rq->nohz_flags, 0);

                INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
#endif
#ifdef CONFIG_HOTPLUG_CPU
                rcuwait_init(&rq->hotplug_wait);
#endif
                hrtick_rq_init(rq);
                atomic_set(&rq->nr_iowait, 0);
                fair_server_init(rq);
#ifdef CONFIG_SCHED_CLASS_EXT
                ext_server_init(rq);
#endif

#ifdef CONFIG_SCHED_CORE
                rq->core = rq;
                rq->core_pick = NULL;
                rq->core_dl_server = NULL;
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
                rq->core_forceidle_count = 0;
                rq->core_forceidle_occupation = 0;
                rq->core_forceidle_start = 0;

                rq->core_cookie = 0UL;
#endif
                zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
        }

        set_load_weight(&init_task, false);
        init_task.se.slice = sysctl_sched_base_slice;

        /*
         * The boot idle thread does lazy MMU switching as well:
         */
        mmgrab_lazy_tlb(&init_mm);
        enter_lazy_tlb(&init_mm, current);

        /*
         * The idle task doesn't need the kthread struct to function, but it
         * is dressed up as a per-CPU kthread and thus needs to play the part
         * if we want to avoid special-casing it in code that deals with per-CPU
         * kthreads.
         */
        WARN_ON(!set_kthread_struct(current));

        /*
         * Make us the idle thread. Technically, schedule() should not be
         * called from this thread, however somewhere below it might be,
         * but because we are the idle thread, we just pick up running again
         * when this runqueue becomes "idle".
9049 */ 9050 __sched_fork(0, current); 9051 init_idle(current, smp_processor_id()); 9052 9053 calc_load_update = jiffies + LOAD_FREQ; 9054 9055 idle_thread_set_boot_cpu(); 9056 9057 balance_push_set(smp_processor_id(), false); 9058 init_sched_fair_class(); 9059 init_sched_ext_class(); 9060 9061 psi_init(); 9062 9063 init_uclamp(); 9064 9065 preempt_dynamic_init(); 9066 9067 scheduler_running = 1; 9068 } 9069 9070 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 9071 9072 void __might_sleep(const char *file, int line) 9073 { 9074 unsigned int state = get_current_state(); 9075 /* 9076 * Blocking primitives will set (and therefore destroy) current->state, 9077 * since we will exit with TASK_RUNNING make sure we enter with it, 9078 * otherwise we will destroy state. 9079 */ 9080 WARN_ONCE(state != TASK_RUNNING && current->task_state_change, 9081 "do not call blocking ops when !TASK_RUNNING; " 9082 "state=%x set at [<%p>] %pS\n", state, 9083 (void *)current->task_state_change, 9084 (void *)current->task_state_change); 9085 9086 __might_resched(file, line, 0); 9087 } 9088 EXPORT_SYMBOL(__might_sleep); 9089 9090 static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) 9091 { 9092 if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) 9093 return; 9094 9095 if (preempt_count() == preempt_offset) 9096 return; 9097 9098 pr_err("Preemption disabled at:"); 9099 print_ip_sym(KERN_ERR, ip); 9100 } 9101 9102 static inline bool resched_offsets_ok(unsigned int offsets) 9103 { 9104 unsigned int nested = preempt_count(); 9105 9106 nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; 9107 9108 return nested == offsets; 9109 } 9110 9111 void __might_resched(const char *file, int line, unsigned int offsets) 9112 { 9113 /* Ratelimiting timestamp: */ 9114 static unsigned long prev_jiffy; 9115 9116 unsigned long preempt_disable_ip; 9117 9118 /* WARN_ON_ONCE() by default, no rate limit required: */ 9119 rcu_sleep_check(); 9120 9121 if ((resched_offsets_ok(offsets) && !irqs_disabled() && 9122 !is_idle_task(current) && !current->non_block_count) || 9123 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || 9124 oops_in_progress) 9125 return; 9126 9127 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9128 return; 9129 prev_jiffy = jiffies; 9130 9131 /* Save this before calling printk(), since that will clobber it: */ 9132 preempt_disable_ip = get_preempt_disable_ip(current); 9133 9134 pr_err("BUG: sleeping function called from invalid context at %s:%d\n", 9135 file, line); 9136 pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", 9137 in_atomic(), irqs_disabled(), current->non_block_count, 9138 current->pid, current->comm); 9139 pr_err("preempt_count: %x, expected: %x\n", preempt_count(), 9140 offsets & MIGHT_RESCHED_PREEMPT_MASK); 9141 9142 if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { 9143 pr_err("RCU nest depth: %d, expected: %u\n", 9144 rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); 9145 } 9146 9147 if (task_stack_end_corrupted(current)) 9148 pr_emerg("Thread overran stack, or stack corrupted\n"); 9149 9150 debug_show_held_locks(current); 9151 if (irqs_disabled()) 9152 print_irqtrace_events(current); 9153 9154 print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, 9155 preempt_disable_ip); 9156 9157 dump_stack(); 9158 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 9159 } 9160 EXPORT_SYMBOL(__might_resched); 9161 9162 void __cant_sleep(const char *file, int line, int preempt_offset) 9163 { 9164 static unsigned long prev_jiffy; 9165 9166 if (irqs_disabled()) 9167 
return; 9168 9169 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) 9170 return; 9171 9172 if (preempt_count() > preempt_offset) 9173 return; 9174 9175 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9176 return; 9177 prev_jiffy = jiffies; 9178 9179 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); 9180 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 9181 in_atomic(), irqs_disabled(), 9182 current->pid, current->comm); 9183 9184 debug_show_held_locks(current); 9185 dump_stack(); 9186 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 9187 } 9188 EXPORT_SYMBOL_GPL(__cant_sleep); 9189 9190 # ifdef CONFIG_SMP 9191 void __cant_migrate(const char *file, int line) 9192 { 9193 static unsigned long prev_jiffy; 9194 9195 if (irqs_disabled()) 9196 return; 9197 9198 if (is_migration_disabled(current)) 9199 return; 9200 9201 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) 9202 return; 9203 9204 if (preempt_count() > 0) 9205 return; 9206 9207 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9208 return; 9209 prev_jiffy = jiffies; 9210 9211 pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); 9212 pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", 9213 in_atomic(), irqs_disabled(), is_migration_disabled(current), 9214 current->pid, current->comm); 9215 9216 debug_show_held_locks(current); 9217 dump_stack(); 9218 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 9219 } 9220 EXPORT_SYMBOL_GPL(__cant_migrate); 9221 # endif /* CONFIG_SMP */ 9222 #endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ 9223 9224 #ifdef CONFIG_MAGIC_SYSRQ 9225 void normalize_rt_tasks(void) 9226 { 9227 struct task_struct *g, *p; 9228 struct sched_attr attr = { 9229 .sched_policy = SCHED_NORMAL, 9230 }; 9231 9232 read_lock(&tasklist_lock); 9233 for_each_process_thread(g, p) { 9234 /* 9235 * Only normalize user tasks: 9236 */ 9237 if (p->flags & PF_KTHREAD) 9238 continue; 9239 9240 p->se.exec_start = 0; 9241 schedstat_set(p->stats.wait_start, 0); 9242 schedstat_set(p->stats.sleep_start, 0); 9243 schedstat_set(p->stats.block_start, 0); 9244 9245 if (!rt_or_dl_task(p)) { 9246 /* 9247 * Renice negative nice level userspace 9248 * tasks back to 0: 9249 */ 9250 if (task_nice(p) < 0) 9251 set_user_nice(p, 0); 9252 continue; 9253 } 9254 9255 __sched_setscheduler(p, &attr, false, false); 9256 } 9257 read_unlock(&tasklist_lock); 9258 } 9259 9260 #endif /* CONFIG_MAGIC_SYSRQ */ 9261 9262 #ifdef CONFIG_KGDB_KDB 9263 /* 9264 * These functions are only useful for KDB. 9265 * 9266 * They can only be called when the whole system has been 9267 * stopped - every CPU needs to be quiescent, and no scheduling 9268 * activity can take place. Using them for anything else would 9269 * be a serious bug, and as a result, they aren't even visible 9270 * under any other configuration. 9271 */ 9272 9273 /** 9274 * curr_task - return the current task for a given CPU. 9275 * @cpu: the processor in question. 9276 * 9277 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9278 * 9279 * Return: The current task for @cpu. 
 */
struct task_struct *curr_task(int cpu)
{
        return cpu_curr(cpu);
}

#endif /* CONFIG_KGDB_KDB */

#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);

static inline void alloc_uclamp_sched_group(struct task_group *tg,
                                            struct task_group *parent)
{
#ifdef CONFIG_UCLAMP_TASK_GROUP
        enum uclamp_id clamp_id;

        for_each_clamp_id(clamp_id) {
                uclamp_se_set(&tg->uclamp_req[clamp_id],
                              uclamp_none(clamp_id), false);
                tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
        }
#endif
}

static void sched_free_group(struct task_group *tg)
{
        free_fair_sched_group(tg);
        free_rt_sched_group(tg);
        autogroup_free(tg);
        kmem_cache_free(task_group_cache, tg);
}

static void sched_free_group_rcu(struct rcu_head *rcu)
{
        sched_free_group(container_of(rcu, struct task_group, rcu));
}

static void sched_unregister_group(struct task_group *tg)
{
        unregister_fair_sched_group(tg);
        unregister_rt_sched_group(tg);
        /*
         * We have to wait for yet another RCU grace period to expire, as
         * print_cfs_stats() might run concurrently.
         */
        call_rcu(&tg->rcu, sched_free_group_rcu);
}

/* allocate runqueue etc. for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
        struct task_group *tg;

        tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
        if (!tg)
                return ERR_PTR(-ENOMEM);

        if (!alloc_fair_sched_group(tg, parent))
                goto err;

        if (!alloc_rt_sched_group(tg, parent))
                goto err;

        scx_tg_init(tg);
        alloc_uclamp_sched_group(tg, parent);

        return tg;

err:
        sched_free_group(tg);
        return ERR_PTR(-ENOMEM);
}

void sched_online_group(struct task_group *tg, struct task_group *parent)
{
        unsigned long flags;

        spin_lock_irqsave(&task_group_lock, flags);
        list_add_tail_rcu(&tg->list, &task_groups);

        /* Root should already exist: */
        WARN_ON(!parent);

        tg->parent = parent;
        INIT_LIST_HEAD(&tg->children);
        list_add_rcu(&tg->siblings, &parent->children);
        spin_unlock_irqrestore(&task_group_lock, flags);

        online_fair_sched_group(tg);
}

/* RCU callback to free various structures associated with a task group */
static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
        /* Now it should be safe to free those cfs_rqs: */
        sched_unregister_group(container_of(rhp, struct task_group, rcu));
}

void sched_destroy_group(struct task_group *tg)
{
        /* Wait for possible concurrent references to cfs_rqs to complete: */
        call_rcu(&tg->rcu, sched_unregister_group_rcu);
}

void sched_release_group(struct task_group *tg)
{
        unsigned long flags;

        /*
         * Unlink first, to avoid walk_tg_tree_from() finding us (via
         * sched_cfs_period_timer()).
         *
         * For this to be effective, we have to wait for all pending users of
         * this task group to leave their RCU critical section to ensure no new
         * user will see our dying task group any more. Specifically ensure
         * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
         *
         * We therefore defer calling unregister_fair_sched_group() to
         * sched_unregister_group(), which is guaranteed to get called only
         * after the current RCU grace period has expired.
         */
        spin_lock_irqsave(&task_group_lock, flags);
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
}

static void sched_change_group(struct task_struct *tsk)
{
        struct task_group *tg;

        /*
         * All callers are synchronized by task_rq_lock(); we do not use RCU,
         * which would be pointless here. Thus, we pass "true" to
         * task_css_check() to prevent lockdep warnings.
         */
        tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
                          struct task_group, css);
        tg = autogroup_task_group(tsk, tg);
        tsk->sched_task_group = tg;

#ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_change_group)
                tsk->sched_class->task_change_group(tsk);
        else
#endif
                set_task_rq(tsk, task_cpu(tsk));
}

/*
 * Change a task's runqueue when it moves between groups.
 *
 * The caller of this function should have put the task in its new group by
 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
 * its new group.
 */
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
        unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
        bool resched = false;
        bool queued = false;
        struct rq *rq;

        CLASS(task_rq_lock, rq_guard)(tsk);
        rq = rq_guard.rq;

        scoped_guard (sched_change, tsk, queue_flags) {
                sched_change_group(tsk);
                if (!for_autogroup)
                        scx_cgroup_move_task(tsk);
                if (scope->running)
                        resched = true;
                queued = scope->queued;
        }

        if (resched)
                resched_curr(rq);
        else if (queued)
                wakeup_preempt(rq, tsk, 0);

        __balance_callbacks(rq, &rq_guard.rf);
}

static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct task_group *parent = css_tg(parent_css);
        struct task_group *tg;

        if (!parent) {
                /* This is early initialization for the top cgroup */
                return &root_task_group.css;
        }

        tg = sched_create_group(parent);
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);

        return &tg->css;
}

/* Expose the task group only after completing cgroup initialization */
static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
        struct task_group *tg = css_tg(css);
        struct task_group *parent = css_tg(css->parent);
        int ret;

        ret = scx_tg_online(tg);
        if (ret)
                return ret;

        if (parent)
                sched_online_group(tg, parent);

#ifdef CONFIG_UCLAMP_TASK_GROUP
        /* Propagate the effective uclamp value for the new group */
        guard(mutex)(&uclamp_mutex);
        guard(rcu)();
        cpu_util_update_eff(css);
#endif

        return 0;
}

static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct task_group *tg = css_tg(css);

        scx_tg_offline(tg);
}

static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
        struct task_group *tg = css_tg(css);

        sched_release_group(tg);
}

static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct task_group *tg = css_tg(css);

        /*
         * Relies on the RCU grace period between css_released() and this.
         */
        sched_unregister_group(tg);
}

static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
#ifdef CONFIG_RT_GROUP_SCHED
        struct task_struct *task;
        struct cgroup_subsys_state *css;

        if (!rt_group_sched_enabled())
                goto scx_check;

        cgroup_taskset_for_each(task, css, tset) {
                if (!sched_rt_can_attach(css_tg(css), task))
                        return -EINVAL;
        }
scx_check:
#endif /* CONFIG_RT_GROUP_SCHED */
        return scx_cgroup_can_attach(tset);
}

static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *css;

        cgroup_taskset_for_each(task, css, tset)
                sched_move_task(task, false);
}

static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
        scx_cgroup_cancel_attach(tset);
}

#ifdef CONFIG_UCLAMP_TASK_GROUP
static void cpu_util_update_eff(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *top_css = css;
        struct uclamp_se *uc_parent = NULL;
        struct uclamp_se *uc_se = NULL;
        unsigned int eff[UCLAMP_CNT];
        enum uclamp_id clamp_id;
        unsigned int clamps;

        lockdep_assert_held(&uclamp_mutex);
        WARN_ON_ONCE(!rcu_read_lock_held());

        css_for_each_descendant_pre(css, top_css) {
                uc_parent = css_tg(css)->parent
                        ? css_tg(css)->parent->uclamp : NULL;

                for_each_clamp_id(clamp_id) {
                        /* Assume effective clamps match requested clamps */
                        eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
                        /* Cap effective clamps with parent's effective clamps */
                        if (uc_parent &&
                            eff[clamp_id] > uc_parent[clamp_id].value) {
                                eff[clamp_id] = uc_parent[clamp_id].value;
                        }
                }
                /* Ensure protection is always capped by limit */
                eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);

                /* Propagate most restrictive effective clamps */
                clamps = 0x0;
                uc_se = css_tg(css)->uclamp;
                for_each_clamp_id(clamp_id) {
                        if (eff[clamp_id] == uc_se[clamp_id].value)
                                continue;
                        uc_se[clamp_id].value = eff[clamp_id];
                        uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
                        clamps |= (0x1 << clamp_id);
                }
                if (!clamps) {
                        css = css_rightmost_descendant(css);
                        continue;
                }

                /* Immediately update descendants' RUNNABLE tasks */
                uclamp_update_active_tasks(css);
        }
}

/*
 * Integer 10^N, with a given N exponent, obtained by casting the literal
 * "1eN" C expression to integer. Since there is no way to convert a macro
 * argument (N) into a character constant, use two levels of macros.
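 *
 * E.g. POW10(2) expands to _POW10(2) and from there to
 * ((unsigned int)1e2), i.e. 100, making UCLAMP_PERCENT_SCALE below
 * work out to 100 * 100 == 10000.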
 */
#define _POW10(exp) ((unsigned int)1e##exp)
#define POW10(exp) _POW10(exp)

struct uclamp_request {
#define UCLAMP_PERCENT_SHIFT	2
#define UCLAMP_PERCENT_SCALE	(100 * POW10(UCLAMP_PERCENT_SHIFT))
        s64 percent;
        u64 util;
        int ret;
};

static inline struct uclamp_request
capacity_from_percent(char *buf)
{
        struct uclamp_request req = {
                .percent = UCLAMP_PERCENT_SCALE,
                .util = SCHED_CAPACITY_SCALE,
                .ret = 0,
        };

        buf = strim(buf);
        if (strcmp(buf, "max")) {
                req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
                                             &req.percent);
                if (req.ret)
                        return req;
                if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
                        req.ret = -ERANGE;
                        return req;
                }

                req.util = req.percent << SCHED_CAPACITY_SHIFT;
                req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
        }

        return req;
}

static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
                                size_t nbytes, loff_t off,
                                enum uclamp_id clamp_id)
{
        struct uclamp_request req;
        struct task_group *tg;

        req = capacity_from_percent(buf);
        if (req.ret)
                return req.ret;

        sched_uclamp_enable();

        guard(mutex)(&uclamp_mutex);
        guard(rcu)();

        tg = css_tg(of_css(of));
        if (tg->uclamp_req[clamp_id].value != req.util)
                uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);

        /*
         * Because the conversion rounding is not recoverable, we keep
         * track of the exact requested value.
         */
        tg->uclamp_pct[clamp_id] = req.percent;

        /* Update effective clamps to track the most restrictive value */
        cpu_util_update_eff(of_css(of));

        return nbytes;
}

static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes,
                                    loff_t off)
{
        return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
}

static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes,
                                    loff_t off)
{
        return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
}

static inline void cpu_uclamp_print(struct seq_file *sf,
                                    enum uclamp_id clamp_id)
{
        struct task_group *tg;
        u64 util_clamp;
        u64 percent;
        u32 rem;

        scoped_guard (rcu) {
                tg = css_tg(seq_css(sf));
                util_clamp = tg->uclamp_req[clamp_id].value;
        }

        if (util_clamp == SCHED_CAPACITY_SCALE) {
                seq_puts(sf, "max\n");
                return;
        }

        percent = tg->uclamp_pct[clamp_id];
        percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
        seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
}

static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
{
        cpu_uclamp_print(sf, UCLAMP_MIN);
        return 0;
}

static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
{
        cpu_uclamp_print(sf, UCLAMP_MAX);
        return 0;
}
#endif /* CONFIG_UCLAMP_TASK_GROUP */

#ifdef CONFIG_GROUP_SCHED_WEIGHT
static unsigned long tg_weight(struct task_group *tg)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        return scale_load_down(tg->shares);
#else
        return sched_weight_from_cgroup(tg->scx.weight);
#endif
}

static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                struct cftype *cftype, u64 shareval)
{
        int ret;

        if
(shareval > scale_load_down(ULONG_MAX)) 9753 shareval = MAX_SHARES; 9754 ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); 9755 if (!ret) 9756 scx_group_set_weight(css_tg(css), 9757 sched_weight_to_cgroup(shareval)); 9758 return ret; 9759 } 9760 9761 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 9762 struct cftype *cft) 9763 { 9764 return tg_weight(css_tg(css)); 9765 } 9766 #endif /* CONFIG_GROUP_SCHED_WEIGHT */ 9767 9768 #ifdef CONFIG_CFS_BANDWIDTH 9769 static DEFINE_MUTEX(cfs_constraints_mutex); 9770 9771 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 9772 9773 static int tg_set_cfs_bandwidth(struct task_group *tg, 9774 u64 period_us, u64 quota_us, u64 burst_us) 9775 { 9776 int i, ret = 0, runtime_enabled, runtime_was_enabled; 9777 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 9778 u64 period, quota, burst; 9779 9780 period = (u64)period_us * NSEC_PER_USEC; 9781 9782 if (quota_us == RUNTIME_INF) 9783 quota = RUNTIME_INF; 9784 else 9785 quota = (u64)quota_us * NSEC_PER_USEC; 9786 9787 burst = (u64)burst_us * NSEC_PER_USEC; 9788 9789 /* 9790 * Prevent race between setting of cfs_rq->runtime_enabled and 9791 * unthrottle_offline_cfs_rqs(). 9792 */ 9793 guard(cpus_read_lock)(); 9794 guard(mutex)(&cfs_constraints_mutex); 9795 9796 ret = __cfs_schedulable(tg, period, quota); 9797 if (ret) 9798 return ret; 9799 9800 runtime_enabled = quota != RUNTIME_INF; 9801 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 9802 /* 9803 * If we need to toggle cfs_bandwidth_used, off->on must occur 9804 * before making related changes, and on->off must occur afterwards 9805 */ 9806 if (runtime_enabled && !runtime_was_enabled) 9807 cfs_bandwidth_usage_inc(); 9808 9809 scoped_guard (raw_spinlock_irq, &cfs_b->lock) { 9810 cfs_b->period = ns_to_ktime(period); 9811 cfs_b->quota = quota; 9812 cfs_b->burst = burst; 9813 9814 __refill_cfs_bandwidth_runtime(cfs_b); 9815 9816 /* 9817 * Restart the period timer (if active) to handle new 9818 * period expiry: 9819 */ 9820 if (runtime_enabled) 9821 start_cfs_bandwidth(cfs_b); 9822 } 9823 9824 for_each_online_cpu(i) { 9825 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 9826 struct rq *rq = cfs_rq->rq; 9827 9828 guard(rq_lock_irq)(rq); 9829 cfs_rq->runtime_enabled = runtime_enabled; 9830 cfs_rq->runtime_remaining = 1; 9831 9832 if (cfs_rq->throttled) 9833 unthrottle_cfs_rq(cfs_rq); 9834 } 9835 9836 if (runtime_was_enabled && !runtime_enabled) 9837 cfs_bandwidth_usage_dec(); 9838 9839 return 0; 9840 } 9841 9842 static u64 tg_get_cfs_period(struct task_group *tg) 9843 { 9844 u64 cfs_period_us; 9845 9846 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 9847 do_div(cfs_period_us, NSEC_PER_USEC); 9848 9849 return cfs_period_us; 9850 } 9851 9852 static u64 tg_get_cfs_quota(struct task_group *tg) 9853 { 9854 u64 quota_us; 9855 9856 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 9857 return RUNTIME_INF; 9858 9859 quota_us = tg->cfs_bandwidth.quota; 9860 do_div(quota_us, NSEC_PER_USEC); 9861 9862 return quota_us; 9863 } 9864 9865 static u64 tg_get_cfs_burst(struct task_group *tg) 9866 { 9867 u64 burst_us; 9868 9869 burst_us = tg->cfs_bandwidth.burst; 9870 do_div(burst_us, NSEC_PER_USEC); 9871 9872 return burst_us; 9873 } 9874 9875 struct cfs_schedulable_data { 9876 struct task_group *tg; 9877 u64 period, quota; 9878 }; 9879 9880 /* 9881 * normalize group quota/period to be quota/max_period 9882 * note: units are usecs 9883 */ 9884 static u64 normalize_cfs_quota(struct task_group *tg, 9885 struct cfs_schedulable_data *d) 
9886 { 9887 u64 quota, period; 9888 9889 if (tg == d->tg) { 9890 period = d->period; 9891 quota = d->quota; 9892 } else { 9893 period = tg_get_cfs_period(tg); 9894 quota = tg_get_cfs_quota(tg); 9895 } 9896 9897 /* note: these should typically be equivalent */ 9898 if (quota == RUNTIME_INF || quota == -1) 9899 return RUNTIME_INF; 9900 9901 return to_ratio(period, quota); 9902 } 9903 9904 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 9905 { 9906 struct cfs_schedulable_data *d = data; 9907 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 9908 s64 quota = 0, parent_quota = -1; 9909 9910 if (!tg->parent) { 9911 quota = RUNTIME_INF; 9912 } else { 9913 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 9914 9915 quota = normalize_cfs_quota(tg, d); 9916 parent_quota = parent_b->hierarchical_quota; 9917 9918 /* 9919 * Ensure max(child_quota) <= parent_quota. On cgroup2, 9920 * always take the non-RUNTIME_INF min. On cgroup1, only 9921 * inherit when no limit is set. In both cases this is used 9922 * by the scheduler to determine if a given CFS task has a 9923 * bandwidth constraint at some higher level. 9924 */ 9925 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { 9926 if (quota == RUNTIME_INF) 9927 quota = parent_quota; 9928 else if (parent_quota != RUNTIME_INF) 9929 quota = min(quota, parent_quota); 9930 } else { 9931 if (quota == RUNTIME_INF) 9932 quota = parent_quota; 9933 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 9934 return -EINVAL; 9935 } 9936 } 9937 cfs_b->hierarchical_quota = quota; 9938 9939 return 0; 9940 } 9941 9942 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 9943 { 9944 struct cfs_schedulable_data data = { 9945 .tg = tg, 9946 .period = period, 9947 .quota = quota, 9948 }; 9949 9950 if (quota != RUNTIME_INF) { 9951 do_div(data.period, NSEC_PER_USEC); 9952 do_div(data.quota, NSEC_PER_USEC); 9953 } 9954 9955 guard(rcu)(); 9956 return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 9957 } 9958 9959 static int cpu_cfs_stat_show(struct seq_file *sf, void *v) 9960 { 9961 struct task_group *tg = css_tg(seq_css(sf)); 9962 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 9963 9964 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 9965 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 9966 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 9967 9968 if (schedstat_enabled() && tg != &root_task_group) { 9969 struct sched_statistics *stats; 9970 u64 ws = 0; 9971 int i; 9972 9973 for_each_possible_cpu(i) { 9974 stats = __schedstats_from_se(tg->se[i]); 9975 ws += schedstat_val(stats->wait_sum); 9976 } 9977 9978 seq_printf(sf, "wait_sum %llu\n", ws); 9979 } 9980 9981 seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); 9982 seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time); 9983 9984 return 0; 9985 } 9986 9987 static u64 throttled_time_self(struct task_group *tg) 9988 { 9989 int i; 9990 u64 total = 0; 9991 9992 for_each_possible_cpu(i) { 9993 total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); 9994 } 9995 9996 return total; 9997 } 9998 9999 static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) 10000 { 10001 struct task_group *tg = css_tg(seq_css(sf)); 10002 10003 seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); 10004 10005 return 0; 10006 } 10007 #endif /* CONFIG_CFS_BANDWIDTH */ 10008 10009 #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 10010 const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ 10011 static const u64 min_bw_quota_period_us = 1 * 
USEC_PER_MSEC; /* 1ms */ 10012 /* More than 203 days if BW_SHIFT equals 20. */ 10013 static const u64 max_bw_runtime_us = MAX_BW; 10014 10015 static void tg_bandwidth(struct task_group *tg, 10016 u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) 10017 { 10018 #ifdef CONFIG_CFS_BANDWIDTH 10019 if (period_us_p) 10020 *period_us_p = tg_get_cfs_period(tg); 10021 if (quota_us_p) 10022 *quota_us_p = tg_get_cfs_quota(tg); 10023 if (burst_us_p) 10024 *burst_us_p = tg_get_cfs_burst(tg); 10025 #else /* !CONFIG_CFS_BANDWIDTH */ 10026 if (period_us_p) 10027 *period_us_p = tg->scx.bw_period_us; 10028 if (quota_us_p) 10029 *quota_us_p = tg->scx.bw_quota_us; 10030 if (burst_us_p) 10031 *burst_us_p = tg->scx.bw_burst_us; 10032 #endif /* CONFIG_CFS_BANDWIDTH */ 10033 } 10034 10035 static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, 10036 struct cftype *cft) 10037 { 10038 u64 period_us; 10039 10040 tg_bandwidth(css_tg(css), &period_us, NULL, NULL); 10041 return period_us; 10042 } 10043 10044 static int tg_set_bandwidth(struct task_group *tg, 10045 u64 period_us, u64 quota_us, u64 burst_us) 10046 { 10047 const u64 max_usec = U64_MAX / NSEC_PER_USEC; 10048 int ret = 0; 10049 10050 if (tg == &root_task_group) 10051 return -EINVAL; 10052 10053 /* Values should survive translation to nsec */ 10054 if (period_us > max_usec || 10055 (quota_us != RUNTIME_INF && quota_us > max_usec) || 10056 burst_us > max_usec) 10057 return -EINVAL; 10058 10059 /* 10060 * Ensure we have some amount of bandwidth every period. This is to 10061 * prevent reaching a state of large arrears when throttled via 10062 * entity_tick() resulting in prolonged exit starvation. 10063 */ 10064 if (quota_us < min_bw_quota_period_us || 10065 period_us < min_bw_quota_period_us) 10066 return -EINVAL; 10067 10068 /* 10069 * Likewise, bound things on the other side by preventing insane quota 10070 * periods. This also allows us to normalize in computing quota 10071 * feasibility. 10072 */ 10073 if (period_us > max_bw_quota_period_us) 10074 return -EINVAL; 10075 10076 /* 10077 * Bound quota to defend quota against overflow during bandwidth shift. 
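 * (Editor's note: to_ratio() computes quota << BW_SHIFT / period, so keeping quota at or below max_bw_runtime_us (MAX_BW) keeps the shifted value within u64; this is where the "more than 203 days" bound noted above comes from.)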
10078 */ 10079 if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) 10080 return -EINVAL; 10081 10082 if (quota_us != RUNTIME_INF && (burst_us > quota_us || 10083 burst_us + quota_us > max_bw_runtime_us)) 10084 return -EINVAL; 10085 10086 #ifdef CONFIG_CFS_BANDWIDTH 10087 ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); 10088 #endif /* CONFIG_CFS_BANDWIDTH */ 10089 if (!ret) 10090 scx_group_set_bandwidth(tg, period_us, quota_us, burst_us); 10091 return ret; 10092 } 10093 10094 static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, 10095 struct cftype *cft) 10096 { 10097 u64 quota_us; 10098 10099 tg_bandwidth(css_tg(css), NULL, &quota_us, NULL); 10100 return quota_us; /* (s64)RUNTIME_INF becomes -1 */ 10101 } 10102 10103 static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, 10104 struct cftype *cft) 10105 { 10106 u64 burst_us; 10107 10108 tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); 10109 return burst_us; 10110 } 10111 10112 static int cpu_period_write_u64(struct cgroup_subsys_state *css, 10113 struct cftype *cftype, u64 period_us) 10114 { 10115 struct task_group *tg = css_tg(css); 10116 u64 quota_us, burst_us; 10117 10118 tg_bandwidth(tg, NULL, &quota_us, &burst_us); 10119 return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 10120 } 10121 10122 static int cpu_quota_write_s64(struct cgroup_subsys_state *css, 10123 struct cftype *cftype, s64 quota_us) 10124 { 10125 struct task_group *tg = css_tg(css); 10126 u64 period_us, burst_us; 10127 10128 if (quota_us < 0) 10129 quota_us = RUNTIME_INF; 10130 10131 tg_bandwidth(tg, &period_us, NULL, &burst_us); 10132 return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 10133 } 10134 10135 static int cpu_burst_write_u64(struct cgroup_subsys_state *css, 10136 struct cftype *cftype, u64 burst_us) 10137 { 10138 struct task_group *tg = css_tg(css); 10139 u64 period_us, quota_us; 10140 10141 tg_bandwidth(tg, &period_us, &quota_us, NULL); 10142 return tg_set_bandwidth(tg, period_us, quota_us, burst_us); 10143 } 10144 #endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ 10145 10146 #ifdef CONFIG_RT_GROUP_SCHED 10147 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 10148 struct cftype *cft, s64 val) 10149 { 10150 return sched_group_set_rt_runtime(css_tg(css), val); 10151 } 10152 10153 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 10154 struct cftype *cft) 10155 { 10156 return sched_group_rt_runtime(css_tg(css)); 10157 } 10158 10159 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 10160 struct cftype *cftype, u64 rt_period_us) 10161 { 10162 return sched_group_set_rt_period(css_tg(css), rt_period_us); 10163 } 10164 10165 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 10166 struct cftype *cft) 10167 { 10168 return sched_group_rt_period(css_tg(css)); 10169 } 10170 #endif /* CONFIG_RT_GROUP_SCHED */ 10171 10172 #ifdef CONFIG_GROUP_SCHED_WEIGHT 10173 static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, 10174 struct cftype *cft) 10175 { 10176 return css_tg(css)->idle; 10177 } 10178 10179 static int cpu_idle_write_s64(struct cgroup_subsys_state *css, 10180 struct cftype *cft, s64 idle) 10181 { 10182 int ret; 10183 10184 ret = sched_group_set_idle(css_tg(css), idle); 10185 if (!ret) 10186 scx_group_set_idle(css_tg(css), idle); 10187 return ret; 10188 } 10189 #endif /* CONFIG_GROUP_SCHED_WEIGHT */ 10190 10191 static struct cftype cpu_legacy_files[] = { 10192 #ifdef CONFIG_GROUP_SCHED_WEIGHT 10193 { 10194 .name = "shares", 10195 .read_u64 =
cpu_shares_read_u64, 10196 .write_u64 = cpu_shares_write_u64, 10197 }, 10198 { 10199 .name = "idle", 10200 .read_s64 = cpu_idle_read_s64, 10201 .write_s64 = cpu_idle_write_s64, 10202 }, 10203 #endif 10204 #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 10205 { 10206 .name = "cfs_period_us", 10207 .read_u64 = cpu_period_read_u64, 10208 .write_u64 = cpu_period_write_u64, 10209 }, 10210 { 10211 .name = "cfs_quota_us", 10212 .read_s64 = cpu_quota_read_s64, 10213 .write_s64 = cpu_quota_write_s64, 10214 }, 10215 { 10216 .name = "cfs_burst_us", 10217 .read_u64 = cpu_burst_read_u64, 10218 .write_u64 = cpu_burst_write_u64, 10219 }, 10220 #endif 10221 #ifdef CONFIG_CFS_BANDWIDTH 10222 { 10223 .name = "stat", 10224 .seq_show = cpu_cfs_stat_show, 10225 }, 10226 { 10227 .name = "stat.local", 10228 .seq_show = cpu_cfs_local_stat_show, 10229 }, 10230 #endif 10231 #ifdef CONFIG_UCLAMP_TASK_GROUP 10232 { 10233 .name = "uclamp.min", 10234 .flags = CFTYPE_NOT_ON_ROOT, 10235 .seq_show = cpu_uclamp_min_show, 10236 .write = cpu_uclamp_min_write, 10237 }, 10238 { 10239 .name = "uclamp.max", 10240 .flags = CFTYPE_NOT_ON_ROOT, 10241 .seq_show = cpu_uclamp_max_show, 10242 .write = cpu_uclamp_max_write, 10243 }, 10244 #endif 10245 { } /* Terminate */ 10246 }; 10247 10248 #ifdef CONFIG_RT_GROUP_SCHED 10249 static struct cftype rt_group_files[] = { 10250 { 10251 .name = "rt_runtime_us", 10252 .read_s64 = cpu_rt_runtime_read, 10253 .write_s64 = cpu_rt_runtime_write, 10254 }, 10255 { 10256 .name = "rt_period_us", 10257 .read_u64 = cpu_rt_period_read_uint, 10258 .write_u64 = cpu_rt_period_write_uint, 10259 }, 10260 { } /* Terminate */ 10261 }; 10262 10263 # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED 10264 DEFINE_STATIC_KEY_FALSE(rt_group_sched); 10265 # else 10266 DEFINE_STATIC_KEY_TRUE(rt_group_sched); 10267 # endif 10268 10269 static int __init setup_rt_group_sched(char *str) 10270 { 10271 long val; 10272 10273 if (kstrtol(str, 0, &val) || val < 0 || val > 1) { 10274 pr_warn("Unable to set rt_group_sched\n"); 10275 return 1; 10276 } 10277 if (val) 10278 static_branch_enable(&rt_group_sched); 10279 else 10280 static_branch_disable(&rt_group_sched); 10281 10282 return 1; 10283 } 10284 __setup("rt_group_sched=", setup_rt_group_sched); 10285 10286 static int __init cpu_rt_group_init(void) 10287 { 10288 if (!rt_group_sched_enabled()) 10289 return 0; 10290 10291 WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); 10292 return 0; 10293 } 10294 subsys_initcall(cpu_rt_group_init); 10295 #endif /* CONFIG_RT_GROUP_SCHED */ 10296 10297 static int cpu_extra_stat_show(struct seq_file *sf, 10298 struct cgroup_subsys_state *css) 10299 { 10300 #ifdef CONFIG_CFS_BANDWIDTH 10301 { 10302 struct task_group *tg = css_tg(css); 10303 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 10304 u64 throttled_usec, burst_usec; 10305 10306 throttled_usec = cfs_b->throttled_time; 10307 do_div(throttled_usec, NSEC_PER_USEC); 10308 burst_usec = cfs_b->burst_time; 10309 do_div(burst_usec, NSEC_PER_USEC); 10310 10311 seq_printf(sf, "nr_periods %d\n" 10312 "nr_throttled %d\n" 10313 "throttled_usec %llu\n" 10314 "nr_bursts %d\n" 10315 "burst_usec %llu\n", 10316 cfs_b->nr_periods, cfs_b->nr_throttled, 10317 throttled_usec, cfs_b->nr_burst, burst_usec); 10318 } 10319 #endif /* CONFIG_CFS_BANDWIDTH */ 10320 return 0; 10321 } 10322 10323 static int cpu_local_stat_show(struct seq_file *sf, 10324 struct cgroup_subsys_state *css) 10325 { 10326 #ifdef CONFIG_CFS_BANDWIDTH 10327 { 10328 struct task_group *tg = css_tg(css); 10329 u64 throttled_self_usec; 
10330 10331 throttled_self_usec = throttled_time_self(tg); 10332 do_div(throttled_self_usec, NSEC_PER_USEC); 10333 10334 seq_printf(sf, "throttled_usec %llu\n", 10335 throttled_self_usec); 10336 } 10337 #endif 10338 return 0; 10339 } 10340 10341 #ifdef CONFIG_GROUP_SCHED_WEIGHT 10342 10343 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, 10344 struct cftype *cft) 10345 { 10346 return sched_weight_to_cgroup(tg_weight(css_tg(css))); 10347 } 10348 10349 static int cpu_weight_write_u64(struct cgroup_subsys_state *css, 10350 struct cftype *cft, u64 cgrp_weight) 10351 { 10352 unsigned long weight; 10353 int ret; 10354 10355 if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) 10356 return -ERANGE; 10357 10358 weight = sched_weight_from_cgroup(cgrp_weight); 10359 10360 ret = sched_group_set_shares(css_tg(css), scale_load(weight)); 10361 if (!ret) 10362 scx_group_set_weight(css_tg(css), cgrp_weight); 10363 return ret; 10364 } 10365 10366 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, 10367 struct cftype *cft) 10368 { 10369 unsigned long weight = tg_weight(css_tg(css)); 10370 int last_delta = INT_MAX; 10371 int prio, delta; 10372 10373 /* find the closest nice value to the current weight */ 10374 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { 10375 delta = abs(sched_prio_to_weight[prio] - weight); 10376 if (delta >= last_delta) 10377 break; 10378 last_delta = delta; 10379 } 10380 10381 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); 10382 } 10383 10384 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, 10385 struct cftype *cft, s64 nice) 10386 { 10387 unsigned long weight; 10388 int idx, ret; 10389 10390 if (nice < MIN_NICE || nice > MAX_NICE) 10391 return -ERANGE; 10392 10393 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; 10394 idx = array_index_nospec(idx, 40); 10395 weight = sched_prio_to_weight[idx]; 10396 10397 ret = sched_group_set_shares(css_tg(css), scale_load(weight)); 10398 if (!ret) 10399 scx_group_set_weight(css_tg(css), 10400 sched_weight_to_cgroup(weight)); 10401 return ret; 10402 } 10403 #endif /* CONFIG_GROUP_SCHED_WEIGHT */ 10404 10405 static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, 10406 long period, long quota) 10407 { 10408 if (quota < 0) 10409 seq_puts(sf, "max"); 10410 else 10411 seq_printf(sf, "%ld", quota); 10412 10413 seq_printf(sf, " %ld\n", period); 10414 } 10415 10416 /* caller should put the current value in *@period_us_p before calling */ 10417 static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, 10418 u64 *quota_us_p) 10419 { 10420 char tok[21]; /* U64_MAX */ 10421 10422 if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) 10423 return -EINVAL; 10424 10425 if (sscanf(tok, "%llu", quota_us_p) < 1) { 10426 if (!strcmp(tok, "max")) 10427 *quota_us_p = RUNTIME_INF; 10428 else 10429 return -EINVAL; 10430 } 10431 10432 return 0; 10433 } 10434 10435 #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 10436 static int cpu_max_show(struct seq_file *sf, void *v) 10437 { 10438 struct task_group *tg = css_tg(seq_css(sf)); 10439 u64 period_us, quota_us; 10440 10441 tg_bandwidth(tg, &period_us, &quota_us, NULL); 10442 cpu_period_quota_print(sf, period_us, quota_us); 10443 return 0; 10444 } 10445 10446 static ssize_t cpu_max_write(struct kernfs_open_file *of, 10447 char *buf, size_t nbytes, loff_t off) 10448 { 10449 struct task_group *tg = css_tg(of_css(of)); 10450 u64 period_us, quota_us, burst_us; 10451 int ret; 10452 10453 tg_bandwidth(tg, &period_us, NULL, &burst_us);
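/* Editor's note: period_us is pre-seeded with the current period so a write of just "$MAX" (quota only) keeps it, and burst_us is carried over unchanged; only the quota, and optionally the period, come from @buf. */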
10454 ret = cpu_period_quota_parse(buf, &period_us, &quota_us); 10455 if (!ret) 10456 ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); 10457 return ret ?: nbytes; 10458 } 10459 #endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ 10460 10461 static struct cftype cpu_files[] = { 10462 #ifdef CONFIG_GROUP_SCHED_WEIGHT 10463 { 10464 .name = "weight", 10465 .flags = CFTYPE_NOT_ON_ROOT, 10466 .read_u64 = cpu_weight_read_u64, 10467 .write_u64 = cpu_weight_write_u64, 10468 }, 10469 { 10470 .name = "weight.nice", 10471 .flags = CFTYPE_NOT_ON_ROOT, 10472 .read_s64 = cpu_weight_nice_read_s64, 10473 .write_s64 = cpu_weight_nice_write_s64, 10474 }, 10475 { 10476 .name = "idle", 10477 .flags = CFTYPE_NOT_ON_ROOT, 10478 .read_s64 = cpu_idle_read_s64, 10479 .write_s64 = cpu_idle_write_s64, 10480 }, 10481 #endif 10482 #ifdef CONFIG_GROUP_SCHED_BANDWIDTH 10483 { 10484 .name = "max", 10485 .flags = CFTYPE_NOT_ON_ROOT, 10486 .seq_show = cpu_max_show, 10487 .write = cpu_max_write, 10488 }, 10489 { 10490 .name = "max.burst", 10491 .flags = CFTYPE_NOT_ON_ROOT, 10492 .read_u64 = cpu_burst_read_u64, 10493 .write_u64 = cpu_burst_write_u64, 10494 }, 10495 #endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ 10496 #ifdef CONFIG_UCLAMP_TASK_GROUP 10497 { 10498 .name = "uclamp.min", 10499 .flags = CFTYPE_NOT_ON_ROOT, 10500 .seq_show = cpu_uclamp_min_show, 10501 .write = cpu_uclamp_min_write, 10502 }, 10503 { 10504 .name = "uclamp.max", 10505 .flags = CFTYPE_NOT_ON_ROOT, 10506 .seq_show = cpu_uclamp_max_show, 10507 .write = cpu_uclamp_max_write, 10508 }, 10509 #endif /* CONFIG_UCLAMP_TASK_GROUP */ 10510 { } /* terminate */ 10511 }; 10512 10513 struct cgroup_subsys cpu_cgrp_subsys = { 10514 .css_alloc = cpu_cgroup_css_alloc, 10515 .css_online = cpu_cgroup_css_online, 10516 .css_offline = cpu_cgroup_css_offline, 10517 .css_released = cpu_cgroup_css_released, 10518 .css_free = cpu_cgroup_css_free, 10519 .css_extra_stat_show = cpu_extra_stat_show, 10520 .css_local_stat_show = cpu_local_stat_show, 10521 .can_attach = cpu_cgroup_can_attach, 10522 .attach = cpu_cgroup_attach, 10523 .cancel_attach = cpu_cgroup_cancel_attach, 10524 .legacy_cftypes = cpu_legacy_files, 10525 .dfl_cftypes = cpu_files, 10526 .early_init = true, 10527 .threaded = true, 10528 }; 10529 10530 #endif /* CONFIG_CGROUP_SCHED */ 10531 10532 void dump_cpu_task(int cpu) 10533 { 10534 if (in_hardirq() && cpu == smp_processor_id()) { 10535 struct pt_regs *regs; 10536 10537 regs = get_irq_regs(); 10538 if (regs) { 10539 show_regs(regs); 10540 return; 10541 } 10542 } 10543 10544 if (trigger_single_cpu_backtrace(cpu)) 10545 return; 10546 10547 pr_info("Task dump for CPU %d:\n", cpu); 10548 sched_show_task(cpu_curr(cpu)); 10549 } 10550 10551 /* 10552 * Nice levels are multiplicative, with a gentle 10% change for every 10553 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 10554 * nice 1, it will get ~10% less CPU time than another CPU-bound task 10555 * that remained on nice 0. 10556 * 10557 * The "10% effect" is relative and cumulative: from _any_ nice level, 10558 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 10559 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 10560 * If a task goes up by ~10% and another task goes down by ~10% then 10561 * the relative distance between them is ~25%.)
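 */

/*
 * Editor's illustrative sketch (not part of the original source): the
 * proportional CPU share implied by two runnable tasks' weights, matching
 * the ~10% / 1.25x step described above. For nice 0 (weight 1024) versus
 * nice 1 (weight 820): 1024 * 100 / (1024 + 820) ~= 55%, leaving ~45%
 * for the nice 1 task.
 */
static unsigned long __maybe_unused nice_share_pct_example(unsigned long w,
							   unsigned long other)
{
	/* A task's CPU share is its weight over the sum of weights. */
	return (100 * w) / (w + other);
}

/*
 * The tables below encode exactly this ~1.25x-per-nice-level curve: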
10562 */ 10563 const int sched_prio_to_weight[40] = { 10564 /* -20 */ 88761, 71755, 56483, 46273, 36291, 10565 /* -15 */ 29154, 23254, 18705, 14949, 11916, 10566 /* -10 */ 9548, 7620, 6100, 4904, 3906, 10567 /* -5 */ 3121, 2501, 1991, 1586, 1277, 10568 /* 0 */ 1024, 820, 655, 526, 423, 10569 /* 5 */ 335, 272, 215, 172, 137, 10570 /* 10 */ 110, 87, 70, 56, 45, 10571 /* 15 */ 36, 29, 23, 18, 15, 10572 }; 10573 10574 /* 10575 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated. 10576 * 10577 * In cases where the weight does not change often, we can use the 10578 * pre-calculated inverse to speed up arithmetic by turning divisions 10579 * into multiplications: 10580 */ 10581 const u32 sched_prio_to_wmult[40] = { 10582 /* -20 */ 48388, 59856, 76040, 92818, 118348, 10583 /* -15 */ 147320, 184698, 229616, 287308, 360437, 10584 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 10585 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 10586 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 10587 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 10588 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 10589 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 10590 }; 10591 10592 void call_trace_sched_update_nr_running(struct rq *rq, int count) 10593 { 10594 trace_sched_update_nr_running_tp(rq, count); 10595 } 10596 10597 #ifdef CONFIG_SCHED_MM_CID 10598 /* 10599 * Concurrency IDentifier management 10600 * 10601 * Serialization rules: 10602 * 10603 * mm::mm_cid::mutex: Serializes fork() and exit() and therefore 10604 * protects mm::mm_cid::users and mode switch 10605 * transitions 10606 * 10607 * mm::mm_cid::lock: Serializes mm_update_max_cids() and 10608 * mm_update_cpus_allowed(). Nests in mm_cid::mutex 10609 * and runqueue lock. 10610 * 10611 * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks 10612 * and can only be modified with atomic operations. 10613 * 10614 * The mm::mm_cid::pcpu per CPU storage is protected by the CPU's runqueue 10615 * lock. 10616 * 10617 * CID ownership: 10618 * 10619 * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or 10620 * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the 10621 * MM_CID_ONCPU bit set. 10622 * 10623 * During the transition of ownership mode, the MM_CID_TRANSIT bit is set 10624 * on the CIDs. When this bit is set the tasks drop the CID back into the 10625 * pool when scheduling out. 10626 * 10627 * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the 10628 * CID is actually handed over to user space in the RSEQ memory. 10629 * 10630 * Mode switching: 10631 * 10632 * The ownership mode is per process and stored in mm::mm_cid::mode with the 10633 * following possible states: 10634 * 10635 * 0: Per task ownership 10636 * 0 | MM_CID_TRANSIT: Transition from per CPU to per task 10637 * MM_CID_ONCPU: Per CPU ownership 10638 * MM_CID_ONCPU | MM_CID_TRANSIT: Transition from per task to per CPU 10639 * 10640 * All transitions of ownership mode happen in two phases: 10641 * 10642 * 1) mm::mm_cid::mode has the MM_CID_TRANSIT bit set. This is OR'ed on the 10643 * CIDs and denotes that the CID is only temporarily owned by a 10644 * task. When the task schedules out it drops the CID back into the 10645 * pool if this bit is set. 10646 * 10647 * 2) The initiating context walks the per CPU space or the tasks to fixup 10648 * or drop the CIDs and after completion it clears MM_CID_TRANSIT in 10649 * mm::mm_cid::mode.
After that point the CIDs are strictly task or CPU 10650 * owned again. 10651 * 10652 * This two phase transition is required to prevent CID space exhaustion 10653 * during the transition as a direct transfer of ownership would fail: 10654 * 10655 * - On task to CPU mode switch if a task is scheduled in on one CPU and 10656 * then migrated to another CPU before the fixup freed enough per task 10657 * CIDs. 10658 * 10659 * - On CPU to task mode switch if two tasks are scheduled in on the same 10660 * CPU before the fixup freed per CPU CIDs. 10661 * 10662 * Both scenarios can result in a live lock because sched_in() is invoked 10663 * with runqueue lock held and loops in search of a CID and the fixup 10664 * thread can't make progress freeing them up because it is stuck on the 10665 * same runqueue lock. 10666 * 10667 * While MM_CID_TRANSIT is active during the transition phase the MM_CID 10668 * bitmap can be contended, but that's a temporary contention bound to the 10669 * transition period. After that everything goes back into steady state and 10670 * nothing except fork() and exit() will touch the bitmap. This is an 10671 * acceptable tradeoff as it completely avoids complex serialization, 10672 * memory barriers and atomic operations for the common case. 10673 * 10674 * Aside from that, this mechanism also ensures RT compatibility: 10675 * 10676 * - The task which runs the fixup is fully preemptible except for the 10677 * short runqueue lock held sections. 10678 * 10679 * - The transient impact of the bitmap contention is only problematic 10680 * when there is a thundering herd scenario of tasks scheduling in and 10681 * out concurrently. There is not much that can be done about that 10682 * except for avoiding mode switching by a proper overall system 10683 * configuration. 10684 * 10685 * Switching to per CPU mode happens when the user count becomes greater 10686 * than the maximum number of CIDs, which is calculated by: 10687 * 10688 * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); 10689 * max_cids = min(1.25 * opt_cids, num_possible_cpus()); 10690 * 10691 * The +25% allowance is useful for tight CPU masks in scenarios where only 10692 * a few threads are created and destroyed to avoid frequent mode 10693 * switches. Though this allowance shrinks the closer opt_cids comes to 10694 * num_possible_cpus(), which is the (unfortunate) hard ABI limit. 10695 * 10696 * At the point of switching to per CPU mode the new user is not yet 10697 * visible in the system, so the task which initiated the fork() runs the 10698 * fixup function. mm_cid_fixup_tasks_to_cpus() walks the thread list and 10699 * either marks each task owned CID with MM_CID_TRANSIT if the task is 10700 * running on a CPU or drops it into the CID pool if a task is not on a 10701 * CPU. Tasks which schedule in before the task walk reaches them do the 10702 * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() 10703 * completes it is guaranteed that no task related to that MM owns a CID 10704 * anymore. 10705 * 10706 * Switching back to task mode happens when the user count goes below the 10707 * threshold which was recorded on the per CPU mode switch: 10708 * 10709 * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); 10710 * 10711 * This threshold is updated when an affinity change increases the number of 10712 * allowed CPUs for the MM, which might cause a switch back to per task 10713 * mode.
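 *
 * Editor's worked example (illustrative): with 64 possible CPUs, 8
 * allowed CPUs and 16 users: opt_cids = min(8, 16) = 8 and
 * max_cids = min(8 + 8/4, 64) = 10. Once the user count exceeds 10 the
 * MM switches to per CPU mode and records pcpu_thrs = min(8 - 8/4, 64/2)
 * = 6; dropping below 6 users switches ownership back to per task mode.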
10714 * 10715 * If the switch back was initiated by an exiting task, then that task runs 10716 * the fixup function. If it was initiated by an affinity change, then it is 10717 * run either by the deferred update function in workqueue context, by a 10718 * task which forks a new one, or by a task which exits, whatever 10719 * happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible 10720 * CPUs and either marks the CPU owned CIDs with MM_CID_TRANSIT if a 10721 * related task is running on the CPU or drops it into the pool. Tasks 10722 * which are scheduled in before the fixup covers them do the handover 10723 * themselves. When mm_cid_fixup_cpus_to_tasks() completes it is guaranteed 10724 * that no CID related to that MM is owned by a CPU anymore. 10725 */ 10726 10727 /* 10728 * Update the CID range properties when the constraints change. Invoked via 10729 * fork(), exit() and affinity changes 10730 */ 10731 static void __mm_update_max_cids(struct mm_mm_cid *mc) 10732 { 10733 unsigned int opt_cids, max_cids; 10734 10735 /* Calculate the new optimal constraint */ 10736 opt_cids = min(mc->nr_cpus_allowed, mc->users); 10737 10738 /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ 10739 max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); 10740 WRITE_ONCE(mc->max_cids, max_cids); 10741 } 10742 10743 static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) 10744 { 10745 unsigned int opt_cids; 10746 10747 opt_cids = min(mc->nr_cpus_allowed, mc->users); 10748 /* Has to be at least 1 because 0 indicates PCPU mode off */ 10749 return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); 10750 } 10751 10752 static bool mm_update_max_cids(struct mm_struct *mm) 10753 { 10754 struct mm_mm_cid *mc = &mm->mm_cid; 10755 bool percpu = cid_on_cpu(mc->mode); 10756 10757 lockdep_assert_held(&mm->mm_cid.lock); 10758 10759 /* Clear deferred mode switch flag. A change is handled by the caller */ 10760 mc->update_deferred = false; 10761 __mm_update_max_cids(mc); 10762 10763 /* Check whether owner mode must be changed */ 10764 if (!percpu) { 10765 /* Enable per CPU mode when the number of users is above max_cids */ 10766 if (mc->users > mc->max_cids) 10767 mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); 10768 } else { 10769 /* Switch back to per task if user count under threshold */ 10770 if (mc->users < mc->pcpu_thrs) 10771 mc->pcpu_thrs = 0; 10772 } 10773 10774 /* Mode change required? */ 10775 if (percpu == !!mc->pcpu_thrs) 10776 return false; 10777 10778 /* Flip the mode and set the transition flag to bridge the transfer */ 10779 WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU)); 10780 /* 10781 * Order the store against the subsequent fixups so that 10782 * acquire(rq::lock) cannot be reordered by the CPU before the 10783 * store. 10784 */ 10785 smp_mb(); 10786 return true; 10787 } 10788 10789 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) 10790 { 10791 struct cpumask *mm_allowed; 10792 struct mm_mm_cid *mc; 10793 unsigned int weight; 10794 10795 if (!mm || !READ_ONCE(mm->mm_cid.users)) 10796 return; 10797 /* 10798 * mm::mm_cid::mm_cpus_allowed is the superset of each thread's 10799 * allowed CPUs mask, which means it can only grow.
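 * Editor's illustrative note: if one thread is affine to CPUs 0-1 and a
 * later one to CPUs 2-3, the superset becomes CPUs 0-3, and
 * nr_cpus_allowed only ever widens for the lifetime of the MM.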
10800 */ 10801 mc = &mm->mm_cid; 10802 guard(raw_spinlock)(&mc->lock); 10803 mm_allowed = mm_cpus_allowed(mm); 10804 weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); 10805 if (weight == mc->nr_cpus_allowed) 10806 return; 10807 10808 WRITE_ONCE(mc->nr_cpus_allowed, weight); 10809 __mm_update_max_cids(mc); 10810 if (!cid_on_cpu(mc->mode)) 10811 return; 10812 10813 /* Adjust the threshold to the wider set */ 10814 mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); 10815 /* Switch back to per task mode? */ 10816 if (mc->users >= mc->pcpu_thrs) 10817 return; 10818 10819 /* Don't queue twice */ 10820 if (mc->update_deferred) 10821 return; 10822 10823 /* Queue the irq work, which schedules the real work */ 10824 mc->update_deferred = true; 10825 irq_work_queue(&mc->irq_work); 10826 } 10827 10828 static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode) 10829 { 10830 /* 10831 * Ensure that the store removing the TRANSIT bit cannot be 10832 * reordered by the CPU before the fixups have been completed. 10833 */ 10834 smp_mb(); 10835 WRITE_ONCE(mm->mm_cid.mode, mode); 10836 } 10837 10838 static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) 10839 { 10840 if (cid_on_cpu(t->mm_cid.cid)) { 10841 unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); 10842 10843 t->mm_cid.cid = cid_to_transit_cid(cid); 10844 pcp->cid = t->mm_cid.cid; 10845 } 10846 } 10847 10848 static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) 10849 { 10850 unsigned int cpu; 10851 10852 /* Walk the CPUs and fixup all stale CIDs */ 10853 for_each_possible_cpu(cpu) { 10854 struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); 10855 struct rq *rq = cpu_rq(cpu); 10856 10857 /* Remote access to mm::mm_cid::pcpu requires rq_lock */ 10858 guard(rq_lock_irq)(rq); 10859 /* Is the CID still owned by the CPU? */ 10860 if (cid_on_cpu(pcp->cid)) { 10861 /* 10862 * If rq->curr has @mm, transfer it with the 10863 * transition bit set. Otherwise drop it. 
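 * (Editor's note: the mm_cid.active checks filter out tasks which have
 * already passed through sched_mm_cid_exit() and therefore must not
 * keep or receive a CID.)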
10864 */ 10865 if (rq->curr->mm == mm && rq->curr->mm_cid.active) 10866 mm_cid_transit_to_task(rq->curr, pcp); 10867 else 10868 mm_drop_cid_on_cpu(mm, pcp); 10869 10870 } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { 10871 unsigned int cid = rq->curr->mm_cid.cid; 10872 10873 /* Ensure it has the transition bit set */ 10874 if (!cid_in_transit(cid)) { 10875 cid = cid_to_transit_cid(cid); 10876 rq->curr->mm_cid.cid = cid; 10877 pcp->cid = cid; 10878 } 10879 } 10880 } 10881 mm_cid_complete_transit(mm, 0); 10882 } 10883 10884 static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) 10885 { 10886 if (cid_on_task(t->mm_cid.cid)) { 10887 t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid); 10888 pcp->cid = t->mm_cid.cid; 10889 } 10890 } 10891 10892 static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) 10893 { 10894 /* Remote access to mm::mm_cid::pcpu requires rq_lock */ 10895 guard(task_rq_lock)(t); 10896 if (cid_on_task(t->mm_cid.cid)) { 10897 /* If running on the CPU, put the CID in transit mode, otherwise drop it */ 10898 if (task_rq(t)->curr == t) 10899 mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); 10900 else 10901 mm_unset_cid_on_task(t); 10902 } 10903 } 10904 10905 static void mm_cid_fixup_tasks_to_cpus(void) 10906 { 10907 struct mm_struct *mm = current->mm; 10908 struct task_struct *t; 10909 10910 lockdep_assert_held(&mm->mm_cid.mutex); 10911 10912 hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) { 10913 /* Current has already transferred before invoking the fixup. */ 10914 if (t != current) 10915 mm_cid_fixup_task_to_cpu(t, mm); 10916 } 10917 10918 mm_cid_complete_transit(mm, MM_CID_ONCPU); 10919 } 10920 10921 static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) 10922 { 10923 lockdep_assert_held(&mm->mm_cid.lock); 10924 10925 t->mm_cid.active = 1; 10926 hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list); 10927 mm->mm_cid.users++; 10928 return mm_update_max_cids(mm); 10929 } 10930 10931 static void sched_mm_cid_fork(struct task_struct *t) 10932 { 10933 struct mm_struct *mm = t->mm; 10934 bool percpu; 10935 10936 if (!mm) 10937 return; 10938 10939 WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET); 10940 10941 guard(mutex)(&mm->mm_cid.mutex); 10942 scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10943 struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); 10944 10945 /* First user ? 
*/ 10946 if (!mm->mm_cid.users) { 10947 sched_mm_cid_add_user(t, mm); 10948 t->mm_cid.cid = mm_get_cid(mm); 10949 /* Required for execve() */ 10950 pcp->cid = t->mm_cid.cid; 10951 return; 10952 } 10953 10954 if (!sched_mm_cid_add_user(t, mm)) { 10955 if (!cid_on_cpu(mm->mm_cid.mode)) 10956 t->mm_cid.cid = mm_get_cid(mm); 10957 return; 10958 } 10959 10960 /* Handle the mode change and transfer current's CID */ 10961 percpu = cid_on_cpu(mm->mm_cid.mode); 10962 if (!percpu) 10963 mm_cid_transit_to_task(current, pcp); 10964 else 10965 mm_cid_transit_to_cpu(current, pcp); 10966 } 10967 10968 if (percpu) { 10969 mm_cid_fixup_tasks_to_cpus(); 10970 } else { 10971 mm_cid_fixup_cpus_to_tasks(mm); 10972 t->mm_cid.cid = mm_get_cid(mm); 10973 } 10974 } 10975 10976 static bool sched_mm_cid_remove_user(struct task_struct *t) 10977 { 10978 lockdep_assert_held(&t->mm->mm_cid.lock); 10979 10980 t->mm_cid.active = 0; 10981 /* Clear the transition bit */ 10982 t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); 10983 mm_unset_cid_on_task(t); 10984 hlist_del_init(&t->mm_cid.node); 10985 t->mm->mm_cid.users--; 10986 return mm_update_max_cids(t->mm); 10987 } 10988 10989 static bool __sched_mm_cid_exit(struct task_struct *t) 10990 { 10991 struct mm_struct *mm = t->mm; 10992 10993 if (!sched_mm_cid_remove_user(t)) 10994 return false; 10995 /* 10996 * Contrary to fork() this only deals with a switch back to per 10997 * task mode either because the above decreased users or an 10998 * affinity change increased the number of allowed CPUs and the 10999 * deferred fixup did not run yet. 11000 */ 11001 if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode))) 11002 return false; 11003 /* 11004 * A failed fork(2) cleanup never gets here, so @current must have 11005 * the same MM as @t. That's true for exit() and the failed 11006 * pthread_create() cleanup case. 11007 */ 11008 if (WARN_ON_ONCE(current->mm != mm)) 11009 return false; 11010 return true; 11011 } 11012 11013 /* 11014 * When a task exits, the MM CID held by the task is no longer required as 11015 * the task cannot return to user space. 11016 */ 11017 void sched_mm_cid_exit(struct task_struct *t) 11018 { 11019 struct mm_struct *mm = t->mm; 11020 11021 if (!mm || !t->mm_cid.active) 11022 return; 11023 /* 11024 * Ensure that only one instance is doing MM CID operations within 11025 * an MM. The common case is uncontended. The rare fixup case adds 11026 * some overhead. 11027 */ 11028 scoped_guard(mutex, &mm->mm_cid.mutex) { 11029 /* mm_cid::mutex is sufficient to protect mm_cid::users */ 11030 if (likely(mm->mm_cid.users > 1)) { 11031 scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 11032 if (!__sched_mm_cid_exit(t)) 11033 return; 11034 /* 11035 * Mode change. The task has the CID unset 11036 * already and dealt with a possibly set 11037 * TRANSIT bit. If the CID is owned by the CPU 11038 * then drop it. 11039 */ 11040 mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); 11041 } 11042 mm_cid_fixup_cpus_to_tasks(mm); 11043 return; 11044 } 11045 /* Last user */ 11046 scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 11047 /* Required across execve() */ 11048 if (t == current) 11049 mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); 11050 /* Ignore mode change. There is nothing to do. */ 11051 sched_mm_cid_remove_user(t); 11052 } 11053 } 11054 11055 /* 11056 * As this is the last user (execve(), process exit or failed 11057 * fork(2)) there is no concurrency anymore.
11058 * 11059 * Synchronize possibly pending work to ensure that there are no 11060 * dangling references left. @mm->mm_cid.users is zero so nothing 11061 * can queue this work anymore. 11062 */ 11063 irq_work_sync(&mm->mm_cid.irq_work); 11064 cancel_work_sync(&mm->mm_cid.work); 11065 } 11066 11067 /* Deactivate MM CID allocation across execve() */ 11068 void sched_mm_cid_before_execve(struct task_struct *t) 11069 { 11070 sched_mm_cid_exit(t); 11071 } 11072 11073 /* Reactivate MM CID after execve() */ 11074 void sched_mm_cid_after_execve(struct task_struct *t) 11075 { 11076 if (t->mm) 11077 sched_mm_cid_fork(t); 11078 } 11079 11080 static void mm_cid_work_fn(struct work_struct *work) 11081 { 11082 struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); 11083 11084 guard(mutex)(&mm->mm_cid.mutex); 11085 /* Did the last user task exit already? */ 11086 if (!mm->mm_cid.users) 11087 return; 11088 11089 scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 11090 /* Did fork() or exit() handle it already? */ 11091 if (!mm->mm_cid.update_deferred) 11092 return; 11093 /* This clears mm_cid::update_deferred */ 11094 if (!mm_update_max_cids(mm)) 11095 return; 11096 /* Affinity changes can only switch back to task mode */ 11097 if (WARN_ON_ONCE(cid_on_cpu(mm->mm_cid.mode))) 11098 return; 11099 } 11100 mm_cid_fixup_cpus_to_tasks(mm); 11101 } 11102 11103 static void mm_cid_irq_work(struct irq_work *work) 11104 { 11105 struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); 11106 11107 /* 11108 * Needs to be unconditional because mm_cid::lock cannot be held 11109 * when scheduling work as mm_update_cpus_allowed() nests inside 11110 * rq::lock and schedule_work() might end up in wakeup... 11111 */ 11112 schedule_work(&mm->mm_cid.work); 11113 } 11114 11115 void mm_init_cid(struct mm_struct *mm, struct task_struct *p) 11116 { 11117 mm->mm_cid.max_cids = 0; 11118 mm->mm_cid.mode = 0; 11119 mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; 11120 mm->mm_cid.users = 0; 11121 mm->mm_cid.pcpu_thrs = 0; 11122 mm->mm_cid.update_deferred = 0; 11123 raw_spin_lock_init(&mm->mm_cid.lock); 11124 mutex_init(&mm->mm_cid.mutex); 11125 mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); 11126 INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); 11127 INIT_HLIST_HEAD(&mm->mm_cid.user_list); 11128 cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 11129 bitmap_zero(mm_cidmask(mm), num_possible_cpus()); 11130 } 11131 #else /* CONFIG_SCHED_MM_CID */ 11132 static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } 11133 static inline void sched_mm_cid_fork(struct task_struct *t) { } 11134 #endif /* !CONFIG_SCHED_MM_CID */ 11135 11136 static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); 11137 11138 struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) 11139 { 11140 struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); 11141 struct rq *rq = task_rq(p); 11142 11143 /* 11144 * Must exclusively use matched flags since this is both dequeue and 11145 * enqueue.
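 * (Editor's note: the DEQUEUE_* and ENQUEUE_* flag values are defined
 * pairwise-equal in the low bits, e.g. ENQUEUE_RESTORE == DEQUEUE_SAVE,
 * so one flags word can be handed to both dequeue_task() and
 * enqueue_task(); the mask check below rejects anything outside that
 * shared low range.)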
11146 */ 11147 WARN_ON_ONCE(flags & 0xFFFF0000); 11148 11149 lockdep_assert_rq_held(rq); 11150 11151 if (!(flags & DEQUEUE_NOCLOCK)) { 11152 update_rq_clock(rq); 11153 flags |= DEQUEUE_NOCLOCK; 11154 } 11155 11156 if ((flags & DEQUEUE_CLASS) && p->sched_class->switching_from) 11157 p->sched_class->switching_from(rq, p); 11158 11159 *ctx = (struct sched_change_ctx){ 11160 .p = p, 11161 .class = p->sched_class, 11162 .flags = flags, 11163 .queued = task_on_rq_queued(p), 11164 .running = task_current_donor(rq, p), 11165 }; 11166 11167 if (!(flags & DEQUEUE_CLASS)) { 11168 if (p->sched_class->get_prio) 11169 ctx->prio = p->sched_class->get_prio(rq, p); 11170 else 11171 ctx->prio = p->prio; 11172 } 11173 11174 if (ctx->queued) 11175 dequeue_task(rq, p, flags); 11176 if (ctx->running) 11177 put_prev_task(rq, p); 11178 11179 if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) 11180 p->sched_class->switched_from(rq, p); 11181 11182 return ctx; 11183 } 11184 11185 void sched_change_end(struct sched_change_ctx *ctx) 11186 { 11187 struct task_struct *p = ctx->p; 11188 struct rq *rq = task_rq(p); 11189 11190 lockdep_assert_rq_held(rq); 11191 11192 /* 11193 * Changing class without *QUEUE_CLASS is bad. 11194 */ 11195 WARN_ON_ONCE(p->sched_class != ctx->class && !(ctx->flags & ENQUEUE_CLASS)); 11196 11197 if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) 11198 p->sched_class->switching_to(rq, p); 11199 11200 if (ctx->queued) 11201 enqueue_task(rq, p, ctx->flags); 11202 if (ctx->running) 11203 set_next_task(rq, p); 11204 11205 if (ctx->flags & ENQUEUE_CLASS) { 11206 if (p->sched_class->switched_to) 11207 p->sched_class->switched_to(rq, p); 11208 11209 if (ctx->running) { 11210 /* 11211 * If this was a class promotion; let the old class 11212 * know it got preempted. Note that none of the 11213 * switch*_from() methods know the new class and none 11214 * of the switch*_to() methods know the old class. 11215 */ 11216 if (sched_class_above(p->sched_class, ctx->class)) { 11217 rq->next_class->wakeup_preempt(rq, p, 0); 11218 rq->next_class = p->sched_class; 11219 } 11220 /* 11221 * If this was a degradation in class; make sure to 11222 * reschedule. 11223 */ 11224 if (sched_class_above(ctx->class, p->sched_class)) 11225 resched_curr(rq); 11226 } 11227 } else { 11228 p->sched_class->prio_changed(rq, p, ctx->prio); 11229 } 11230 } 11231