// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include "sched.h"

#include <linux/nospec.h>

#include <linux/kcov.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>

#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../smpboot.h"

#include "pelt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constant propagation
 * at compile time and compiler optimization based on the feature defaults.
 */
#define SCHED_FEAT(name, enabled)       \
        (1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
        0;
#undef SCHED_FEAT
#endif

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(rq->lock)
{
        struct rq *rq;

        lockdep_assert_held(&p->pi_lock);

        for (;;) {
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
                        return rq;
                }
                raw_spin_unlock(&rq->lock);

                while (unlikely(task_on_rq_migrating(p)))
                        cpu_relax();
        }
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(p->pi_lock)
        __acquires(rq->lock)
{
        struct rq *rq;

        for (;;) {
                raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                /*
                 *      move_queued_task()              task_rq_lock()
                 *
                 *      ACQUIRE (rq->lock)
                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
                 *      [S] ->cpu = new_cpu             [L] task_rq()
                 *                                      [L] ->on_rq
                 *      RELEASE (rq->lock)
                 *
                 * If we observe the old CPU in task_rq_lock(), the acquire of
                 * the old rq->lock will fully serialize against the stores.
                 *
                 * If we observe the new CPU in task_rq_lock(), the address
                 * dependency headed by '[L] rq = task_rq()' and the acquire
                 * will pair with the WMB to ensure we then also see migrating.
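                 *
                 * In both cases we end up with a consistent view: either we
                 * took the right rq->lock, or we observe ->on_rq == MIGRATING,
                 * in which case the code below drops both locks and spins in
                 * cpu_relax() until the migration completes before retrying.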
                 */
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
                        return rq;
                }
                raw_spin_unlock(&rq->lock);
                raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

                while (unlikely(task_on_rq_migrating(p)))
                        cpu_relax();
        }
}

/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
        s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

        /*
         * Since irq_time is only updated on {soft,}irq_exit, we might run into
         * this case when a previous update_rq_clock() happened inside a
         * {soft,}irq region.
         *
         * When this happens, we stop ->clock_task and only update the
         * prev_irq_time stamp to account for the part that fit, so that a next
         * update will consume the rest. This ensures ->clock_task is
         * monotonic.
         *
         * It does however cause some slight misattribution of {soft,}irq
         * time; a more accurate solution would be to update the irq_time using
         * the current rq->clock timestamp, except that would require using
         * atomic ops.
         */
        if (irq_delta > delta)
                irq_delta = delta;

        rq->prev_irq_time += irq_delta;
        delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        if (static_key_false((&paravirt_steal_rq_enabled))) {
                steal = paravirt_steal_clock(cpu_of(rq));
                steal -= rq->prev_steal_time_rq;

                if (unlikely(steal > delta))
                        steal = delta;

                rq->prev_steal_time_rq += steal;
                delta -= steal;
        }
#endif

        rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
        if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                update_irq_load_avg(rq, irq_delta + steal);
#endif
        update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
{
        s64 delta;

        lockdep_assert_held(&rq->lock);

        if (rq->clock_update_flags & RQCF_ACT_SKIP)
                return;

#ifdef CONFIG_SCHED_DEBUG
        if (sched_feat(WARN_DOUBLE_CLOCK))
                SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
        rq->clock_update_flags |= RQCF_UPDATED;
#endif

        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
        if (delta < 0)
                return;
        rq->clock += delta;
        update_rq_clock_task(rq, delta);
}


#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
        if (hrtimer_active(&rq->hrtick_timer))
                hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
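 *
 * The scheduling class's task_tick() is invoked with queued == 1 below so
 * that it can tell this hrtick-driven tick apart from a regular tick.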
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
        struct rq_flags rf;

        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

        rq_lock(rq, &rf);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
        rq_unlock(rq, &rf);

        return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
        struct hrtimer *timer = &rq->hrtick_timer;

        hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
        struct rq *rq = arg;
        struct rq_flags rf;

        rq_lock(rq, &rf);
        __hrtick_restart(rq);
        rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time;
        s64 delta;

        /*
         * Don't schedule slices shorter than 10000ns, that just
         * doesn't make sense and can cause timer DoS.
         */
        delta = max_t(s64, delay, 10000LL);
        time = ktime_add_ns(timer->base->get_time(), delta);

        hrtimer_set_expires(timer, time);

        if (rq == this_rq())
                __hrtick_restart(rq);
        else
                smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
        /*
         * Don't schedule slices shorter than 10000ns, that just
         * doesn't make sense. Rely on vruntime for fairness.
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */

static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
        rq->hrtick_csd.flags = 0;
        rq->hrtick_csd.func = __hrtick_start;
        rq->hrtick_csd.info = rq;
#endif

        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)                                     \
        ({                                                      \
                typeof(ptr) _ptr = (ptr);                       \
                typeof(mask) _mask = (mask);                    \
                typeof(*_ptr) _old, _val = *_ptr;               \
                                                                \
                for (;;) {                                      \
                        _old = cmpxchg(_ptr, _val, _val | _mask); \
                        if (_old == _val)                       \
                                break;                          \
                        _val = _old;                            \
                }                                               \
                _old;                                           \
        })

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
        struct thread_info *ti = task_thread_info(p);
        return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
        struct thread_info *ti = task_thread_info(p);
        typeof(ti->flags) old, val = READ_ONCE(ti->flags);

        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
                        return false;
                if (val & _TIF_NEED_RESCHED)
                        return true;
                old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
                if (old == val)
                        break;
                val = old;
        }
        return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
        set_tsk_need_resched(p);
        return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
        return false;
}
#endif
#endif

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
        struct wake_q_node *node = &task->wake_q;

        /*
         * Atomically grab the task; if ->wake_q is !nil already it means
         * it's already queued (either by us or someone else) and will get the
         * wakeup due to that.
         *
         * In order to ensure that a pending wakeup will observe our pending
         * state, even in the failed case, an explicit smp_mb() must be used.
         */
        smp_mb__before_atomic();
        if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
                return false;

        /*
         * The head is context local, there can be no concurrency.
         */
        *head->lastp = node;
        head->lastp = &node->next;
        return true;
}

/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
        if (__wake_q_add(head, task))
                get_task_struct(task);
}

/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold a reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending on whether or not the @task is
 * already queued for wakeup.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
        if (!__wake_q_add(head, task))
                put_task_struct(task);
}

void wake_up_q(struct wake_q_head *head)
{
        struct wake_q_node *node = head->first;

        while (node != WAKE_Q_TAIL) {
                struct task_struct *task;

                task = container_of(node, struct task_struct, wake_q);
                BUG_ON(!task);
                /* Task can safely be re-inserted now: */
                node = node->next;
                task->wake_q.next = NULL;

                /*
                 * wake_up_process() executes a full barrier, which pairs with
                 * the queueing in wake_q_add() so as not to miss wakeups.
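                 *
                 * (The typical usage pattern is to build the queue with
                 * DEFINE_WAKE_Q() and wake_q_add() while holding a lock, and
                 * to call wake_up_q() only after that lock has been dropped.)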
                 */
                wake_up_process(task);
                put_task_struct(task);
        }
}

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        int cpu;

        lockdep_assert_held(&rq->lock);

        if (test_tsk_need_resched(curr))
                return;

        cpu = cpu_of(rq);

        if (cpu == smp_processor_id()) {
                set_tsk_need_resched(curr);
                set_preempt_need_resched();
                return;
        }

        if (set_nr_and_not_polling(curr))
                smp_send_reschedule(cpu);
        else
                trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        raw_spin_lock_irqsave(&rq->lock, flags);
        if (cpu_online(cpu) || cpu == smp_processor_id())
                resched_curr(rq);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
        int i, cpu = smp_processor_id(), default_cpu = -1;
        struct sched_domain *sd;

        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
                if (!idle_cpu(cpu))
                        return cpu;
                default_cpu = cpu;
        }

        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu_and(i, sched_domain_span(sd),
                                 housekeeping_cpumask(HK_FLAG_TIMER)) {
                        if (cpu == i)
                                continue;

                        if (!idle_cpu(i)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }

        if (default_cpu == -1)
                default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
        cpu = default_cpu;
unlock:
        rcu_read_unlock();
        return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
        struct rq *rq = cpu_rq(cpu);

        if (cpu == smp_processor_id())
                return;

        if (set_nr_and_not_polling(rq->idle))
                smp_send_reschedule(cpu);
        else
                trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
        /*
         * We just need the target to call irq_exit() and re-evaluate
         * the next tick. The nohz full kick at least implies that.
         * If needed we can still optimize that later with an
         * empty IRQ.
         */
        if (cpu_is_offline(cpu))
                return true;  /* Don't try to wake offline CPUs. */
        if (tick_nohz_full_cpu(cpu)) {
                if (cpu != smp_processor_id() ||
                    tick_nohz_tick_stopped())
                        tick_nohz_full_kick_cpu(cpu);
                return true;
        }

        return false;
}

/*
 * Wake up the specified CPU. If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
        if (!wake_up_full_nohz_cpu(cpu))
                wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
        int cpu = smp_processor_id();

        if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
                return false;

        if (idle_cpu(cpu) && !need_resched())
                return true;

        /*
         * We can't run the Idle Load Balance on this CPU at this time, so we
         * cancel it and clear NOHZ_BALANCE_KICK.
         */
        atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
        return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
        return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
        int fifo_nr_running;

        /* Deadline tasks, even if single, need the tick */
        if (rq->dl.dl_nr_running)
                return false;

        /*
         * If there is more than one RR task, we need the tick to effect the
         * actual RR behaviour.
         */
        if (rq->rt.rr_nr_running) {
                if (rq->rt.rr_nr_running == 1)
                        return true;
                else
                        return false;
        }

        /*
         * If there are no RR tasks but there are FIFO tasks, we can skip the
         * tick: there is no forced preemption between FIFO tasks.
         */
        fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
        if (fifo_nr_running)
                return true;

        /*
         * If there are no DL, RR or FIFO tasks, there must only be CFS tasks
         * left; if there's more than one we need the tick for involuntary
         * preemption.
         */
        if (rq->nr_running > 1)
                return false;

        return true;
}
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
                        (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
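 *
 * @down is thus invoked in pre-order and @up in post-order; returning a
 * non-zero value from either visitor aborts the walk and that value is
 * propagated back to the caller.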
 */
int walk_tg_tree_from(struct task_group *from,
                      tg_visitor down, tg_visitor up, void *data)
{
        struct task_group *parent, *child;
        int ret;

        parent = from;

down:
        ret = (*down)(parent, data);
        if (ret)
                goto out;
        list_for_each_entry_rcu(child, &parent->children, siblings) {
                parent = child;
                goto down;

up:
                continue;
        }
        ret = (*up)(parent, data);
        if (ret || parent == from)
                goto out;

        child = parent;
        parent = parent->parent;
        if (parent)
                goto up;
out:
        return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
        return 0;
}
#endif

static void set_load_weight(struct task_struct *p, bool update_load)
{
        int prio = p->static_prio - MAX_RT_PRIO;
        struct load_weight *load = &p->se.load;

        /*
         * SCHED_IDLE tasks get minimal weight:
         */
        if (task_has_idle_policy(p)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        }

        /*
         * SCHED_OTHER tasks have to update their load when changing their
         * weight.
         */
        if (update_load && p->sched_class == &fair_sched_class) {
                reweight_task(p, prio);
        } else {
                load->weight = scale_load(sched_prio_to_weight[prio]);
                load->inv_weight = sched_prio_to_wmult[prio];
        }
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Serializes updates of utilization clamp values
 *
 * The (slow-path) user-space triggers utilization clamp value updates which
 * can require updates on (fast-path) scheduler's data structures used to
 * support enqueue/dequeue operations.
 * While the per-CPU rq lock protects fast-path update operations, user-space
 * requests are serialized using a mutex to reduce the risk of conflicting
 * updates or API abuses.
 */
static DEFINE_MUTEX(uclamp_mutex);

/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less than or equal to these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
        for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
        return clamp_value / UCLAMP_BUCKET_DELTA;
}

static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
{
        return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}

static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
        if (clamp_id == UCLAMP_MIN)
                return 0;
        return SCHED_CAPACITY_SCALE;
}

static inline void uclamp_se_set(struct uclamp_se *uc_se,
                                 unsigned int value, bool user_defined)
{
        uc_se->value = value;
        uc_se->bucket_id = uclamp_bucket_id(value);
        uc_se->user_defined = user_defined;
}

static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
                  unsigned int clamp_value)
{
        /*
         * Avoid blocked utilization pushing up the frequency when we go
         * idle (which drops the max-clamp) by retaining the last known
         * max-clamp.
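         *
         * The UCLAMP_FLAG_IDLE bit set below is what later lets
         * uclamp_idle_reset() know that this retained value must be
         * overwritten on the next enqueue.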
         */
        if (clamp_id == UCLAMP_MAX) {
                rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
                return clamp_value;
        }

        return uclamp_none(UCLAMP_MIN);
}

static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
                                     unsigned int clamp_value)
{
        /* Reset max-clamp retention only on idle exit */
        if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
                return;

        WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
                                 unsigned int clamp_value)
{
        struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
        int bucket_id = UCLAMP_BUCKETS - 1;

        /*
         * Since both min and max clamps are max aggregated, find the
         * topmost bucket with tasks in it.
         */
        for ( ; bucket_id >= 0; bucket_id--) {
                if (!bucket[bucket_id].tasks)
                        continue;
                return bucket[bucket_id].value;
        }

        /* No tasks -- default clamp values */
        return uclamp_idle_value(rq, clamp_id, clamp_value);
}

static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
        struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
        struct uclamp_se uc_max;

        /*
         * Tasks in autogroups or in the root task group will be
         * restricted by the system defaults.
         */
        if (task_group_is_autogroup(task_group(p)))
                return uc_req;
        if (task_group(p) == &root_task_group)
                return uc_req;

        uc_max = task_group(p)->uclamp[clamp_id];
        if (uc_req.value > uc_max.value || !uc_req.user_defined)
                return uc_max;
#endif

        return uc_req;
}

/*
 * The effective clamp bucket index of a task depends on, by increasing
 * priority:
 * - the task specific clamp value, when explicitly requested from userspace
 * - the task group effective clamp value, for tasks not either in the root
 *   group or in an autogroup
 * - the system default clamp value, defined by the sysadmin
 */
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
        struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
        struct uclamp_se uc_max = uclamp_default[clamp_id];

        /* System default restrictions always apply */
        if (unlikely(uc_req.value > uc_max.value))
                return uc_max;

        return uc_req;
}

unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
        struct uclamp_se uc_eff;

        /* Task currently refcounted: use back-annotated (effective) value */
        if (p->uclamp[clamp_id].active)
                return (unsigned long)p->uclamp[clamp_id].value;

        uc_eff = uclamp_eff_get(p, clamp_id);

        return (unsigned long)uc_eff.value;
}

/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space; we track
 * within each bucket the maximum value of the tasks refcounted in it.
 * This "local max aggregation" allows us to track the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
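 *
 * For example, if two RUNNABLE tasks map to the same bucket, the bucket
 * tracks the larger of their requested values; once the bucket empties, the
 * rq value is recomputed from the remaining buckets (see uclamp_rq_dec_id()).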
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
                                    enum uclamp_id clamp_id)
{
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
        struct uclamp_bucket *bucket;

        lockdep_assert_held(&rq->lock);

        /* Update task effective clamp */
        p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);

        bucket = &uc_rq->bucket[uc_se->bucket_id];
        bucket->tasks++;
        uc_se->active = true;

        uclamp_idle_reset(rq, clamp_id, uc_se->value);

        /*
         * Local max aggregation: rq buckets always track the max
         * "requested" clamp value of its RUNNABLE tasks.
         */
        if (bucket->tasks == 1 || uc_se->value > bucket->value)
                bucket->value = uc_se->value;

        if (uc_se->value > READ_ONCE(uc_rq->value))
                WRITE_ONCE(uc_rq->value, uc_se->value);
}

/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
                                    enum uclamp_id clamp_id)
{
        struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
        struct uclamp_bucket *bucket;
        unsigned int bkt_clamp;
        unsigned int rq_clamp;

        lockdep_assert_held(&rq->lock);

        bucket = &uc_rq->bucket[uc_se->bucket_id];
        SCHED_WARN_ON(!bucket->tasks);
        if (likely(bucket->tasks))
                bucket->tasks--;
        uc_se->active = false;

        /*
         * Keep "local max aggregation" simple and accept possibly
         * overboosting some RUNNABLE tasks in the same bucket.
         * The rq clamp bucket value is reset to its base value whenever
         * there are no more RUNNABLE tasks refcounting it.
         */
        if (likely(bucket->tasks))
                return;

        rq_clamp = READ_ONCE(uc_rq->value);
        /*
         * Defensive programming: this should never happen. If it happens,
         * e.g. due to future modification, warn and fix up the expected value.
         */
        SCHED_WARN_ON(bucket->value > rq_clamp);
        if (bucket->value >= rq_clamp) {
                bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
                WRITE_ONCE(uc_rq->value, bkt_clamp);
        }
}

static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
        enum uclamp_id clamp_id;

        if (unlikely(!p->sched_class->uclamp_enabled))
                return;

        for_each_clamp_id(clamp_id)
                uclamp_rq_inc_id(rq, p, clamp_id);

        /* Reset clamp idle holding when there is one RUNNABLE task */
        if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
                rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
        enum uclamp_id clamp_id;

        if (unlikely(!p->sched_class->uclamp_enabled))
                return;

        for_each_clamp_id(clamp_id)
                uclamp_rq_dec_id(rq, p, clamp_id);
}

static inline void
uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
{
        struct rq_flags rf;
        struct rq *rq;

        /*
         * Lock the task and the rq where the task is (or was) queued.
         *
         * We might lock the (previous) rq of a !RUNNABLE task, but that's the
         * price to pay to safely serialize util_{min,max} updates with
         * enqueues, dequeues and migration operations.
         * This is the same locking schema used by __set_cpus_allowed_ptr().
         */
        rq = task_rq_lock(p, &rf);

        /*
         * Setting the clamp bucket is serialized by task_rq_lock().
         * If the task is not yet RUNNABLE and its task_struct is not
         * affecting a valid clamp bucket, the next time it's enqueued,
         * it will already see the updated clamp bucket value.
         */
        if (p->uclamp[clamp_id].active) {
                uclamp_rq_dec_id(rq, p, clamp_id);
                uclamp_rq_inc_id(rq, p, clamp_id);
        }

        task_rq_unlock(rq, p, &rf);
}

#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
uclamp_update_active_tasks(struct cgroup_subsys_state *css,
                           unsigned int clamps)
{
        enum uclamp_id clamp_id;
        struct css_task_iter it;
        struct task_struct *p;

        css_task_iter_start(css, 0, &it);
        while ((p = css_task_iter_next(&it))) {
                for_each_clamp_id(clamp_id) {
                        if ((0x1 << clamp_id) & clamps)
                                uclamp_update_active(p, clamp_id);
                }
        }
        css_task_iter_end(&it);
}

static void cpu_util_update_eff(struct cgroup_subsys_state *css);
static void uclamp_update_root_tg(void)
{
        struct task_group *tg = &root_task_group;

        uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
                      sysctl_sched_uclamp_util_min, false);
        uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
                      sysctl_sched_uclamp_util_max, false);

        rcu_read_lock();
        cpu_util_update_eff(&root_task_group.css);
        rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
#endif

int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        bool update_root_tg = false;
        int old_min, old_max;
        int result;

        mutex_lock(&uclamp_mutex);
        old_min = sysctl_sched_uclamp_util_min;
        old_max = sysctl_sched_uclamp_util_max;

        result = proc_dointvec(table, write, buffer, lenp, ppos);
        if (result)
                goto undo;
        if (!write)
                goto done;

        if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
            sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
                result = -EINVAL;
                goto undo;
        }

        if (old_min != sysctl_sched_uclamp_util_min) {
                uclamp_se_set(&uclamp_default[UCLAMP_MIN],
                              sysctl_sched_uclamp_util_min, false);
                update_root_tg = true;
        }
        if (old_max != sysctl_sched_uclamp_util_max) {
                uclamp_se_set(&uclamp_default[UCLAMP_MAX],
                              sysctl_sched_uclamp_util_max, false);
                update_root_tg = true;
        }

        if (update_root_tg)
                uclamp_update_root_tg();

        /*
         * We update all RUNNABLE tasks only when task groups are in use.
         * Otherwise, keep it simple and do just a lazy update at each next
         * task enqueue time.
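         *
         * The lazy path works because uclamp_rq_inc_id() re-evaluates
         * uclamp_eff_get() on every enqueue, so the new defaults are picked
         * up the next time each task becomes RUNNABLE.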
         */

        goto done;

undo:
        sysctl_sched_uclamp_util_min = old_min;
        sysctl_sched_uclamp_util_max = old_max;
done:
        mutex_unlock(&uclamp_mutex);

        return result;
}

static int uclamp_validate(struct task_struct *p,
                           const struct sched_attr *attr)
{
        unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
        unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;

        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
                lower_bound = attr->sched_util_min;
        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
                upper_bound = attr->sched_util_max;

        if (lower_bound > upper_bound)
                return -EINVAL;
        if (upper_bound > SCHED_CAPACITY_SCALE)
                return -EINVAL;

        return 0;
}

static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
{
        enum uclamp_id clamp_id;

        /*
         * On scheduling class change, reset to default clamps for tasks
         * without a task-specific value.
         */
        for_each_clamp_id(clamp_id) {
                struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
                unsigned int clamp_value = uclamp_none(clamp_id);

                /* Keep using defined clamps across class changes */
                if (uc_se->user_defined)
                        continue;

                /* By default, RT tasks always get 100% boost */
                if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
                        clamp_value = uclamp_none(UCLAMP_MAX);

                uclamp_se_set(uc_se, clamp_value, false);
        }

        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
                return;

        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
                uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
                              attr->sched_util_min, true);
        }

        if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
                uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
                              attr->sched_util_max, true);
        }
}

static void uclamp_fork(struct task_struct *p)
{
        enum uclamp_id clamp_id;

        for_each_clamp_id(clamp_id)
                p->uclamp[clamp_id].active = false;

        if (likely(!p->sched_reset_on_fork))
                return;

        for_each_clamp_id(clamp_id) {
                uclamp_se_set(&p->uclamp_req[clamp_id],
                              uclamp_none(clamp_id), false);
        }
}

static void __init init_uclamp(void)
{
        struct uclamp_se uc_max = {};
        enum uclamp_id clamp_id;
        int cpu;

        mutex_init(&uclamp_mutex);

        for_each_possible_cpu(cpu) {
                memset(&cpu_rq(cpu)->uclamp, 0,
                       sizeof(struct uclamp_rq)*UCLAMP_CNT);
                cpu_rq(cpu)->uclamp_flags = 0;
        }

        for_each_clamp_id(clamp_id) {
                uclamp_se_set(&init_task.uclamp_req[clamp_id],
                              uclamp_none(clamp_id), false);
        }

        /* System defaults allow max clamp values for both indexes */
        uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
        for_each_clamp_id(clamp_id) {
                uclamp_default[clamp_id] = uc_max;
#ifdef CONFIG_UCLAMP_TASK_GROUP
                root_task_group.uclamp_req[clamp_id] = uc_max;
                root_task_group.uclamp[clamp_id] = uc_max;
#endif
        }
}

#else /* CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
                                  const struct sched_attr *attr)
{
        return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
        if (!(flags & ENQUEUE_NOCLOCK))
                update_rq_clock(rq);

        if (!(flags & ENQUEUE_RESTORE)) {
                sched_info_queued(rq, p);
                psi_enqueue(p, flags & ENQUEUE_WAKEUP);
        }

        uclamp_rq_inc(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
        if (!(flags & DEQUEUE_NOCLOCK))
                update_rq_clock(rq);

        if (!(flags & DEQUEUE_SAVE)) {
                sched_info_dequeued(rq, p);
                psi_dequeue(p, flags & DEQUEUE_SLEEP);
        }

        uclamp_rq_dec(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;

        enqueue_task(rq, p, flags);

        p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
        p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

        if (task_contributes_to_load(p))
                rq->nr_uninterruptible++;

        dequeue_task(rq, p, flags);
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
        return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
        int prio;

        if (task_has_dl_policy(p))
                prio = MAX_DL_PRIO-1;
        else if (task_has_rt_policy(p))
                prio = MAX_RT_PRIO-1 - p->rt_priority;
        else
                prio = __normal_prio(p);
        return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
        p->normal_prio = normal_prio(p);
        /*
         * If we are an RT task or were boosted to RT priority,
         * keep the priority unchanged. Otherwise, update priority
         * to the normal priority:
         */
        if (!rt_prio(p->prio))
                return p->normal_prio;
        return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
        return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock;
 * use the balance_callback list if you want balancing.
 *
 * This means any call to check_class_changed() must be followed by a call to
 * balance_callback().
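 *
 * (The class methods are expected to queue any follow-up balancing work on
 * that list rather than dropping rq->lock themselves.)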
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
                                       int oldprio)
{
        if (prev_class != p->sched_class) {
                if (prev_class->switched_from)
                        prev_class->switched_from(rq, p);

                p->sched_class->switched_to(rq, p);
        } else if (oldprio != p->prio || dl_task(p))
                p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
        const struct sched_class *class;

        if (p->sched_class == rq->curr->sched_class) {
                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
        } else {
                for_each_class(class) {
                        if (class == rq->curr->sched_class)
                                break;
                        if (class == p->sched_class) {
                                resched_curr(rq);
                                break;
                        }
                }
        }

        /*
         * A queue event has occurred, and we're going to schedule. In
         * this case, we can save a useless back-to-back clock update.
         */
        if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
                rq_clock_skip_update(rq);
}

#ifdef CONFIG_SMP

/*
 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
 * __set_cpus_allowed_ptr() and select_fallback_rq().
 */
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                return false;

        if (is_per_cpu_kthread(p))
                return cpu_online(cpu);

        return cpu_active(cpu);
}

/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
                                   struct task_struct *p, int new_cpu)
{
        lockdep_assert_held(&rq->lock);

        WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
        dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
        rq_unlock(rq, rf);

        rq = cpu_rq(new_cpu);

        rq_lock(rq, rf);
        BUG_ON(task_cpu(p) != new_cpu);
        enqueue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);

        return rq;
}

struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
};

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can no longer run here (it was set_cpus_allowed()
 * away from this CPU, or the CPU is going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
                                 struct task_struct *p, int dest_cpu)
{
        /* Affinity changed (again). */
        if (!is_cpu_allowed(p, dest_cpu))
                return rq;

        update_rq_clock(rq);
        rq = move_queued_task(rq, rf, p, dest_cpu);

        return rq;
}

/*
 * migration_cpu_stop - this will be executed by a high-prio stopper thread
 * and performs thread migration by bumping the thread off the CPU and then
 * 'pushing' it onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
        struct migration_arg *arg = data;
        struct task_struct *p = arg->task;
        struct rq *rq = this_rq();
        struct rq_flags rf;

        /*
         * The original target CPU might have gone down and we might
         * be on another CPU but it doesn't matter.
         */
        local_irq_disable();
        /*
         * We need to explicitly wake pending tasks before running
         * __migrate_task() such that we will not miss enforcing cpus_ptr
         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
         */
        sched_ttwu_pending();

        raw_spin_lock(&p->pi_lock);
        rq_lock(rq, &rf);
        /*
         * If task_rq(p) != rq, it cannot be migrated here, because we're
         * holding rq->lock; if p->on_rq == 0 it cannot get enqueued because
         * we're holding p->pi_lock.
         */
        if (task_rq(p) == rq) {
                if (task_on_rq_queued(p))
                        rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
                else
                        p->wake_cpu = arg->dest_cpu;
        }
        rq_unlock(rq, &rf);
        raw_spin_unlock(&p->pi_lock);

        local_irq_enable();
        return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
        cpumask_copy(&p->cpus_mask, new_mask);
        p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
        struct rq *rq = task_rq(p);
        bool queued, running;

        lockdep_assert_held(&p->pi_lock);

        queued = task_on_rq_queued(p);
        running = task_current(rq, p);

        if (queued) {
                /*
                 * Because __kthread_bind() calls this on blocked tasks without
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        }
        if (running)
                put_prev_task(rq, p);

        p->sched_class->set_cpus_allowed(p, new_mask);

        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_next_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
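 *
 * The 'check' argument of the internal helper below makes it refuse tasks
 * flagged PF_NO_SETAFFINITY, which is what sched_setaffinity() relies on.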
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
                                  const struct cpumask *new_mask, bool check)
{
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
        unsigned int dest_cpu;
        struct rq_flags rf;
        struct rq *rq;
        int ret = 0;

        rq = task_rq_lock(p, &rf);
        update_rq_clock(rq);

        if (p->flags & PF_KTHREAD) {
                /*
                 * Kernel threads are allowed on online && !active CPUs
                 */
                cpu_valid_mask = cpu_online_mask;
        }

        /*
         * Must re-check here, to close a race against __kthread_bind();
         * sched_setaffinity() is not guaranteed to observe the flag.
         */
        if (check && (p->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
                goto out;
        }

        if (cpumask_equal(p->cpus_ptr, new_mask))
                goto out;

        /*
         * Picking a ~random cpu helps in cases where we are changing affinity
         * for groups of tasks (ie. cpuset), so that load balancing is not
         * immediately required to distribute the tasks within their new mask.
         */
        dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
        if (dest_cpu >= nr_cpu_ids) {
                ret = -EINVAL;
                goto out;
        }

        do_set_cpus_allowed(p, new_mask);

        if (p->flags & PF_KTHREAD) {
                /*
                 * For kernel threads that do indeed end up on online &&
                 * !active we want to ensure they are strict per-CPU threads.
                 */
                WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
                        !cpumask_intersects(new_mask, cpu_active_mask) &&
                        p->nr_cpus_allowed != 1);
        }

        /* Can the task run on the task's current CPU? If so, we're done */
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;

        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, p, &rf);
                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                return 0;
        } else if (task_on_rq_queued(p)) {
                /*
                 * OK, since we're going to drop the lock immediately
                 * afterwards anyway.
                 */
                rq = move_queued_task(rq, &rf, p, dest_cpu);
        }
out:
        task_rq_unlock(rq, p, &rf);

        return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
        return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
        /*
         * We should never call set_task_cpu() on a blocked task,
         * ttwu() will sort out the placement.
         */
        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
                     !p->on_rq);

        /*
         * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
         * because schedstat_wait_{start,end} rebase migrating task's wait_start
         * time relying on p->on_rq.
         */
        WARN_ON_ONCE(p->state == TASK_RUNNING &&
                     p->sched_class == &fair_sched_class &&
                     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
        /*
         * The caller should hold either p->pi_lock or rq->lock, when changing
         * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
         *
         * sched_move_task() holds both and thus holding either pins the cgroup,
         * see task_group().
         *
         * Furthermore, all task_rq users should acquire both locks, see
         * task_rq_lock().
         */
        WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
                                      lockdep_is_held(&task_rq(p)->lock)));
#endif
        /*
         * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
         */
        WARN_ON_ONCE(!cpu_online(new_cpu));
#endif

        trace_sched_migrate_task(p, new_cpu);

        if (task_cpu(p) != new_cpu) {
                if (p->sched_class->migrate_task_rq)
                        p->sched_class->migrate_task_rq(p, new_cpu);
                p->se.nr_migrations++;
                rseq_migrate(p);
                perf_event_task_migrate(p);
        }

        __set_task_cpu(p, new_cpu);
}

#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
        if (task_on_rq_queued(p)) {
                struct rq *src_rq, *dst_rq;
                struct rq_flags srf, drf;

                src_rq = task_rq(p);
                dst_rq = cpu_rq(cpu);

                rq_pin_lock(src_rq, &srf);
                rq_pin_lock(dst_rq, &drf);

                deactivate_task(src_rq, p, 0);
                set_task_cpu(p, cpu);
                activate_task(dst_rq, p, 0);
                check_preempt_curr(dst_rq, p, 0);

                rq_unpin_lock(dst_rq, &drf);
                rq_unpin_lock(src_rq, &srf);

        } else {
                /*
                 * Task isn't running anymore; make it appear like we migrated
                 * it before it went to sleep. This means on wakeup we make the
                 * previous CPU our target instead of where it really is.
                 */
                p->wake_cpu = cpu;
        }
}

struct migration_swap_arg {
        struct task_struct *src_task, *dst_task;
        int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
        struct migration_swap_arg *arg = data;
        struct rq *src_rq, *dst_rq;
        int ret = -EAGAIN;

        if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
                return -EAGAIN;

        src_rq = cpu_rq(arg->src_cpu);
        dst_rq = cpu_rq(arg->dst_cpu);

        double_raw_lock(&arg->src_task->pi_lock,
                        &arg->dst_task->pi_lock);
        double_rq_lock(src_rq, dst_rq);

        if (task_cpu(arg->dst_task) != arg->dst_cpu)
                goto unlock;

        if (task_cpu(arg->src_task) != arg->src_cpu)
                goto unlock;

        if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
                goto unlock;

        if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
                goto unlock;

        __migrate_swap_task(arg->src_task, arg->dst_cpu);
        __migrate_swap_task(arg->dst_task, arg->src_cpu);

        ret = 0;

unlock:
        double_rq_unlock(src_rq, dst_rq);
        raw_spin_unlock(&arg->dst_task->pi_lock);
        raw_spin_unlock(&arg->src_task->pi_lock);

        return ret;
}

/*
 * Cross-migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p,
                 int target_cpu, int curr_cpu)
{
        struct migration_swap_arg arg;
        int ret = -EINVAL;

        arg = (struct migration_swap_arg){
                .src_task = cur,
                .src_cpu = curr_cpu,
                .dst_task = p,
                .dst_cpu = target_cpu,
        };

        if (arg.src_cpu == arg.dst_cpu)
                goto out;

        /*
         * These three tests are all lockless; this is OK since all of them
         * will be re-checked with proper locks held further down the line.
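         *
         * (migrate_swap_stop() repeats the CPU and affinity checks while
         * holding both pi_locks and both rq->locks.)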
         */
        if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
                goto out;

        if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
                goto out;

        if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
                goto out;

        trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
        ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
        return ret;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count). If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
        int running, queued;
        struct rq_flags rf;
        unsigned long ncsw;
        struct rq *rq;

        for (;;) {
                /*
                 * We do the initial early heuristics without holding
                 * any task-queue locks at all. We'll only try to get
                 * the runqueue lock when things look like they will
                 * work out!
                 */
                rq = task_rq(p);

                /*
                 * If the task is actively running on another CPU
                 * still, just relax and busy-wait without holding
                 * any locks.
                 *
                 * NOTE! Since we don't hold any locks, it's not
                 * even sure that "rq" stays as the right runqueue!
                 * But we don't care, since "task_running()" will
                 * return false if the runqueue has changed and p
                 * is actually now running somewhere else!
                 */
                while (task_running(rq, p)) {
                        if (match_state && unlikely(p->state != match_state))
                                return 0;
                        cpu_relax();
                }

                /*
                 * Ok, time to look more closely! We need the rq
                 * lock now, to be *sure*. If we're wrong, we'll
                 * just go back and repeat.
                 */
                rq = task_rq_lock(p, &rf);
                trace_sched_wait_task(p);
                running = task_running(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
                if (!match_state || p->state == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, p, &rf);

                /*
                 * If it changed from the expected state, bail out now.
                 */
                if (unlikely(!ncsw))
                        break;

                /*
                 * Was it really running after all now that we
                 * checked with the proper locks actually held?
                 *
                 * Oops. Go back and try again..
                 */
                if (unlikely(running)) {
                        cpu_relax();
                        continue;
                }

                /*
                 * It's not enough that it's not actively running,
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
                 * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
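                 *
                 * Rather than busy-waiting, sleep for about one tick
                 * (NSEC_PER_SEC / HZ) below and then re-run the whole check.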
                 */
                if (unlikely(queued)) {
                        ktime_t to = NSEC_PER_SEC / HZ;

                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_hrtimeout(&to, HRTIMER_MODE_REL);
                        continue;
                }

                /*
                 * Ahh, all good. It wasn't running, and it wasn't
                 * runnable, which means that it will never become
                 * running in the future either. We're all done!
                 */
                break;
        }

        return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
        int cpu;

        preempt_disable();
        cpu = task_cpu(p);
        if ((cpu != smp_processor_id()) && task_curr(p))
                smp_send_reschedule(cpu);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

/*
 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
 *
 * A few notes on cpu_active vs cpu_online:
 *
 *  - cpu_active must be a subset of cpu_online
 *
 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *    see __set_cpus_allowed_ptr(). At this point the newly online
 *    CPU isn't yet part of the sched domains, and balancing will not
 *    see it.
 *
 *  - on CPU-down we clear cpu_active() to mask the sched domains and
 *    keep the load balancer from placing new tasks on the to-be-removed
 *    CPU. Existing tasks will remain running there and will be taken
 *    off.
 *
 * This means that fallback selection must not select !active CPUs, and
 * can assume that any active CPU must be online. Conversely,
 * select_task_rq() below may allow selection of !active CPUs in order
 * to satisfy the above rules.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
        int nid = cpu_to_node(cpu);
        const struct cpumask *nodemask = NULL;
        enum { cpuset, possible, fail } state = cpuset;
        int dest_cpu;

        /*
         * If the node that the CPU is on has been offlined, cpu_to_node()
         * will return -1. There is no CPU on the node, and we should
         * select a CPU on another node.
         */
        if (nid != -1) {
                nodemask = cpumask_of_node(nid);

                /* Look for allowed, online CPU in same node. */
                for_each_cpu(dest_cpu, nodemask) {
                        if (!cpu_active(dest_cpu))
                                continue;
                        if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
                                return dest_cpu;
                }
        }

        for (;;) {
                /* Any allowed, online CPU? */
                for_each_cpu(dest_cpu, p->cpus_ptr) {
                        if (!is_cpu_allowed(p, dest_cpu))
                                continue;

                        goto out;
                }

                /* No more Mr. Nice Guy. */
*/ 2052 switch (state) { 2053 case cpuset: 2054 if (IS_ENABLED(CONFIG_CPUSETS)) { 2055 cpuset_cpus_allowed_fallback(p); 2056 state = possible; 2057 break; 2058 } 2059 /* Fall-through */ 2060 case possible: 2061 do_set_cpus_allowed(p, cpu_possible_mask); 2062 state = fail; 2063 break; 2064 2065 case fail: 2066 BUG(); 2067 break; 2068 } 2069 } 2070 2071 out: 2072 if (state != cpuset) { 2073 /* 2074 * Don't tell them about moving exiting tasks or 2075 * kernel threads (both mm NULL), since they never 2076 * leave kernel. 2077 */ 2078 if (p->mm && printk_ratelimit()) { 2079 printk_deferred("process %d (%s) no longer affine to cpu%d\n", 2080 task_pid_nr(p), p->comm, cpu); 2081 } 2082 } 2083 2084 return dest_cpu; 2085 } 2086 2087 /* 2088 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 2089 */ 2090 static inline 2091 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 2092 { 2093 lockdep_assert_held(&p->pi_lock); 2094 2095 if (p->nr_cpus_allowed > 1) 2096 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 2097 else 2098 cpu = cpumask_any(p->cpus_ptr); 2099 2100 /* 2101 * In order not to call set_task_cpu() on a blocking task we need 2102 * to rely on ttwu() to place the task on a valid ->cpus_ptr 2103 * CPU. 2104 * 2105 * Since this is common to all placement strategies, this lives here. 2106 * 2107 * [ this allows ->select_task() to simply return task_cpu(p) and 2108 * not worry about this generic constraint ] 2109 */ 2110 if (unlikely(!is_cpu_allowed(p, cpu))) 2111 cpu = select_fallback_rq(task_cpu(p), p); 2112 2113 return cpu; 2114 } 2115 2116 void sched_set_stop_task(int cpu, struct task_struct *stop) 2117 { 2118 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 2119 struct task_struct *old_stop = cpu_rq(cpu)->stop; 2120 2121 if (stop) { 2122 /* 2123 * Make it appear like a SCHED_FIFO task, its something 2124 * userspace knows about and won't get confused about. 2125 * 2126 * Also, it will make PI more or less work without too 2127 * much confusion -- but then, stop work should not 2128 * rely on PI working anyway. 2129 */ 2130 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); 2131 2132 stop->sched_class = &stop_sched_class; 2133 } 2134 2135 cpu_rq(cpu)->stop = stop; 2136 2137 if (old_stop) { 2138 /* 2139 * Reset it back to a normal scheduling class so that 2140 * it can die in pieces. 
2141 */ 2142 old_stop->sched_class = &rt_sched_class; 2143 } 2144 } 2145 2146 #else 2147 2148 static inline int __set_cpus_allowed_ptr(struct task_struct *p, 2149 const struct cpumask *new_mask, bool check) 2150 { 2151 return set_cpus_allowed_ptr(p, new_mask); 2152 } 2153 2154 #endif /* CONFIG_SMP */ 2155 2156 static void 2157 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 2158 { 2159 struct rq *rq; 2160 2161 if (!schedstat_enabled()) 2162 return; 2163 2164 rq = this_rq(); 2165 2166 #ifdef CONFIG_SMP 2167 if (cpu == rq->cpu) { 2168 __schedstat_inc(rq->ttwu_local); 2169 __schedstat_inc(p->se.statistics.nr_wakeups_local); 2170 } else { 2171 struct sched_domain *sd; 2172 2173 __schedstat_inc(p->se.statistics.nr_wakeups_remote); 2174 rcu_read_lock(); 2175 for_each_domain(rq->cpu, sd) { 2176 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2177 __schedstat_inc(sd->ttwu_wake_remote); 2178 break; 2179 } 2180 } 2181 rcu_read_unlock(); 2182 } 2183 2184 if (wake_flags & WF_MIGRATED) 2185 __schedstat_inc(p->se.statistics.nr_wakeups_migrate); 2186 #endif /* CONFIG_SMP */ 2187 2188 __schedstat_inc(rq->ttwu_count); 2189 __schedstat_inc(p->se.statistics.nr_wakeups); 2190 2191 if (wake_flags & WF_SYNC) 2192 __schedstat_inc(p->se.statistics.nr_wakeups_sync); 2193 } 2194 2195 /* 2196 * Mark the task runnable and perform wakeup-preemption. 2197 */ 2198 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, 2199 struct rq_flags *rf) 2200 { 2201 check_preempt_curr(rq, p, wake_flags); 2202 p->state = TASK_RUNNING; 2203 trace_sched_wakeup(p); 2204 2205 #ifdef CONFIG_SMP 2206 if (p->sched_class->task_woken) { 2207 /* 2208 * Our task @p is fully woken up and running; so its safe to 2209 * drop the rq->lock, hereafter rq is only used for statistics. 2210 */ 2211 rq_unpin_lock(rq, rf); 2212 p->sched_class->task_woken(rq, p); 2213 rq_repin_lock(rq, rf); 2214 } 2215 2216 if (rq->idle_stamp) { 2217 u64 delta = rq_clock(rq) - rq->idle_stamp; 2218 u64 max = 2*rq->max_idle_balance_cost; 2219 2220 update_avg(&rq->avg_idle, delta); 2221 2222 if (rq->avg_idle > max) 2223 rq->avg_idle = max; 2224 2225 rq->idle_stamp = 0; 2226 } 2227 #endif 2228 } 2229 2230 static void 2231 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, 2232 struct rq_flags *rf) 2233 { 2234 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; 2235 2236 lockdep_assert_held(&rq->lock); 2237 2238 #ifdef CONFIG_SMP 2239 if (p->sched_contributes_to_load) 2240 rq->nr_uninterruptible--; 2241 2242 if (wake_flags & WF_MIGRATED) 2243 en_flags |= ENQUEUE_MIGRATED; 2244 #endif 2245 2246 activate_task(rq, p, en_flags); 2247 ttwu_do_wakeup(rq, p, wake_flags, rf); 2248 } 2249 2250 /* 2251 * Called in case the task @p isn't fully descheduled from its runqueue, 2252 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 2253 * since all we need to do is flip p->state to TASK_RUNNING, since 2254 * the task is still ->on_rq. 
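 *
 * A hedged example of when this path is taken: the sleeper has done
 * set_current_state(TASK_UNINTERRUPTIBLE) and entered __schedule(), but
 * has not been dequeued yet. The waker then only needs the rq lock to
 * flip ->state back, no re-enqueue or CPU selection required:
 *
 *	sleeper					waker
 *	set_current_state(TASK_UNINTERRUPTIBLE)
 *	__schedule()				try_to_wake_up()
 *	  (p still ->on_rq)			  ttwu_remote()
 *						    p->state = TASK_RUNNING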
2255 */ 2256 static int ttwu_remote(struct task_struct *p, int wake_flags) 2257 { 2258 struct rq_flags rf; 2259 struct rq *rq; 2260 int ret = 0; 2261 2262 rq = __task_rq_lock(p, &rf); 2263 if (task_on_rq_queued(p)) { 2264 /* check_preempt_curr() may use rq clock */ 2265 update_rq_clock(rq); 2266 ttwu_do_wakeup(rq, p, wake_flags, &rf); 2267 ret = 1; 2268 } 2269 __task_rq_unlock(rq, &rf); 2270 2271 return ret; 2272 } 2273 2274 #ifdef CONFIG_SMP 2275 void sched_ttwu_pending(void) 2276 { 2277 struct rq *rq = this_rq(); 2278 struct llist_node *llist = llist_del_all(&rq->wake_list); 2279 struct task_struct *p, *t; 2280 struct rq_flags rf; 2281 2282 if (!llist) 2283 return; 2284 2285 rq_lock_irqsave(rq, &rf); 2286 update_rq_clock(rq); 2287 2288 llist_for_each_entry_safe(p, t, llist, wake_entry) 2289 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); 2290 2291 rq_unlock_irqrestore(rq, &rf); 2292 } 2293 2294 void scheduler_ipi(void) 2295 { 2296 /* 2297 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 2298 * TIF_NEED_RESCHED remotely (for the first time) will also send 2299 * this IPI. 2300 */ 2301 preempt_fold_need_resched(); 2302 2303 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 2304 return; 2305 2306 /* 2307 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 2308 * traditionally all their work was done from the interrupt return 2309 * path. Now that we actually do some work, we need to make sure 2310 * we do call them. 2311 * 2312 * Some archs already do call them, luckily irq_enter/exit nest 2313 * properly. 2314 * 2315 * Arguably we should visit all archs and update all handlers, 2316 * however a fair share of IPIs are still resched only so this would 2317 * somewhat pessimize the simple resched case. 2318 */ 2319 irq_enter(); 2320 sched_ttwu_pending(); 2321 2322 /* 2323 * Check if someone kicked us for doing the nohz idle load balance. 
2324 */ 2325 if (unlikely(got_nohz_idle_kick())) { 2326 this_rq()->idle_balance = 1; 2327 raise_softirq_irqoff(SCHED_SOFTIRQ); 2328 } 2329 irq_exit(); 2330 } 2331 2332 static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) 2333 { 2334 struct rq *rq = cpu_rq(cpu); 2335 2336 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); 2337 2338 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { 2339 if (!set_nr_if_polling(rq->idle)) 2340 smp_send_reschedule(cpu); 2341 else 2342 trace_sched_wake_idle_without_ipi(cpu); 2343 } 2344 } 2345 2346 void wake_up_if_idle(int cpu) 2347 { 2348 struct rq *rq = cpu_rq(cpu); 2349 struct rq_flags rf; 2350 2351 rcu_read_lock(); 2352 2353 if (!is_idle_task(rcu_dereference(rq->curr))) 2354 goto out; 2355 2356 if (set_nr_if_polling(rq->idle)) { 2357 trace_sched_wake_idle_without_ipi(cpu); 2358 } else { 2359 rq_lock_irqsave(rq, &rf); 2360 if (is_idle_task(rq->curr)) 2361 smp_send_reschedule(cpu); 2362 /* Else CPU is not idle, do nothing here: */ 2363 rq_unlock_irqrestore(rq, &rf); 2364 } 2365 2366 out: 2367 rcu_read_unlock(); 2368 } 2369 2370 bool cpus_share_cache(int this_cpu, int that_cpu) 2371 { 2372 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 2373 } 2374 #endif /* CONFIG_SMP */ 2375 2376 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) 2377 { 2378 struct rq *rq = cpu_rq(cpu); 2379 struct rq_flags rf; 2380 2381 #if defined(CONFIG_SMP) 2382 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 2383 sched_clock_cpu(cpu); /* Sync clocks across CPUs */ 2384 ttwu_queue_remote(p, cpu, wake_flags); 2385 return; 2386 } 2387 #endif 2388 2389 rq_lock(rq, &rf); 2390 update_rq_clock(rq); 2391 ttwu_do_activate(rq, p, wake_flags, &rf); 2392 rq_unlock(rq, &rf); 2393 } 2394 2395 /* 2396 * Notes on Program-Order guarantees on SMP systems. 2397 * 2398 * MIGRATION 2399 * 2400 * The basic program-order guarantee on SMP systems is that when a task [t] 2401 * migrates, all its activity on its old CPU [c0] happens-before any subsequent 2402 * execution on its new CPU [c1]. 2403 * 2404 * For migration (of runnable tasks) this is provided by the following means: 2405 * 2406 * A) UNLOCK of the rq(c0)->lock scheduling out task t 2407 * B) migration for t is required to synchronize *both* rq(c0)->lock and 2408 * rq(c1)->lock (if not at the same time, then in that order). 2409 * C) LOCK of the rq(c1)->lock scheduling in task 2410 * 2411 * Release/acquire chaining guarantees that B happens after A and C after B. 2412 * Note: the CPU doing B need not be c0 or c1 2413 * 2414 * Example: 2415 * 2416 * CPU0 CPU1 CPU2 2417 * 2418 * LOCK rq(0)->lock 2419 * sched-out X 2420 * sched-in Y 2421 * UNLOCK rq(0)->lock 2422 * 2423 * LOCK rq(0)->lock // orders against CPU0 2424 * dequeue X 2425 * UNLOCK rq(0)->lock 2426 * 2427 * LOCK rq(1)->lock 2428 * enqueue X 2429 * UNLOCK rq(1)->lock 2430 * 2431 * LOCK rq(1)->lock // orders against CPU2 2432 * sched-out Z 2433 * sched-in X 2434 * UNLOCK rq(1)->lock 2435 * 2436 * 2437 * BLOCKING -- aka. SLEEP + WAKEUP 2438 * 2439 * For blocking we (obviously) need to provide the same guarantee as for 2440 * migration. However the means are completely different as there is no lock 2441 * chain to provide order. 
Instead we do: 2442 * 2443 * 1) smp_store_release(X->on_cpu, 0) 2444 * 2) smp_cond_load_acquire(!X->on_cpu) 2445 * 2446 * Example: 2447 * 2448 * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) 2449 * 2450 * LOCK rq(0)->lock LOCK X->pi_lock 2451 * dequeue X 2452 * sched-out X 2453 * smp_store_release(X->on_cpu, 0); 2454 * 2455 * smp_cond_load_acquire(&X->on_cpu, !VAL); 2456 * X->state = WAKING 2457 * set_task_cpu(X,2) 2458 * 2459 * LOCK rq(2)->lock 2460 * enqueue X 2461 * X->state = RUNNING 2462 * UNLOCK rq(2)->lock 2463 * 2464 * LOCK rq(2)->lock // orders against CPU1 2465 * sched-out Z 2466 * sched-in X 2467 * UNLOCK rq(2)->lock 2468 * 2469 * UNLOCK X->pi_lock 2470 * UNLOCK rq(0)->lock 2471 * 2472 * 2473 * However, for wakeups there is a second guarantee we must provide, namely we 2474 * must ensure that CONDITION=1 done by the caller can not be reordered with 2475 * accesses to the task state; see try_to_wake_up() and set_current_state(). 2476 */ 2477 2478 /** 2479 * try_to_wake_up - wake up a thread 2480 * @p: the thread to be awakened 2481 * @state: the mask of task states that can be woken 2482 * @wake_flags: wake modifier flags (WF_*) 2483 * 2484 * If (@state & @p->state) @p->state = TASK_RUNNING. 2485 * 2486 * If the task was not queued/runnable, also place it back on a runqueue. 2487 * 2488 * Atomic against schedule() which would dequeue a task, also see 2489 * set_current_state(). 2490 * 2491 * This function executes a full memory barrier before accessing the task 2492 * state; see set_current_state(). 2493 * 2494 * Return: %true if @p->state changes (an actual wakeup was done), 2495 * %false otherwise. 2496 */ 2497 static int 2498 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 2499 { 2500 unsigned long flags; 2501 int cpu, success = 0; 2502 2503 preempt_disable(); 2504 if (p == current) { 2505 /* 2506 * We're waking current, this means 'p->on_rq' and 'task_cpu(p) 2507 * == smp_processor_id()'. Together this means we can special 2508 * case the whole 'p->on_rq && ttwu_remote()' case below 2509 * without taking any locks. 2510 * 2511 * In particular: 2512 * - we rely on Program-Order guarantees for all the ordering, 2513 * - we're serialized against set_special_state() by virtue of 2514 * it disabling IRQs (this allows not taking ->pi_lock). 2515 */ 2516 if (!(p->state & state)) 2517 goto out; 2518 2519 success = 1; 2520 cpu = task_cpu(p); 2521 trace_sched_waking(p); 2522 p->state = TASK_RUNNING; 2523 trace_sched_wakeup(p); 2524 goto out; 2525 } 2526 2527 /* 2528 * If we are going to wake up a thread waiting for CONDITION we 2529 * need to ensure that CONDITION=1 done by the caller can not be 2530 * reordered with p->state check below. This pairs with mb() in 2531 * set_current_state() the waiting thread does. 2532 */ 2533 raw_spin_lock_irqsave(&p->pi_lock, flags); 2534 smp_mb__after_spinlock(); 2535 if (!(p->state & state)) 2536 goto unlock; 2537 2538 trace_sched_waking(p); 2539 2540 /* We're going to change ->state: */ 2541 success = 1; 2542 cpu = task_cpu(p); 2543 2544 /* 2545 * Ensure we load p->on_rq _after_ p->state, otherwise it would 2546 * be possible to, falsely, observe p->on_rq == 0 and get stuck 2547 * in smp_cond_load_acquire() below. 
2548 * 2549 * sched_ttwu_pending() try_to_wake_up() 2550 * STORE p->on_rq = 1 LOAD p->state 2551 * UNLOCK rq->lock 2552 * 2553 * __schedule() (switch to task 'p') 2554 * LOCK rq->lock smp_rmb(); 2555 * smp_mb__after_spinlock(); 2556 * UNLOCK rq->lock 2557 * 2558 * [task p] 2559 * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq 2560 * 2561 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in 2562 * __schedule(). See the comment for smp_mb__after_spinlock(). 2563 */ 2564 smp_rmb(); 2565 if (p->on_rq && ttwu_remote(p, wake_flags)) 2566 goto unlock; 2567 2568 #ifdef CONFIG_SMP 2569 /* 2570 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be 2571 * possible to, falsely, observe p->on_cpu == 0. 2572 * 2573 * One must be running (->on_cpu == 1) in order to remove oneself 2574 * from the runqueue. 2575 * 2576 * __schedule() (switch to task 'p') try_to_wake_up() 2577 * STORE p->on_cpu = 1 LOAD p->on_rq 2578 * UNLOCK rq->lock 2579 * 2580 * __schedule() (put 'p' to sleep) 2581 * LOCK rq->lock smp_rmb(); 2582 * smp_mb__after_spinlock(); 2583 * STORE p->on_rq = 0 LOAD p->on_cpu 2584 * 2585 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in 2586 * __schedule(). See the comment for smp_mb__after_spinlock(). 2587 */ 2588 smp_rmb(); 2589 2590 /* 2591 * If the owning (remote) CPU is still in the middle of schedule() with 2592 * this task as prev, wait until its done referencing the task. 2593 * 2594 * Pairs with the smp_store_release() in finish_task(). 2595 * 2596 * This ensures that tasks getting woken will be fully ordered against 2597 * their previous state and preserve Program Order. 2598 */ 2599 smp_cond_load_acquire(&p->on_cpu, !VAL); 2600 2601 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2602 p->state = TASK_WAKING; 2603 2604 if (p->in_iowait) { 2605 delayacct_blkio_end(p); 2606 atomic_dec(&task_rq(p)->nr_iowait); 2607 } 2608 2609 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 2610 if (task_cpu(p) != cpu) { 2611 wake_flags |= WF_MIGRATED; 2612 psi_ttwu_dequeue(p); 2613 set_task_cpu(p, cpu); 2614 } 2615 2616 #else /* CONFIG_SMP */ 2617 2618 if (p->in_iowait) { 2619 delayacct_blkio_end(p); 2620 atomic_dec(&task_rq(p)->nr_iowait); 2621 } 2622 2623 #endif /* CONFIG_SMP */ 2624 2625 ttwu_queue(p, cpu, wake_flags); 2626 unlock: 2627 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2628 out: 2629 if (success) 2630 ttwu_stat(p, cpu, wake_flags); 2631 preempt_enable(); 2632 2633 return success; 2634 } 2635 2636 /** 2637 * wake_up_process - Wake up a specific process 2638 * @p: The process to be woken up. 2639 * 2640 * Attempt to wake up the nominated process and move it to the set of runnable 2641 * processes. 2642 * 2643 * Return: 1 if the process was woken up, 0 if it was already running. 2644 * 2645 * This function executes a full memory barrier before accessing the task state. 2646 */ 2647 int wake_up_process(struct task_struct *p) 2648 { 2649 return try_to_wake_up(p, TASK_NORMAL, 0); 2650 } 2651 EXPORT_SYMBOL(wake_up_process); 2652 2653 int wake_up_state(struct task_struct *p, unsigned int state) 2654 { 2655 return try_to_wake_up(p, state, 0); 2656 } 2657 2658 /* 2659 * Perform scheduler related setup for a newly forked process p. 2660 * p is forked by current. 
2661 * 2662 * __sched_fork() is basic setup used by init_idle() too: 2663 */ 2664 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 2665 { 2666 p->on_rq = 0; 2667 2668 p->se.on_rq = 0; 2669 p->se.exec_start = 0; 2670 p->se.sum_exec_runtime = 0; 2671 p->se.prev_sum_exec_runtime = 0; 2672 p->se.nr_migrations = 0; 2673 p->se.vruntime = 0; 2674 INIT_LIST_HEAD(&p->se.group_node); 2675 2676 #ifdef CONFIG_FAIR_GROUP_SCHED 2677 p->se.cfs_rq = NULL; 2678 #endif 2679 2680 #ifdef CONFIG_SCHEDSTATS 2681 /* Even if schedstat is disabled, there should not be garbage */ 2682 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2683 #endif 2684 2685 RB_CLEAR_NODE(&p->dl.rb_node); 2686 init_dl_task_timer(&p->dl); 2687 init_dl_inactive_task_timer(&p->dl); 2688 __dl_clear_params(p); 2689 2690 INIT_LIST_HEAD(&p->rt.run_list); 2691 p->rt.timeout = 0; 2692 p->rt.time_slice = sched_rr_timeslice; 2693 p->rt.on_rq = 0; 2694 p->rt.on_list = 0; 2695 2696 #ifdef CONFIG_PREEMPT_NOTIFIERS 2697 INIT_HLIST_HEAD(&p->preempt_notifiers); 2698 #endif 2699 2700 #ifdef CONFIG_COMPACTION 2701 p->capture_control = NULL; 2702 #endif 2703 init_numa_balancing(clone_flags, p); 2704 } 2705 2706 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); 2707 2708 #ifdef CONFIG_NUMA_BALANCING 2709 2710 void set_numabalancing_state(bool enabled) 2711 { 2712 if (enabled) 2713 static_branch_enable(&sched_numa_balancing); 2714 else 2715 static_branch_disable(&sched_numa_balancing); 2716 } 2717 2718 #ifdef CONFIG_PROC_SYSCTL 2719 int sysctl_numa_balancing(struct ctl_table *table, int write, 2720 void *buffer, size_t *lenp, loff_t *ppos) 2721 { 2722 struct ctl_table t; 2723 int err; 2724 int state = static_branch_likely(&sched_numa_balancing); 2725 2726 if (write && !capable(CAP_SYS_ADMIN)) 2727 return -EPERM; 2728 2729 t = *table; 2730 t.data = &state; 2731 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2732 if (err < 0) 2733 return err; 2734 if (write) 2735 set_numabalancing_state(state); 2736 return err; 2737 } 2738 #endif 2739 #endif 2740 2741 #ifdef CONFIG_SCHEDSTATS 2742 2743 DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2744 static bool __initdata __sched_schedstats = false; 2745 2746 static void set_schedstats(bool enabled) 2747 { 2748 if (enabled) 2749 static_branch_enable(&sched_schedstats); 2750 else 2751 static_branch_disable(&sched_schedstats); 2752 } 2753 2754 void force_schedstat_enabled(void) 2755 { 2756 if (!schedstat_enabled()) { 2757 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); 2758 static_branch_enable(&sched_schedstats); 2759 } 2760 } 2761 2762 static int __init setup_schedstats(char *str) 2763 { 2764 int ret = 0; 2765 if (!str) 2766 goto out; 2767 2768 /* 2769 * This code is called before jump labels have been set up, so we can't 2770 * change the static branch directly just yet. Instead set a temporary 2771 * variable so init_schedstats() can do it later. 
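 *
 * For example, schedstats can be switched on either at boot with the
 * "schedstats=enable" command line parameter parsed below, or later at
 * run time through the sysctl handled further down:
 *
 *	# echo 1 > /proc/sys/kernel/sched_schedstats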
2772 */ 2773 if (!strcmp(str, "enable")) { 2774 __sched_schedstats = true; 2775 ret = 1; 2776 } else if (!strcmp(str, "disable")) { 2777 __sched_schedstats = false; 2778 ret = 1; 2779 } 2780 out: 2781 if (!ret) 2782 pr_warn("Unable to parse schedstats=\n"); 2783 2784 return ret; 2785 } 2786 __setup("schedstats=", setup_schedstats); 2787 2788 static void __init init_schedstats(void) 2789 { 2790 set_schedstats(__sched_schedstats); 2791 } 2792 2793 #ifdef CONFIG_PROC_SYSCTL 2794 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, 2795 size_t *lenp, loff_t *ppos) 2796 { 2797 struct ctl_table t; 2798 int err; 2799 int state = static_branch_likely(&sched_schedstats); 2800 2801 if (write && !capable(CAP_SYS_ADMIN)) 2802 return -EPERM; 2803 2804 t = *table; 2805 t.data = &state; 2806 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2807 if (err < 0) 2808 return err; 2809 if (write) 2810 set_schedstats(state); 2811 return err; 2812 } 2813 #endif /* CONFIG_PROC_SYSCTL */ 2814 #else /* !CONFIG_SCHEDSTATS */ 2815 static inline void init_schedstats(void) {} 2816 #endif /* CONFIG_SCHEDSTATS */ 2817 2818 /* 2819 * fork()/clone()-time setup: 2820 */ 2821 int sched_fork(unsigned long clone_flags, struct task_struct *p) 2822 { 2823 unsigned long flags; 2824 2825 __sched_fork(clone_flags, p); 2826 /* 2827 * We mark the process as NEW here. This guarantees that 2828 * nobody will actually run it, and a signal or other external 2829 * event cannot wake it up and insert it on the runqueue either. 2830 */ 2831 p->state = TASK_NEW; 2832 2833 /* 2834 * Make sure we do not leak PI boosting priority to the child. 2835 */ 2836 p->prio = current->normal_prio; 2837 2838 uclamp_fork(p); 2839 2840 /* 2841 * Revert to default priority/policy on fork if requested. 2842 */ 2843 if (unlikely(p->sched_reset_on_fork)) { 2844 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2845 p->policy = SCHED_NORMAL; 2846 p->static_prio = NICE_TO_PRIO(0); 2847 p->rt_priority = 0; 2848 } else if (PRIO_TO_NICE(p->static_prio) < 0) 2849 p->static_prio = NICE_TO_PRIO(0); 2850 2851 p->prio = p->normal_prio = __normal_prio(p); 2852 set_load_weight(p, false); 2853 2854 /* 2855 * We don't need the reset flag anymore after the fork. It has 2856 * fulfilled its duty: 2857 */ 2858 p->sched_reset_on_fork = 0; 2859 } 2860 2861 if (dl_prio(p->prio)) 2862 return -EAGAIN; 2863 else if (rt_prio(p->prio)) 2864 p->sched_class = &rt_sched_class; 2865 else 2866 p->sched_class = &fair_sched_class; 2867 2868 init_entity_runnable_average(&p->se); 2869 2870 /* 2871 * The child is not yet in the pid-hash so no cgroup attach races, 2872 * and the cgroup is pinned to this child due to cgroup_fork() 2873 * is ran before sched_fork(). 2874 * 2875 * Silence PROVE_RCU. 2876 */ 2877 raw_spin_lock_irqsave(&p->pi_lock, flags); 2878 /* 2879 * We're setting the CPU for the first time, we don't migrate, 2880 * so use __set_task_cpu(). 
2881 */ 2882 __set_task_cpu(p, smp_processor_id()); 2883 if (p->sched_class->task_fork) 2884 p->sched_class->task_fork(p); 2885 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2886 2887 #ifdef CONFIG_SCHED_INFO 2888 if (likely(sched_info_on())) 2889 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2890 #endif 2891 #if defined(CONFIG_SMP) 2892 p->on_cpu = 0; 2893 #endif 2894 init_task_preempt_count(p); 2895 #ifdef CONFIG_SMP 2896 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2897 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2898 #endif 2899 return 0; 2900 } 2901 2902 unsigned long to_ratio(u64 period, u64 runtime) 2903 { 2904 if (runtime == RUNTIME_INF) 2905 return BW_UNIT; 2906 2907 /* 2908 * Doing this here saves a lot of checks in all 2909 * the calling paths, and returning zero seems 2910 * safe for them anyway. 2911 */ 2912 if (period == 0) 2913 return 0; 2914 2915 return div64_u64(runtime << BW_SHIFT, period); 2916 } 2917 2918 /* 2919 * wake_up_new_task - wake up a newly created task for the first time. 2920 * 2921 * This function will do some initial scheduler statistics housekeeping 2922 * that must be done for every newly created context, then puts the task 2923 * on the runqueue and wakes it. 2924 */ 2925 void wake_up_new_task(struct task_struct *p) 2926 { 2927 struct rq_flags rf; 2928 struct rq *rq; 2929 2930 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2931 p->state = TASK_RUNNING; 2932 #ifdef CONFIG_SMP 2933 /* 2934 * Fork balancing, do it here and not earlier because: 2935 * - cpus_ptr can change in the fork path 2936 * - any previously selected CPU might disappear through hotplug 2937 * 2938 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2939 * as we're not fully set-up yet. 2940 */ 2941 p->recent_used_cpu = task_cpu(p); 2942 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2943 #endif 2944 rq = __task_rq_lock(p, &rf); 2945 update_rq_clock(rq); 2946 post_init_entity_util_avg(p); 2947 2948 activate_task(rq, p, ENQUEUE_NOCLOCK); 2949 trace_sched_wakeup_new(p); 2950 check_preempt_curr(rq, p, WF_FORK); 2951 #ifdef CONFIG_SMP 2952 if (p->sched_class->task_woken) { 2953 /* 2954 * Nothing relies on rq->lock after this, so its fine to 2955 * drop it. 2956 */ 2957 rq_unpin_lock(rq, &rf); 2958 p->sched_class->task_woken(rq, p); 2959 rq_repin_lock(rq, &rf); 2960 } 2961 #endif 2962 task_rq_unlock(rq, p, &rf); 2963 } 2964 2965 #ifdef CONFIG_PREEMPT_NOTIFIERS 2966 2967 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); 2968 2969 void preempt_notifier_inc(void) 2970 { 2971 static_branch_inc(&preempt_notifier_key); 2972 } 2973 EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2974 2975 void preempt_notifier_dec(void) 2976 { 2977 static_branch_dec(&preempt_notifier_key); 2978 } 2979 EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2980 2981 /** 2982 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2983 * @notifier: notifier struct to register 2984 */ 2985 void preempt_notifier_register(struct preempt_notifier *notifier) 2986 { 2987 if (!static_branch_unlikely(&preempt_notifier_key)) 2988 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2989 2990 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2991 } 2992 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2993 2994 /** 2995 * preempt_notifier_unregister - no longer interested in preemption notifications 2996 * @notifier: notifier struct to unregister 2997 * 2998 * This is *not* safe to call from within a preemption notifier. 
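 *
 * A minimal usage sketch (hedged; the my_* names are illustrative,
 * modelled on how KVM drives this API, and not defined in this file):
 *
 *	static void my_sched_in(struct preempt_notifier *n, int cpu)
 *	{ ... }
 *
 *	static void my_sched_out(struct preempt_notifier *n,
 *				 struct task_struct *next)
 *	{ ... }
 *
 *	static struct preempt_ops my_ops = {
 *		.sched_in	= my_sched_in,
 *		.sched_out	= my_sched_out,
 *	};
 *
 *	struct preempt_notifier notifier;
 *
 *	preempt_notifier_inc();
 *	preempt_notifier_init(&notifier, &my_ops);
 *	preempt_notifier_register(&notifier);	// registers for current
 *	...
 *	preempt_notifier_unregister(&notifier);
 *	preempt_notifier_dec();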
2999 */ 3000 void preempt_notifier_unregister(struct preempt_notifier *notifier) 3001 { 3002 hlist_del(¬ifier->link); 3003 } 3004 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 3005 3006 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) 3007 { 3008 struct preempt_notifier *notifier; 3009 3010 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 3011 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 3012 } 3013 3014 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 3015 { 3016 if (static_branch_unlikely(&preempt_notifier_key)) 3017 __fire_sched_in_preempt_notifiers(curr); 3018 } 3019 3020 static void 3021 __fire_sched_out_preempt_notifiers(struct task_struct *curr, 3022 struct task_struct *next) 3023 { 3024 struct preempt_notifier *notifier; 3025 3026 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 3027 notifier->ops->sched_out(notifier, next); 3028 } 3029 3030 static __always_inline void 3031 fire_sched_out_preempt_notifiers(struct task_struct *curr, 3032 struct task_struct *next) 3033 { 3034 if (static_branch_unlikely(&preempt_notifier_key)) 3035 __fire_sched_out_preempt_notifiers(curr, next); 3036 } 3037 3038 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 3039 3040 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 3041 { 3042 } 3043 3044 static inline void 3045 fire_sched_out_preempt_notifiers(struct task_struct *curr, 3046 struct task_struct *next) 3047 { 3048 } 3049 3050 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 3051 3052 static inline void prepare_task(struct task_struct *next) 3053 { 3054 #ifdef CONFIG_SMP 3055 /* 3056 * Claim the task as running, we do this before switching to it 3057 * such that any running task will have this set. 3058 */ 3059 next->on_cpu = 1; 3060 #endif 3061 } 3062 3063 static inline void finish_task(struct task_struct *prev) 3064 { 3065 #ifdef CONFIG_SMP 3066 /* 3067 * After ->on_cpu is cleared, the task can be moved to a different CPU. 3068 * We must ensure this doesn't happen until the switch is completely 3069 * finished. 3070 * 3071 * In particular, the load of prev->state in finish_task_switch() must 3072 * happen before this. 3073 * 3074 * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
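 *
 * Schematically (a hedged sketch of that pairing): the release store
 * publishes everything this CPU did on behalf of prev, and the waker's
 * acquire load orders everything it does next against that:
 *
 *	finish_task()				try_to_wake_up()
 *	  [loads/stores, incl. prev->state]
 *	  smp_store_release(&prev->on_cpu, 0)
 *						smp_cond_load_acquire(&p->on_cpu, !VAL)
 *						[everything after sees the above]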
3075 */ 3076 smp_store_release(&prev->on_cpu, 0); 3077 #endif 3078 } 3079 3080 static inline void 3081 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) 3082 { 3083 /* 3084 * Since the runqueue lock will be released by the next 3085 * task (which is an invalid locking op but in the case 3086 * of the scheduler it's an obvious special-case), so we 3087 * do an early lockdep release here: 3088 */ 3089 rq_unpin_lock(rq, rf); 3090 spin_release(&rq->lock.dep_map, _THIS_IP_); 3091 #ifdef CONFIG_DEBUG_SPINLOCK 3092 /* this is a valid case when another task releases the spinlock */ 3093 rq->lock.owner = next; 3094 #endif 3095 } 3096 3097 static inline void finish_lock_switch(struct rq *rq) 3098 { 3099 /* 3100 * If we are tracking spinlock dependencies then we have to 3101 * fix up the runqueue lock - which gets 'carried over' from 3102 * prev into current: 3103 */ 3104 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 3105 raw_spin_unlock_irq(&rq->lock); 3106 } 3107 3108 /* 3109 * NOP if the arch has not defined these: 3110 */ 3111 3112 #ifndef prepare_arch_switch 3113 # define prepare_arch_switch(next) do { } while (0) 3114 #endif 3115 3116 #ifndef finish_arch_post_lock_switch 3117 # define finish_arch_post_lock_switch() do { } while (0) 3118 #endif 3119 3120 /** 3121 * prepare_task_switch - prepare to switch tasks 3122 * @rq: the runqueue preparing to switch 3123 * @prev: the current task that is being switched out 3124 * @next: the task we are going to switch to. 3125 * 3126 * This is called with the rq lock held and interrupts off. It must 3127 * be paired with a subsequent finish_task_switch after the context 3128 * switch. 3129 * 3130 * prepare_task_switch sets up locking and calls architecture specific 3131 * hooks. 3132 */ 3133 static inline void 3134 prepare_task_switch(struct rq *rq, struct task_struct *prev, 3135 struct task_struct *next) 3136 { 3137 kcov_prepare_switch(prev); 3138 sched_info_switch(rq, prev, next); 3139 perf_event_task_sched_out(prev, next); 3140 rseq_preempt(prev); 3141 fire_sched_out_preempt_notifiers(prev, next); 3142 prepare_task(next); 3143 prepare_arch_switch(next); 3144 } 3145 3146 /** 3147 * finish_task_switch - clean up after a task-switch 3148 * @prev: the thread we just switched away from. 3149 * 3150 * finish_task_switch must be called after the context switch, paired 3151 * with a prepare_task_switch call before the context switch. 3152 * finish_task_switch will reconcile locking set up by prepare_task_switch, 3153 * and do any other architecture-specific cleanup actions. 3154 * 3155 * Note that we may have delayed dropping an mm in context_switch(). If 3156 * so, we finish that here outside of the runqueue lock. (Doing it 3157 * with the lock held can cause deadlocks; see schedule() for 3158 * details.) 3159 * 3160 * The context switch have flipped the stack from under us and restored the 3161 * local variables which were saved when this task called schedule() in the 3162 * past. prev == current is still correct but we need to recalculate this_rq 3163 * because prev may have moved to another CPU. 
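 *
 * For example (illustrative timeline): a task calls schedule() on CPU0,
 * sleeps, is woken and placed on CPU2, and eventually resumes inside
 * finish_task_switch() there. Any runqueue pointer captured before the
 * switch would still point at CPU0's rq, hence the fresh this_rq()
 * below.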
3164 */ 3165 static struct rq *finish_task_switch(struct task_struct *prev) 3166 __releases(rq->lock) 3167 { 3168 struct rq *rq = this_rq(); 3169 struct mm_struct *mm = rq->prev_mm; 3170 long prev_state; 3171 3172 /* 3173 * The previous task will have left us with a preempt_count of 2 3174 * because it left us after: 3175 * 3176 * schedule() 3177 * preempt_disable(); // 1 3178 * __schedule() 3179 * raw_spin_lock_irq(&rq->lock) // 2 3180 * 3181 * Also, see FORK_PREEMPT_COUNT. 3182 */ 3183 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, 3184 "corrupted preempt_count: %s/%d/0x%x\n", 3185 current->comm, current->pid, preempt_count())) 3186 preempt_count_set(FORK_PREEMPT_COUNT); 3187 3188 rq->prev_mm = NULL; 3189 3190 /* 3191 * A task struct has one reference for the use as "current". 3192 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 3193 * schedule one last time. The schedule call will never return, and 3194 * the scheduled task must drop that reference. 3195 * 3196 * We must observe prev->state before clearing prev->on_cpu (in 3197 * finish_task), otherwise a concurrent wakeup can get prev 3198 * running on another CPU and we could race with its RUNNING -> DEAD 3199 * transition, resulting in a double drop. 3200 */ 3201 prev_state = prev->state; 3202 vtime_task_switch(prev); 3203 perf_event_task_sched_in(prev, current); 3204 finish_task(prev); 3205 finish_lock_switch(rq); 3206 finish_arch_post_lock_switch(); 3207 kcov_finish_switch(current); 3208 3209 fire_sched_in_preempt_notifiers(current); 3210 /* 3211 * When switching through a kernel thread, the loop in 3212 * membarrier_{private,global}_expedited() may have observed that 3213 * kernel thread and not issued an IPI. It is therefore possible to 3214 * schedule between user->kernel->user threads without passing through 3215 * switch_mm(). Membarrier requires a barrier after storing to 3216 * rq->curr, before returning to userspace, so provide them here: 3217 * 3218 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly 3219 * provided by mmdrop(), 3220 * - a sync_core for SYNC_CORE. 3221 */ 3222 if (mm) { 3223 membarrier_mm_sync_core_before_usermode(mm); 3224 mmdrop(mm); 3225 } 3226 if (unlikely(prev_state == TASK_DEAD)) { 3227 if (prev->sched_class->task_dead) 3228 prev->sched_class->task_dead(prev); 3229 3230 /* 3231 * Remove function-return probe instances associated with this 3232 * task and put them back on the free list. 3233 */ 3234 kprobe_flush_task(prev); 3235 3236 /* Task is done with its stack.
*/ 3237 put_task_stack(prev); 3238 3239 put_task_struct_rcu_user(prev); 3240 } 3241 3242 tick_nohz_task_switch(); 3243 return rq; 3244 } 3245 3246 #ifdef CONFIG_SMP 3247 3248 /* rq->lock is NOT held, but preemption is disabled */ 3249 static void __balance_callback(struct rq *rq) 3250 { 3251 struct callback_head *head, *next; 3252 void (*func)(struct rq *rq); 3253 unsigned long flags; 3254 3255 raw_spin_lock_irqsave(&rq->lock, flags); 3256 head = rq->balance_callback; 3257 rq->balance_callback = NULL; 3258 while (head) { 3259 func = (void (*)(struct rq *))head->func; 3260 next = head->next; 3261 head->next = NULL; 3262 head = next; 3263 3264 func(rq); 3265 } 3266 raw_spin_unlock_irqrestore(&rq->lock, flags); 3267 } 3268 3269 static inline void balance_callback(struct rq *rq) 3270 { 3271 if (unlikely(rq->balance_callback)) 3272 __balance_callback(rq); 3273 } 3274 3275 #else 3276 3277 static inline void balance_callback(struct rq *rq) 3278 { 3279 } 3280 3281 #endif 3282 3283 /** 3284 * schedule_tail - first thing a freshly forked thread must call. 3285 * @prev: the thread we just switched away from. 3286 */ 3287 asmlinkage __visible void schedule_tail(struct task_struct *prev) 3288 __releases(rq->lock) 3289 { 3290 struct rq *rq; 3291 3292 /* 3293 * New tasks start with FORK_PREEMPT_COUNT, see there and 3294 * finish_task_switch() for details. 3295 * 3296 * finish_task_switch() will drop rq->lock() and lower preempt_count 3297 * and the preempt_enable() will end up enabling preemption (on 3298 * PREEMPT_COUNT kernels). 3299 */ 3300 3301 rq = finish_task_switch(prev); 3302 balance_callback(rq); 3303 preempt_enable(); 3304 3305 if (current->set_child_tid) 3306 put_user(task_pid_vnr(current), current->set_child_tid); 3307 3308 calculate_sigpending(); 3309 } 3310 3311 /* 3312 * context_switch - switch to the new MM and the new thread's register state. 3313 */ 3314 static __always_inline struct rq * 3315 context_switch(struct rq *rq, struct task_struct *prev, 3316 struct task_struct *next, struct rq_flags *rf) 3317 { 3318 prepare_task_switch(rq, prev, next); 3319 3320 /* 3321 * For paravirt, this is coupled with an exit in switch_to to 3322 * combine the page table reload and the switch backend into 3323 * one hypercall. 3324 */ 3325 arch_start_context_switch(prev); 3326 3327 /* 3328 * kernel -> kernel lazy + transfer active 3329 * user -> kernel lazy + mmgrab() active 3330 * 3331 * kernel -> user switch + mmdrop() active 3332 * user -> user switch 3333 */ 3334 if (!next->mm) { // to kernel 3335 enter_lazy_tlb(prev->active_mm, next); 3336 3337 next->active_mm = prev->active_mm; 3338 if (prev->mm) // from user 3339 mmgrab(prev->active_mm); 3340 else 3341 prev->active_mm = NULL; 3342 } else { // to user 3343 membarrier_switch_mm(rq, prev->active_mm, next->mm); 3344 /* 3345 * sys_membarrier() requires an smp_mb() between setting 3346 * rq->curr / membarrier_switch_mm() and returning to userspace. 3347 * 3348 * The below provides this either through switch_mm(), or in 3349 * case 'prev->active_mm == next->mm' through 3350 * finish_task_switch()'s mmdrop(). 3351 */ 3352 switch_mm_irqs_off(prev->active_mm, next->mm, next); 3353 3354 if (!prev->mm) { // from kernel 3355 /* will mmdrop() in finish_task_switch(). */ 3356 rq->prev_mm = prev->active_mm; 3357 prev->active_mm = NULL; 3358 } 3359 } 3360 3361 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 3362 3363 prepare_lock_switch(rq, next, rf); 3364 3365 /* Here we just switch the register state and the stack. 
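 *
 * A note on the three-argument form below: switch_to(prev, next, last)
 * resumes 'next' and, when this task is eventually switched back in,
 * stores into 'last' the task that ran immediately before us. Passing
 * 'prev' as 'last' is what lets finish_task_switch(prev) clean up after
 * whichever task we actually came from.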
*/ 3366 switch_to(prev, next, prev); 3367 barrier(); 3368 3369 return finish_task_switch(prev); 3370 } 3371 3372 /* 3373 * nr_running and nr_context_switches: 3374 * 3375 * externally visible scheduler statistics: current number of runnable 3376 * threads, total number of context switches performed since bootup. 3377 */ 3378 unsigned long nr_running(void) 3379 { 3380 unsigned long i, sum = 0; 3381 3382 for_each_online_cpu(i) 3383 sum += cpu_rq(i)->nr_running; 3384 3385 return sum; 3386 } 3387 3388 /* 3389 * Check if only the current task is running on the CPU. 3390 * 3391 * Caution: this function does not check that the caller has disabled 3392 * preemption, thus the result might have a time-of-check-to-time-of-use 3393 * race. The caller is responsible for using it correctly, for example: 3394 * 3395 * - from a non-preemptible section (of course) 3396 * 3397 * - from a thread that is bound to a single CPU 3398 * 3399 * - in a loop with very short iterations (e.g. a polling loop) 3400 */ 3401 bool single_task_running(void) 3402 { 3403 return raw_rq()->nr_running == 1; 3404 } 3405 EXPORT_SYMBOL(single_task_running); 3406 3407 unsigned long long nr_context_switches(void) 3408 { 3409 int i; 3410 unsigned long long sum = 0; 3411 3412 for_each_possible_cpu(i) 3413 sum += cpu_rq(i)->nr_switches; 3414 3415 return sum; 3416 } 3417 3418 /* 3419 * Consumers of these two interfaces, like for example the cpuidle menu 3420 * governor, are using nonsensical data: they prefer shallow idle state selection 3421 * for a CPU that has IO-wait, even though it might not even end up running the task when 3422 * it does become runnable. 3423 */ 3424 3425 unsigned long nr_iowait_cpu(int cpu) 3426 { 3427 return atomic_read(&cpu_rq(cpu)->nr_iowait); 3428 } 3429 3430 /* 3431 * IO-wait accounting, and how it's mostly bollocks (on SMP). 3432 * 3433 * The idea behind IO-wait accounting is to account the idle time that we could 3434 * have spent running if it were not for IO. That is, if we were to improve the 3435 * storage performance, we'd have a proportional reduction in IO-wait time. 3436 * 3437 * This all works nicely on UP, where, when a task blocks on IO, we account 3438 * idle time as IO-wait, because if the storage were faster, it could've been 3439 * running and we'd not be idle. 3440 * 3441 * This has been extended to SMP, by doing the same for each CPU. This however 3442 * is broken. 3443 * 3444 * Imagine for instance the case where two tasks block on one CPU: only that one 3445 * CPU will have IO-wait accounted, while the other has regular idle, even 3446 * though, if the storage were faster, both could've run at the same time, 3447 * utilising both CPUs. 3448 * 3449 * This means that, when looking globally, the current IO-wait accounting on 3450 * SMP is a lower bound, due to under-accounting. 3451 * 3452 * Worse, since the numbers are provided per CPU, they are sometimes 3453 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly 3454 * associated with any one particular CPU, it can wake up on a different CPU than 3455 * the one it blocked on. This means the per CPU IO-wait number is meaningless. 3456 * 3457 * Task CPU affinities can make all that even more 'interesting'.
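 *
 * To put rough numbers on the two-task example above (illustrative
 * only): if both tasks sit blocked behind CPU0 for one second, the
 * per-CPU view reports 1s of iowait on CPU0 and 1s of plain idle on
 * CPU1, so the global total is 1s of iowait even though two
 * task-seconds were stalled on IO.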
3458 */ 3459 3460 unsigned long nr_iowait(void) 3461 { 3462 unsigned long i, sum = 0; 3463 3464 for_each_possible_cpu(i) 3465 sum += nr_iowait_cpu(i); 3466 3467 return sum; 3468 } 3469 3470 #ifdef CONFIG_SMP 3471 3472 /* 3473 * sched_exec - execve() is a valuable balancing opportunity, because at 3474 * this point the task has the smallest effective memory and cache footprint. 3475 */ 3476 void sched_exec(void) 3477 { 3478 struct task_struct *p = current; 3479 unsigned long flags; 3480 int dest_cpu; 3481 3482 raw_spin_lock_irqsave(&p->pi_lock, flags); 3483 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 3484 if (dest_cpu == smp_processor_id()) 3485 goto unlock; 3486 3487 if (likely(cpu_active(dest_cpu))) { 3488 struct migration_arg arg = { p, dest_cpu }; 3489 3490 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3491 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3492 return; 3493 } 3494 unlock: 3495 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3496 } 3497 3498 #endif 3499 3500 DEFINE_PER_CPU(struct kernel_stat, kstat); 3501 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 3502 3503 EXPORT_PER_CPU_SYMBOL(kstat); 3504 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 3505 3506 /* 3507 * The function fair_sched_class.update_curr accesses the struct curr 3508 * and its field curr->exec_start; when called from task_sched_runtime(), 3509 * we observe a high rate of cache misses in practice. 3510 * Prefetching this data results in improved performance. 3511 */ 3512 static inline void prefetch_curr_exec_start(struct task_struct *p) 3513 { 3514 #ifdef CONFIG_FAIR_GROUP_SCHED 3515 struct sched_entity *curr = (&p->se)->cfs_rq->curr; 3516 #else 3517 struct sched_entity *curr = (&task_rq(p)->cfs)->curr; 3518 #endif 3519 prefetch(curr); 3520 prefetch(&curr->exec_start); 3521 } 3522 3523 /* 3524 * Return accounted runtime for the task. 3525 * In case the task is currently running, return the runtime plus current's 3526 * pending runtime that has not been accounted yet. 3527 */ 3528 unsigned long long task_sched_runtime(struct task_struct *p) 3529 { 3530 struct rq_flags rf; 3531 struct rq *rq; 3532 u64 ns; 3533 3534 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3535 /* 3536 * 64-bit doesn't need locks to atomically read a 64-bit value. 3537 * So we have an optimization chance when the task's delta_exec is 0. 3538 * Reading ->on_cpu is racy, but this is ok. 3539 * 3540 * If we race with it leaving CPU, we'll take a lock. So we're correct. 3541 * If we race with it entering CPU, unaccounted time is 0. This is 3542 * indistinguishable from the read occurring a few cycles earlier. 3543 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 3544 * been accounted, so we're correct here as well. 3545 */ 3546 if (!p->on_cpu || !task_on_rq_queued(p)) 3547 return p->se.sum_exec_runtime; 3548 #endif 3549 3550 rq = task_rq_lock(p, &rf); 3551 /* 3552 * Must be ->curr _and_ ->on_rq. If dequeued, we would 3553 * project cycles that may never be accounted to this 3554 * thread, breaking clock_gettime().
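 *
 * One place this value surfaces to userspace is
 * clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...), by way of the
 * posix-cpu-timers code. A minimal userspace sketch (hedged, for
 * illustration only):
 *
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct timespec ts;
 *
 *		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
 *		printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}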
3555 */ 3556 if (task_current(rq, p) && task_on_rq_queued(p)) { 3557 prefetch_curr_exec_start(p); 3558 update_rq_clock(rq); 3559 p->sched_class->update_curr(rq); 3560 } 3561 ns = p->se.sum_exec_runtime; 3562 task_rq_unlock(rq, p, &rf); 3563 3564 return ns; 3565 } 3566 3567 DEFINE_PER_CPU(unsigned long, thermal_pressure); 3568 3569 void arch_set_thermal_pressure(struct cpumask *cpus, 3570 unsigned long th_pressure) 3571 { 3572 int cpu; 3573 3574 for_each_cpu(cpu, cpus) 3575 WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); 3576 } 3577 3578 /* 3579 * This function gets called by the timer code, with HZ frequency. 3580 * We call it with interrupts disabled. 3581 */ 3582 void scheduler_tick(void) 3583 { 3584 int cpu = smp_processor_id(); 3585 struct rq *rq = cpu_rq(cpu); 3586 struct task_struct *curr = rq->curr; 3587 struct rq_flags rf; 3588 unsigned long thermal_pressure; 3589 3590 arch_scale_freq_tick(); 3591 sched_clock_tick(); 3592 3593 rq_lock(rq, &rf); 3594 3595 update_rq_clock(rq); 3596 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); 3597 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); 3598 curr->sched_class->task_tick(rq, curr, 0); 3599 calc_global_load_tick(rq); 3600 psi_task_tick(rq); 3601 3602 rq_unlock(rq, &rf); 3603 3604 perf_event_task_tick(); 3605 3606 #ifdef CONFIG_SMP 3607 rq->idle_balance = idle_cpu(cpu); 3608 trigger_load_balance(rq); 3609 #endif 3610 } 3611 3612 #ifdef CONFIG_NO_HZ_FULL 3613 3614 struct tick_work { 3615 int cpu; 3616 atomic_t state; 3617 struct delayed_work work; 3618 }; 3619 /* Values for ->state, see diagram below. */ 3620 #define TICK_SCHED_REMOTE_OFFLINE 0 3621 #define TICK_SCHED_REMOTE_OFFLINING 1 3622 #define TICK_SCHED_REMOTE_RUNNING 2 3623 3624 /* 3625 * State diagram for ->state: 3626 * 3627 * 3628 * TICK_SCHED_REMOTE_OFFLINE 3629 * | ^ 3630 * | | 3631 * | | sched_tick_remote() 3632 * | | 3633 * | | 3634 * +--TICK_SCHED_REMOTE_OFFLINING 3635 * | ^ 3636 * | | 3637 * sched_tick_start() | | sched_tick_stop() 3638 * | | 3639 * V | 3640 * TICK_SCHED_REMOTE_RUNNING 3641 * 3642 * 3643 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() 3644 * and sched_tick_start() are happy to leave the state in RUNNING. 3645 */ 3646 3647 static struct tick_work __percpu *tick_work_cpu; 3648 3649 static void sched_tick_remote(struct work_struct *work) 3650 { 3651 struct delayed_work *dwork = to_delayed_work(work); 3652 struct tick_work *twork = container_of(dwork, struct tick_work, work); 3653 int cpu = twork->cpu; 3654 struct rq *rq = cpu_rq(cpu); 3655 struct task_struct *curr; 3656 struct rq_flags rf; 3657 u64 delta; 3658 int os; 3659 3660 /* 3661 * Handle the tick only if it appears the remote CPU is running in full 3662 * dynticks mode. The check is racy by nature, but missing a tick or 3663 * having one too much is no big deal because the scheduler tick updates 3664 * statistics and checks timeslices in a time-independent way, regardless 3665 * of when exactly it is running. 3666 */ 3667 if (!tick_nohz_tick_stopped_cpu(cpu)) 3668 goto out_requeue; 3669 3670 rq_lock_irq(rq, &rf); 3671 curr = rq->curr; 3672 if (cpu_is_offline(cpu)) 3673 goto out_unlock; 3674 3675 update_rq_clock(rq); 3676 3677 if (!is_idle_task(curr)) { 3678 /* 3679 * Make sure the next tick runs within a reasonable 3680 * amount of time. 
3681 */ 3682 delta = rq_clock_task(rq) - curr->se.exec_start; 3683 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); 3684 } 3685 curr->sched_class->task_tick(rq, curr, 0); 3686 3687 calc_load_nohz_remote(rq); 3688 out_unlock: 3689 rq_unlock_irq(rq, &rf); 3690 out_requeue: 3691 3692 /* 3693 * Run the remote tick once per second (1Hz). This arbitrary 3694 * frequency is large enough to avoid overload but short enough 3695 * to keep scheduler internal stats reasonably up to date. But 3696 * first update state to reflect hotplug activity if required. 3697 */ 3698 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); 3699 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); 3700 if (os == TICK_SCHED_REMOTE_RUNNING) 3701 queue_delayed_work(system_unbound_wq, dwork, HZ); 3702 } 3703 3704 static void sched_tick_start(int cpu) 3705 { 3706 int os; 3707 struct tick_work *twork; 3708 3709 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) 3710 return; 3711 3712 WARN_ON_ONCE(!tick_work_cpu); 3713 3714 twork = per_cpu_ptr(tick_work_cpu, cpu); 3715 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); 3716 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); 3717 if (os == TICK_SCHED_REMOTE_OFFLINE) { 3718 twork->cpu = cpu; 3719 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); 3720 queue_delayed_work(system_unbound_wq, &twork->work, HZ); 3721 } 3722 } 3723 3724 #ifdef CONFIG_HOTPLUG_CPU 3725 static void sched_tick_stop(int cpu) 3726 { 3727 struct tick_work *twork; 3728 int os; 3729 3730 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) 3731 return; 3732 3733 WARN_ON_ONCE(!tick_work_cpu); 3734 3735 twork = per_cpu_ptr(tick_work_cpu, cpu); 3736 /* There cannot be competing actions, but don't rely on stop-machine. */ 3737 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); 3738 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); 3739 /* Don't cancel, as this would mess up the state machine. */ 3740 } 3741 #endif /* CONFIG_HOTPLUG_CPU */ 3742 3743 int __init sched_tick_offload_init(void) 3744 { 3745 tick_work_cpu = alloc_percpu(struct tick_work); 3746 BUG_ON(!tick_work_cpu); 3747 return 0; 3748 } 3749 3750 #else /* !CONFIG_NO_HZ_FULL */ 3751 static inline void sched_tick_start(int cpu) { } 3752 static inline void sched_tick_stop(int cpu) { } 3753 #endif 3754 3755 #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3756 defined(CONFIG_TRACE_PREEMPT_TOGGLE)) 3757 /* 3758 * If the value passed in is equal to the current preempt count 3759 * then we just disabled preemption. Start timing the latency. 3760 */ 3761 static inline void preempt_latency_start(int val) 3762 { 3763 if (preempt_count() == val) { 3764 unsigned long ip = get_lock_parent_ip(); 3765 #ifdef CONFIG_DEBUG_PREEMPT 3766 current->preempt_disable_ip = ip; 3767 #endif 3768 trace_preempt_off(CALLER_ADDR0, ip); 3769 } 3770 } 3771 3772 void preempt_count_add(int val) 3773 { 3774 #ifdef CONFIG_DEBUG_PREEMPT 3775 /* 3776 * Underflow? 3777 */ 3778 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3779 return; 3780 #endif 3781 __preempt_count_add(val); 3782 #ifdef CONFIG_DEBUG_PREEMPT 3783 /* 3784 * Spinlock count overflowing soon? 3785 */ 3786 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3787 PREEMPT_MASK - 10); 3788 #endif 3789 preempt_latency_start(val); 3790 } 3791 EXPORT_SYMBOL(preempt_count_add); 3792 NOKPROBE_SYMBOL(preempt_count_add); 3793 3794 /* 3795 * If the value passed in equals to the current preempt count 3796 * then we just enabled preemption. Stop timing the latency. 
3797 */ 3798 static inline void preempt_latency_stop(int val) 3799 { 3800 if (preempt_count() == val) 3801 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); 3802 } 3803 3804 void preempt_count_sub(int val) 3805 { 3806 #ifdef CONFIG_DEBUG_PREEMPT 3807 /* 3808 * Underflow? 3809 */ 3810 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3811 return; 3812 /* 3813 * Is the spinlock portion underflowing? 3814 */ 3815 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3816 !(preempt_count() & PREEMPT_MASK))) 3817 return; 3818 #endif 3819 3820 preempt_latency_stop(val); 3821 __preempt_count_sub(val); 3822 } 3823 EXPORT_SYMBOL(preempt_count_sub); 3824 NOKPROBE_SYMBOL(preempt_count_sub); 3825 3826 #else 3827 static inline void preempt_latency_start(int val) { } 3828 static inline void preempt_latency_stop(int val) { } 3829 #endif 3830 3831 static inline unsigned long get_preempt_disable_ip(struct task_struct *p) 3832 { 3833 #ifdef CONFIG_DEBUG_PREEMPT 3834 return p->preempt_disable_ip; 3835 #else 3836 return 0; 3837 #endif 3838 } 3839 3840 /* 3841 * Print scheduling while atomic bug: 3842 */ 3843 static noinline void __schedule_bug(struct task_struct *prev) 3844 { 3845 /* Save this before calling printk(), since that will clobber it */ 3846 unsigned long preempt_disable_ip = get_preempt_disable_ip(current); 3847 3848 if (oops_in_progress) 3849 return; 3850 3851 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3852 prev->comm, prev->pid, preempt_count()); 3853 3854 debug_show_held_locks(prev); 3855 print_modules(); 3856 if (irqs_disabled()) 3857 print_irqtrace_events(prev); 3858 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) 3859 && in_atomic_preempt_off()) { 3860 pr_err("Preemption disabled at:"); 3861 print_ip_sym(preempt_disable_ip); 3862 pr_cont("\n"); 3863 } 3864 if (panic_on_warn) 3865 panic("scheduling while atomic\n"); 3866 3867 dump_stack(); 3868 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3869 } 3870 3871 /* 3872 * Various schedule()-time debugging checks and statistics: 3873 */ 3874 static inline void schedule_debug(struct task_struct *prev, bool preempt) 3875 { 3876 #ifdef CONFIG_SCHED_STACK_END_CHECK 3877 if (task_stack_end_corrupted(prev)) 3878 panic("corrupted stack end detected inside scheduler\n"); 3879 #endif 3880 3881 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 3882 if (!preempt && prev->state && prev->non_block_count) { 3883 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", 3884 prev->comm, prev->pid, prev->non_block_count); 3885 dump_stack(); 3886 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3887 } 3888 #endif 3889 3890 if (unlikely(in_atomic_preempt_off())) { 3891 __schedule_bug(prev); 3892 preempt_count_set(PREEMPT_DISABLED); 3893 } 3894 rcu_sleep_check(); 3895 3896 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3897 3898 schedstat_inc(this_rq()->sched_count); 3899 } 3900 3901 /* 3902 * Pick up the highest-prio task: 3903 */ 3904 static inline struct task_struct * 3905 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 3906 { 3907 const struct sched_class *class; 3908 struct task_struct *p; 3909 3910 /* 3911 * Optimization: we know that if all tasks are in the fair class we can 3912 * call that function directly, but only if the @prev task wasn't of a 3913 * higher scheduling class, because otherwise those loose the 3914 * opportunity to pull in more work from other CPUs. 
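 *
 * Concretely (an illustrative case): if @prev was an RT task that is
 * about to sleep, the RT class's balance pass may still be able to pull
 * a queued RT task over from another CPU. The fair-only fast path below
 * would skip that pull, which is why it is only taken when @prev itself
 * was in the fair or idle class.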
3915 */ 3916 if (likely((prev->sched_class == &idle_sched_class || 3917 prev->sched_class == &fair_sched_class) && 3918 rq->nr_running == rq->cfs.h_nr_running)) { 3919 3920 p = pick_next_task_fair(rq, prev, rf); 3921 if (unlikely(p == RETRY_TASK)) 3922 goto restart; 3923 3924 /* Assumes fair_sched_class->next == idle_sched_class */ 3925 if (!p) { 3926 put_prev_task(rq, prev); 3927 p = pick_next_task_idle(rq); 3928 } 3929 3930 return p; 3931 } 3932 3933 restart: 3934 #ifdef CONFIG_SMP 3935 /* 3936 * We must do the balancing pass before put_next_task(), such 3937 * that when we release the rq->lock the task is in the same 3938 * state as before we took rq->lock. 3939 * 3940 * We can terminate the balance pass as soon as we know there is 3941 * a runnable task of @class priority or higher. 3942 */ 3943 for_class_range(class, prev->sched_class, &idle_sched_class) { 3944 if (class->balance(rq, prev, rf)) 3945 break; 3946 } 3947 #endif 3948 3949 put_prev_task(rq, prev); 3950 3951 for_each_class(class) { 3952 p = class->pick_next_task(rq); 3953 if (p) 3954 return p; 3955 } 3956 3957 /* The idle class should always have a runnable task: */ 3958 BUG(); 3959 } 3960 3961 /* 3962 * __schedule() is the main scheduler function. 3963 * 3964 * The main means of driving the scheduler and thus entering this function are: 3965 * 3966 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 3967 * 3968 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 3969 * paths. For example, see arch/x86/entry_64.S. 3970 * 3971 * To drive preemption between tasks, the scheduler sets the flag in timer 3972 * interrupt handler scheduler_tick(). 3973 * 3974 * 3. Wakeups don't really cause entry into schedule(). They add a 3975 * task to the run-queue and that's it. 3976 * 3977 * Now, if the new task added to the run-queue preempts the current 3978 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 3979 * called on the nearest possible occasion: 3980 * 3981 * - If the kernel is preemptible (CONFIG_PREEMPTION=y): 3982 * 3983 * - in syscall or exception context, at the next outmost 3984 * preempt_enable(). (this might be as soon as the wake_up()'s 3985 * spin_unlock()!) 3986 * 3987 * - in IRQ context, return from interrupt-handler to 3988 * preemptible context 3989 * 3990 * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) 3991 * then at the next: 3992 * 3993 * - cond_resched() call 3994 * - explicit schedule() call 3995 * - return from syscall or exception to user-space 3996 * - return from interrupt-handler to user-space 3997 * 3998 * WARNING: must be called with preemption disabled! 3999 */ 4000 static void __sched notrace __schedule(bool preempt) 4001 { 4002 struct task_struct *prev, *next; 4003 unsigned long *switch_count; 4004 struct rq_flags rf; 4005 struct rq *rq; 4006 int cpu; 4007 4008 cpu = smp_processor_id(); 4009 rq = cpu_rq(cpu); 4010 prev = rq->curr; 4011 4012 schedule_debug(prev, preempt); 4013 4014 if (sched_feat(HRTICK)) 4015 hrtick_clear(rq); 4016 4017 local_irq_disable(); 4018 rcu_note_context_switch(preempt); 4019 4020 /* 4021 * Make sure that signal_pending_state()->signal_pending() below 4022 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 4023 * done by the caller to avoid the race with signal_wake_up(). 4024 * 4025 * The membarrier system call requires a full memory barrier 4026 * after coming from user-space, before storing to rq->curr. 
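 *
 * The signal race mentioned above, schematically (hedged sketch):
 *
 *	sleeper					waker
 *	__set_current_state(TASK_INTERRUPTIBLE)
 *						set TIF_SIGPENDING
 *						signal_wake_up() -> loads p->state
 *	rq_lock() + smp_mb__after_spinlock()
 *	signal_pending_state(prev->state, prev)
 *
 * With full barriers on both sides, either the waker observes our
 * sleeping state and wakes us, or we observe the pending signal and
 * stay TASK_RUNNING; both sides missing each other is what must not
 * happen.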
4027 */ 4028 rq_lock(rq, &rf); 4029 smp_mb__after_spinlock(); 4030 4031 /* Promote REQ to ACT */ 4032 rq->clock_update_flags <<= 1; 4033 update_rq_clock(rq); 4034 4035 switch_count = &prev->nivcsw; 4036 if (!preempt && prev->state) { 4037 if (signal_pending_state(prev->state, prev)) { 4038 prev->state = TASK_RUNNING; 4039 } else { 4040 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); 4041 4042 if (prev->in_iowait) { 4043 atomic_inc(&rq->nr_iowait); 4044 delayacct_blkio_start(); 4045 } 4046 } 4047 switch_count = &prev->nvcsw; 4048 } 4049 4050 next = pick_next_task(rq, prev, &rf); 4051 clear_tsk_need_resched(prev); 4052 clear_preempt_need_resched(); 4053 4054 if (likely(prev != next)) { 4055 rq->nr_switches++; 4056 /* 4057 * RCU users of rcu_dereference(rq->curr) may not see 4058 * changes to task_struct made by pick_next_task(). 4059 */ 4060 RCU_INIT_POINTER(rq->curr, next); 4061 /* 4062 * The membarrier system call requires each architecture 4063 * to have a full memory barrier after updating 4064 * rq->curr, before returning to user-space. 4065 * 4066 * Here are the schemes providing that barrier on the 4067 * various architectures: 4068 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. 4069 * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 4070 * - finish_lock_switch() for weakly-ordered 4071 * architectures where spin_unlock is a full barrier, 4072 * - switch_to() for arm64 (weakly-ordered, spin_unlock 4073 * is a RELEASE barrier), 4074 */ 4075 ++*switch_count; 4076 4077 psi_sched_switch(prev, next, !task_on_rq_queued(prev)); 4078 4079 trace_sched_switch(preempt, prev, next); 4080 4081 /* Also unlocks the rq: */ 4082 rq = context_switch(rq, prev, next, &rf); 4083 } else { 4084 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 4085 rq_unlock_irq(rq, &rf); 4086 } 4087 4088 balance_callback(rq); 4089 } 4090 4091 void __noreturn do_task_dead(void) 4092 { 4093 /* Causes final put_task_struct in finish_task_switch(): */ 4094 set_special_state(TASK_DEAD); 4095 4096 /* Tell freezer to ignore us: */ 4097 current->flags |= PF_NOFREEZE; 4098 4099 __schedule(false); 4100 BUG(); 4101 4102 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ 4103 for (;;) 4104 cpu_relax(); 4105 } 4106 4107 static inline void sched_submit_work(struct task_struct *tsk) 4108 { 4109 if (!tsk->state) 4110 return; 4111 4112 /* 4113 * If a worker went to sleep, notify and ask workqueue whether 4114 * it wants to wake up a task to maintain concurrency. 4115 * As this function is called inside the schedule() context, 4116 * we disable preemption to avoid it calling schedule() again 4117 * in the possible wakeup of a kworker and because wq_worker_sleeping() 4118 * requires it. 4119 */ 4120 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { 4121 preempt_disable(); 4122 if (tsk->flags & PF_WQ_WORKER) 4123 wq_worker_sleeping(tsk); 4124 else 4125 io_wq_worker_sleeping(tsk); 4126 preempt_enable_no_resched(); 4127 } 4128 4129 if (tsk_is_pi_blocked(tsk)) 4130 return; 4131 4132 /* 4133 * If we are going to sleep and we have plugged IO queued, 4134 * make sure to submit it to avoid deadlocks. 
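 * (Otherwise I/O that was queued on the plug might never be issued, and
 * whoever is waiting for it - possibly this very task - could block
 * forever.)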
4135 */ 4136 if (blk_needs_flush_plug(tsk)) 4137 blk_schedule_flush_plug(tsk); 4138 } 4139 4140 static void sched_update_worker(struct task_struct *tsk) 4141 { 4142 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { 4143 if (tsk->flags & PF_WQ_WORKER) 4144 wq_worker_running(tsk); 4145 else 4146 io_wq_worker_running(tsk); 4147 } 4148 } 4149 4150 asmlinkage __visible void __sched schedule(void) 4151 { 4152 struct task_struct *tsk = current; 4153 4154 sched_submit_work(tsk); 4155 do { 4156 preempt_disable(); 4157 __schedule(false); 4158 sched_preempt_enable_no_resched(); 4159 } while (need_resched()); 4160 sched_update_worker(tsk); 4161 } 4162 EXPORT_SYMBOL(schedule); 4163 4164 /* 4165 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted 4166 * state (have scheduled out non-voluntarily) by making sure that all 4167 * tasks have either left the run queue or have gone into user space. 4168 * As idle tasks do not do either, they must not ever be preempted 4169 * (schedule out non-voluntarily). 4170 * 4171 * schedule_idle() is similar to schedule_preempt_disable() except that it 4172 * never enables preemption because it does not call sched_submit_work(). 4173 */ 4174 void __sched schedule_idle(void) 4175 { 4176 /* 4177 * As this skips calling sched_submit_work(), which the idle task does 4178 * regardless because that function is a nop when the task is in a 4179 * TASK_RUNNING state, make sure this isn't used someplace that the 4180 * current task can be in any other state. Note, idle is always in the 4181 * TASK_RUNNING state. 4182 */ 4183 WARN_ON_ONCE(current->state); 4184 do { 4185 __schedule(false); 4186 } while (need_resched()); 4187 } 4188 4189 #ifdef CONFIG_CONTEXT_TRACKING 4190 asmlinkage __visible void __sched schedule_user(void) 4191 { 4192 /* 4193 * If we come here after a random call to set_need_resched(), 4194 * or we have been woken up remotely but the IPI has not yet arrived, 4195 * we haven't yet exited the RCU idle mode. Do it here manually until 4196 * we find a better solution. 4197 * 4198 * NB: There are buggy callers of this function. Ideally we 4199 * should warn if prev_state != CONTEXT_USER, but that will trigger 4200 * too frequently to make sense yet. 4201 */ 4202 enum ctx_state prev_state = exception_enter(); 4203 schedule(); 4204 exception_exit(prev_state); 4205 } 4206 #endif 4207 4208 /** 4209 * schedule_preempt_disabled - called with preemption disabled 4210 * 4211 * Returns with preemption disabled. Note: preempt_count must be 1 4212 */ 4213 void __sched schedule_preempt_disabled(void) 4214 { 4215 sched_preempt_enable_no_resched(); 4216 schedule(); 4217 preempt_disable(); 4218 } 4219 4220 static void __sched notrace preempt_schedule_common(void) 4221 { 4222 do { 4223 /* 4224 * Because the function tracer can trace preempt_count_sub() 4225 * and it also uses preempt_enable/disable_notrace(), if 4226 * NEED_RESCHED is set, the preempt_enable_notrace() called 4227 * by the function tracer will call this function again and 4228 * cause infinite recursion. 4229 * 4230 * Preemption must be disabled here before the function 4231 * tracer can trace. Break up preempt_disable() into two 4232 * calls. One to disable preemption without fear of being 4233 * traced. The other to still record the preemption latency, 4234 * which can also be traced by the function tracer. 
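 *
 * In other words, the preempt_disable_notrace() + preempt_latency_start(1)
 * pair below is meant to be equivalent to a plain preempt_disable(), with
 * only the latency bookkeeping half visible to the tracer.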
4235 */ 4236 preempt_disable_notrace(); 4237 preempt_latency_start(1); 4238 __schedule(true); 4239 preempt_latency_stop(1); 4240 preempt_enable_no_resched_notrace(); 4241 4242 /* 4243 * Check again in case we missed a preemption opportunity 4244 * between schedule and now. 4245 */ 4246 } while (need_resched()); 4247 } 4248 4249 #ifdef CONFIG_PREEMPTION 4250 /* 4251 * This is the entry point to schedule() from in-kernel preemption 4252 * off of preempt_enable. 4253 */ 4254 asmlinkage __visible void __sched notrace preempt_schedule(void) 4255 { 4256 /* 4257 * If there is a non-zero preempt_count or interrupts are disabled, 4258 * we do not want to preempt the current task. Just return.. 4259 */ 4260 if (likely(!preemptible())) 4261 return; 4262 4263 preempt_schedule_common(); 4264 } 4265 NOKPROBE_SYMBOL(preempt_schedule); 4266 EXPORT_SYMBOL(preempt_schedule); 4267 4268 /** 4269 * preempt_schedule_notrace - preempt_schedule called by tracing 4270 * 4271 * The tracing infrastructure uses preempt_enable_notrace to prevent 4272 * recursion and tracing preempt enabling caused by the tracing 4273 * infrastructure itself. But as tracing can happen in areas coming 4274 * from userspace or just about to enter userspace, a preempt enable 4275 * can occur before user_exit() is called. This will cause the scheduler 4276 * to be called when the system is still in usermode. 4277 * 4278 * To prevent this, the preempt_enable_notrace will use this function 4279 * instead of preempt_schedule() to exit user context if needed before 4280 * calling the scheduler. 4281 */ 4282 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 4283 { 4284 enum ctx_state prev_ctx; 4285 4286 if (likely(!preemptible())) 4287 return; 4288 4289 do { 4290 /* 4291 * Because the function tracer can trace preempt_count_sub() 4292 * and it also uses preempt_enable/disable_notrace(), if 4293 * NEED_RESCHED is set, the preempt_enable_notrace() called 4294 * by the function tracer will call this function again and 4295 * cause infinite recursion. 4296 * 4297 * Preemption must be disabled here before the function 4298 * tracer can trace. Break up preempt_disable() into two 4299 * calls. One to disable preemption without fear of being 4300 * traced. The other to still record the preemption latency, 4301 * which can also be traced by the function tracer. 4302 */ 4303 preempt_disable_notrace(); 4304 preempt_latency_start(1); 4305 /* 4306 * Needs preempt disabled in case user_exit() is traced 4307 * and the tracer calls preempt_enable_notrace() causing 4308 * an infinite recursion. 4309 */ 4310 prev_ctx = exception_enter(); 4311 __schedule(true); 4312 exception_exit(prev_ctx); 4313 4314 preempt_latency_stop(1); 4315 preempt_enable_no_resched_notrace(); 4316 } while (need_resched()); 4317 } 4318 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 4319 4320 #endif /* CONFIG_PREEMPTION */ 4321 4322 /* 4323 * This is the entry point to schedule() from kernel preemption 4324 * off of irq context. 4325 * Note, that this is called and return with irqs disabled. This will 4326 * protect us against recursive calling from irq. 
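 *
 * It is typically reached from the architecture's interrupt-return path
 * (see the TIF_NEED_RESCHED handling mentioned above __schedule()) once
 * the preempt count has dropped back to zero.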
4327 */ 4328 asmlinkage __visible void __sched preempt_schedule_irq(void) 4329 { 4330 enum ctx_state prev_state; 4331 4332 /* Catch callers which need to be fixed */ 4333 BUG_ON(preempt_count() || !irqs_disabled()); 4334 4335 prev_state = exception_enter(); 4336 4337 do { 4338 preempt_disable(); 4339 local_irq_enable(); 4340 __schedule(true); 4341 local_irq_disable(); 4342 sched_preempt_enable_no_resched(); 4343 } while (need_resched()); 4344 4345 exception_exit(prev_state); 4346 } 4347 4348 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, 4349 void *key) 4350 { 4351 return try_to_wake_up(curr->private, mode, wake_flags); 4352 } 4353 EXPORT_SYMBOL(default_wake_function); 4354 4355 #ifdef CONFIG_RT_MUTEXES 4356 4357 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) 4358 { 4359 if (pi_task) 4360 prio = min(prio, pi_task->prio); 4361 4362 return prio; 4363 } 4364 4365 static inline int rt_effective_prio(struct task_struct *p, int prio) 4366 { 4367 struct task_struct *pi_task = rt_mutex_get_top_task(p); 4368 4369 return __rt_effective_prio(pi_task, prio); 4370 } 4371 4372 /* 4373 * rt_mutex_setprio - set the current priority of a task 4374 * @p: task to boost 4375 * @pi_task: donor task 4376 * 4377 * This function changes the 'effective' priority of a task. It does 4378 * not touch ->normal_prio like __setscheduler(). 4379 * 4380 * Used by the rt_mutex code to implement priority inheritance 4381 * logic. The call site only calls this if the priority of the task changed. 4382 */ 4383 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) 4384 { 4385 int prio, oldprio, queued, running, queue_flag = 4386 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 4387 const struct sched_class *prev_class; 4388 struct rq_flags rf; 4389 struct rq *rq; 4390 4391 /* XXX used to be waiter->prio, not waiter->task->prio */ 4392 prio = __rt_effective_prio(pi_task, p->normal_prio); 4393 4394 /* 4395 * If nothing changed, bail early. 4396 */ 4397 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) 4398 return; 4399 4400 rq = __task_rq_lock(p, &rf); 4401 update_rq_clock(rq); 4402 /* 4403 * Set under pi_lock && rq->lock, such that the value can be used under 4404 * either lock. 4405 * 4406 * Note that there is a lot of trickiness in making this pointer cache work 4407 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to 4408 * ensure a task is de-boosted (pi_task is set to NULL) before the 4409 * task is allowed to run again (and can exit). This ensures the pointer 4410 * points to a blocked task -- which guarantees the task is present. 4411 */ 4412 p->pi_top_task = pi_task; 4413 4414 /* 4415 * For FIFO/RR we only need to set prio; if that matches we're done. 4416 */ 4417 if (prio == p->prio && !dl_prio(prio)) 4418 goto out_unlock; 4419 4420 /* 4421 * Idle task boosting is a no-no in general. There is one 4422 * exception, when PREEMPT_RT and NOHZ are active: 4423 * 4424 * The idle task calls get_next_timer_interrupt() and holds 4425 * the timer wheel base->lock on the CPU and another CPU wants 4426 * to access the timer (probably to cancel it). We can safely 4427 * ignore the boosting request, as the idle CPU runs this code 4428 * with interrupts disabled and will complete the lock 4429 * protected section without being interrupted. So there is no 4430 * real need to boost.
4431 */ 4432 if (unlikely(p == rq->idle)) { 4433 WARN_ON(p != rq->curr); 4434 WARN_ON(p->pi_blocked_on); 4435 goto out_unlock; 4436 } 4437 4438 trace_sched_pi_setprio(p, pi_task); 4439 oldprio = p->prio; 4440 4441 if (oldprio == prio) 4442 queue_flag &= ~DEQUEUE_MOVE; 4443 4444 prev_class = p->sched_class; 4445 queued = task_on_rq_queued(p); 4446 running = task_current(rq, p); 4447 if (queued) 4448 dequeue_task(rq, p, queue_flag); 4449 if (running) 4450 put_prev_task(rq, p); 4451 4452 /* 4453 * Boosting conditions are: 4454 * 1. -rt task is running and holds mutex A 4455 * --> -dl task blocks on mutex A 4456 * 4457 * 2. -dl task is running and holds mutex A 4458 * --> -dl task blocks on mutex A and could preempt the 4459 * running task 4460 */ 4461 if (dl_prio(prio)) { 4462 if (!dl_prio(p->normal_prio) || 4463 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 4464 p->dl.dl_boosted = 1; 4465 queue_flag |= ENQUEUE_REPLENISH; 4466 } else 4467 p->dl.dl_boosted = 0; 4468 p->sched_class = &dl_sched_class; 4469 } else if (rt_prio(prio)) { 4470 if (dl_prio(oldprio)) 4471 p->dl.dl_boosted = 0; 4472 if (oldprio < prio) 4473 queue_flag |= ENQUEUE_HEAD; 4474 p->sched_class = &rt_sched_class; 4475 } else { 4476 if (dl_prio(oldprio)) 4477 p->dl.dl_boosted = 0; 4478 if (rt_prio(oldprio)) 4479 p->rt.timeout = 0; 4480 p->sched_class = &fair_sched_class; 4481 } 4482 4483 p->prio = prio; 4484 4485 if (queued) 4486 enqueue_task(rq, p, queue_flag); 4487 if (running) 4488 set_next_task(rq, p); 4489 4490 check_class_changed(rq, p, prev_class, oldprio); 4491 out_unlock: 4492 /* Avoid rq from going away on us: */ 4493 preempt_disable(); 4494 __task_rq_unlock(rq, &rf); 4495 4496 balance_callback(rq); 4497 preempt_enable(); 4498 } 4499 #else 4500 static inline int rt_effective_prio(struct task_struct *p, int prio) 4501 { 4502 return prio; 4503 } 4504 #endif 4505 4506 void set_user_nice(struct task_struct *p, long nice) 4507 { 4508 bool queued, running; 4509 int old_prio; 4510 struct rq_flags rf; 4511 struct rq *rq; 4512 4513 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 4514 return; 4515 /* 4516 * We have to be careful: if called from sys_setpriority(), 4517 * the task might be in the middle of scheduling on another CPU.
4518 */ 4519 rq = task_rq_lock(p, &rf); 4520 update_rq_clock(rq); 4521 4522 /* 4523 * The RT priorities are set via sched_setscheduler(), but we still 4524 * allow the 'normal' nice value to be set - but as expected 4525 * it wont have any effect on scheduling until the task is 4526 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 4527 */ 4528 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 4529 p->static_prio = NICE_TO_PRIO(nice); 4530 goto out_unlock; 4531 } 4532 queued = task_on_rq_queued(p); 4533 running = task_current(rq, p); 4534 if (queued) 4535 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 4536 if (running) 4537 put_prev_task(rq, p); 4538 4539 p->static_prio = NICE_TO_PRIO(nice); 4540 set_load_weight(p, true); 4541 old_prio = p->prio; 4542 p->prio = effective_prio(p); 4543 4544 if (queued) 4545 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 4546 if (running) 4547 set_next_task(rq, p); 4548 4549 /* 4550 * If the task increased its priority or is running and 4551 * lowered its priority, then reschedule its CPU: 4552 */ 4553 p->sched_class->prio_changed(rq, p, old_prio); 4554 4555 out_unlock: 4556 task_rq_unlock(rq, p, &rf); 4557 } 4558 EXPORT_SYMBOL(set_user_nice); 4559 4560 /* 4561 * can_nice - check if a task can reduce its nice value 4562 * @p: task 4563 * @nice: nice value 4564 */ 4565 int can_nice(const struct task_struct *p, const int nice) 4566 { 4567 /* Convert nice value [19,-20] to rlimit style value [1,40]: */ 4568 int nice_rlim = nice_to_rlimit(nice); 4569 4570 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4571 capable(CAP_SYS_NICE)); 4572 } 4573 4574 #ifdef __ARCH_WANT_SYS_NICE 4575 4576 /* 4577 * sys_nice - change the priority of the current process. 4578 * @increment: priority increment 4579 * 4580 * sys_setpriority is a more generic, but much slower function that 4581 * does similar things. 4582 */ 4583 SYSCALL_DEFINE1(nice, int, increment) 4584 { 4585 long nice, retval; 4586 4587 /* 4588 * Setpriority might change our priority at the same moment. 4589 * We don't have to worry. Conceptually one call occurs first 4590 * and we have a single winner. 4591 */ 4592 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 4593 nice = task_nice(current) + increment; 4594 4595 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 4596 if (increment < 0 && !can_nice(current, nice)) 4597 return -EPERM; 4598 4599 retval = security_task_setnice(current, nice); 4600 if (retval) 4601 return retval; 4602 4603 set_user_nice(current, nice); 4604 return 0; 4605 } 4606 4607 #endif 4608 4609 /** 4610 * task_prio - return the priority value of a given task. 4611 * @p: the task in question. 4612 * 4613 * Return: The priority value as seen by users in /proc. 4614 * RT tasks are offset by -200. Normal tasks are centered 4615 * around 0, value goes from -16 to +15. 4616 */ 4617 int task_prio(const struct task_struct *p) 4618 { 4619 return p->prio - MAX_RT_PRIO; 4620 } 4621 4622 /** 4623 * idle_cpu - is a given CPU idle currently? 4624 * @cpu: the processor in question. 4625 * 4626 * Return: 1 if the CPU is currently idle. 0 otherwise. 4627 */ 4628 int idle_cpu(int cpu) 4629 { 4630 struct rq *rq = cpu_rq(cpu); 4631 4632 if (rq->curr != rq->idle) 4633 return 0; 4634 4635 if (rq->nr_running) 4636 return 0; 4637 4638 #ifdef CONFIG_SMP 4639 if (!llist_empty(&rq->wake_list)) 4640 return 0; 4641 #endif 4642 4643 return 1; 4644 } 4645 4646 /** 4647 * available_idle_cpu - is a given CPU idle for enqueuing work. 4648 * @cpu: the CPU in question. 
4649 * 4650 * Return: 1 if the CPU is currently idle. 0 otherwise. 4651 */ 4652 int available_idle_cpu(int cpu) 4653 { 4654 if (!idle_cpu(cpu)) 4655 return 0; 4656 4657 if (vcpu_is_preempted(cpu)) 4658 return 0; 4659 4660 return 1; 4661 } 4662 4663 /** 4664 * idle_task - return the idle task for a given CPU. 4665 * @cpu: the processor in question. 4666 * 4667 * Return: The idle task for the CPU @cpu. 4668 */ 4669 struct task_struct *idle_task(int cpu) 4670 { 4671 return cpu_rq(cpu)->idle; 4672 } 4673 4674 /** 4675 * find_process_by_pid - find a process with a matching PID value. 4676 * @pid: the pid in question. 4677 * 4678 * The task of @pid, if found. %NULL otherwise. 4679 */ 4680 static struct task_struct *find_process_by_pid(pid_t pid) 4681 { 4682 return pid ? find_task_by_vpid(pid) : current; 4683 } 4684 4685 /* 4686 * sched_setparam() passes in -1 for its policy, to let the functions 4687 * it calls know not to change it. 4688 */ 4689 #define SETPARAM_POLICY -1 4690 4691 static void __setscheduler_params(struct task_struct *p, 4692 const struct sched_attr *attr) 4693 { 4694 int policy = attr->sched_policy; 4695 4696 if (policy == SETPARAM_POLICY) 4697 policy = p->policy; 4698 4699 p->policy = policy; 4700 4701 if (dl_policy(policy)) 4702 __setparam_dl(p, attr); 4703 else if (fair_policy(policy)) 4704 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 4705 4706 /* 4707 * __sched_setscheduler() ensures attr->sched_priority == 0 when 4708 * !rt_policy. Always setting this ensures that things like 4709 * getparam()/getattr() don't report silly values for !rt tasks. 4710 */ 4711 p->rt_priority = attr->sched_priority; 4712 p->normal_prio = normal_prio(p); 4713 set_load_weight(p, true); 4714 } 4715 4716 /* Actually do priority change: must hold pi & rq lock. */ 4717 static void __setscheduler(struct rq *rq, struct task_struct *p, 4718 const struct sched_attr *attr, bool keep_boost) 4719 { 4720 /* 4721 * If params can't change scheduling class changes aren't allowed 4722 * either. 4723 */ 4724 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) 4725 return; 4726 4727 __setscheduler_params(p, attr); 4728 4729 /* 4730 * Keep a potential priority boosting if called from 4731 * sched_setscheduler(). 4732 */ 4733 p->prio = normal_prio(p); 4734 if (keep_boost) 4735 p->prio = rt_effective_prio(p, p->prio); 4736 4737 if (dl_prio(p->prio)) 4738 p->sched_class = &dl_sched_class; 4739 else if (rt_prio(p->prio)) 4740 p->sched_class = &rt_sched_class; 4741 else 4742 p->sched_class = &fair_sched_class; 4743 } 4744 4745 /* 4746 * Check the target process has a UID that matches the current process's: 4747 */ 4748 static bool check_same_owner(struct task_struct *p) 4749 { 4750 const struct cred *cred = current_cred(), *pcred; 4751 bool match; 4752 4753 rcu_read_lock(); 4754 pcred = __task_cred(p); 4755 match = (uid_eq(cred->euid, pcred->euid) || 4756 uid_eq(cred->euid, pcred->uid)); 4757 rcu_read_unlock(); 4758 return match; 4759 } 4760 4761 static int __sched_setscheduler(struct task_struct *p, 4762 const struct sched_attr *attr, 4763 bool user, bool pi) 4764 { 4765 int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : 4766 MAX_RT_PRIO - 1 - attr->sched_priority; 4767 int retval, oldprio, oldpolicy = -1, queued, running; 4768 int new_effective_prio, policy = attr->sched_policy; 4769 const struct sched_class *prev_class; 4770 struct rq_flags rf; 4771 int reset_on_fork; 4772 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 4773 struct rq *rq; 4774 4775 /* The pi code expects interrupts enabled */ 4776 BUG_ON(pi && in_interrupt()); 4777 recheck: 4778 /* Double check policy once rq lock held: */ 4779 if (policy < 0) { 4780 reset_on_fork = p->sched_reset_on_fork; 4781 policy = oldpolicy = p->policy; 4782 } else { 4783 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 4784 4785 if (!valid_policy(policy)) 4786 return -EINVAL; 4787 } 4788 4789 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) 4790 return -EINVAL; 4791 4792 /* 4793 * Valid priorities for SCHED_FIFO and SCHED_RR are 4794 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4795 * SCHED_BATCH and SCHED_IDLE is 0. 4796 */ 4797 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 4798 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 4799 return -EINVAL; 4800 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 4801 (rt_policy(policy) != (attr->sched_priority != 0))) 4802 return -EINVAL; 4803 4804 /* 4805 * Allow unprivileged RT tasks to decrease priority: 4806 */ 4807 if (user && !capable(CAP_SYS_NICE)) { 4808 if (fair_policy(policy)) { 4809 if (attr->sched_nice < task_nice(p) && 4810 !can_nice(p, attr->sched_nice)) 4811 return -EPERM; 4812 } 4813 4814 if (rt_policy(policy)) { 4815 unsigned long rlim_rtprio = 4816 task_rlimit(p, RLIMIT_RTPRIO); 4817 4818 /* Can't set/change the rt policy: */ 4819 if (policy != p->policy && !rlim_rtprio) 4820 return -EPERM; 4821 4822 /* Can't increase priority: */ 4823 if (attr->sched_priority > p->rt_priority && 4824 attr->sched_priority > rlim_rtprio) 4825 return -EPERM; 4826 } 4827 4828 /* 4829 * Can't set/change SCHED_DEADLINE policy at all for now 4830 * (safest behavior); in the future we would like to allow 4831 * unprivileged DL tasks to increase their relative deadline 4832 * or reduce their runtime (both ways reducing utilization) 4833 */ 4834 if (dl_policy(policy)) 4835 return -EPERM; 4836 4837 /* 4838 * Treat SCHED_IDLE as nice 20. Only allow a switch to 4839 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 4840 */ 4841 if (task_has_idle_policy(p) && !idle_policy(policy)) { 4842 if (!can_nice(p, task_nice(p))) 4843 return -EPERM; 4844 } 4845 4846 /* Can't change other user's priorities: */ 4847 if (!check_same_owner(p)) 4848 return -EPERM; 4849 4850 /* Normal users shall not reset the sched_reset_on_fork flag: */ 4851 if (p->sched_reset_on_fork && !reset_on_fork) 4852 return -EPERM; 4853 } 4854 4855 if (user) { 4856 if (attr->sched_flags & SCHED_FLAG_SUGOV) 4857 return -EINVAL; 4858 4859 retval = security_task_setscheduler(p); 4860 if (retval) 4861 return retval; 4862 } 4863 4864 /* Update task specific "requested" clamps */ 4865 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { 4866 retval = uclamp_validate(p, attr); 4867 if (retval) 4868 return retval; 4869 } 4870 4871 if (pi) 4872 cpuset_read_lock(); 4873 4874 /* 4875 * Make sure no PI-waiters arrive (or leave) while we are 4876 * changing the priority of the task: 4877 * 4878 * To be able to change p->policy safely, the appropriate 4879 * runqueue lock must be held. 
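 *
 * task_rq_lock() below takes both p->pi_lock and rq->lock, which
 * covers both requirements.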
4880 */ 4881 rq = task_rq_lock(p, &rf); 4882 update_rq_clock(rq); 4883 4884 /* 4885 * Changing the policy of the stop threads is a very bad idea: 4886 */ 4887 if (p == rq->stop) { 4888 retval = -EINVAL; 4889 goto unlock; 4890 } 4891 4892 /* 4893 * If not changing anything there's no need to proceed further, 4894 * but store a possible modification of reset_on_fork. 4895 */ 4896 if (unlikely(policy == p->policy)) { 4897 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 4898 goto change; 4899 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 4900 goto change; 4901 if (dl_policy(policy) && dl_param_changed(p, attr)) 4902 goto change; 4903 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) 4904 goto change; 4905 4906 p->sched_reset_on_fork = reset_on_fork; 4907 retval = 0; 4908 goto unlock; 4909 } 4910 change: 4911 4912 if (user) { 4913 #ifdef CONFIG_RT_GROUP_SCHED 4914 /* 4915 * Do not allow realtime tasks into groups that have no runtime 4916 * assigned. 4917 */ 4918 if (rt_bandwidth_enabled() && rt_policy(policy) && 4919 task_group(p)->rt_bandwidth.rt_runtime == 0 && 4920 !task_group_is_autogroup(task_group(p))) { 4921 retval = -EPERM; 4922 goto unlock; 4923 } 4924 #endif 4925 #ifdef CONFIG_SMP 4926 if (dl_bandwidth_enabled() && dl_policy(policy) && 4927 !(attr->sched_flags & SCHED_FLAG_SUGOV)) { 4928 cpumask_t *span = rq->rd->span; 4929 4930 /* 4931 * Don't allow tasks with an affinity mask smaller than 4932 * the entire root_domain to become SCHED_DEADLINE. We 4933 * will also fail if there's no bandwidth available. 4934 */ 4935 if (!cpumask_subset(span, p->cpus_ptr) || 4936 rq->rd->dl_bw.bw == 0) { 4937 retval = -EPERM; 4938 goto unlock; 4939 } 4940 } 4941 #endif 4942 } 4943 4944 /* Re-check policy now with rq lock held: */ 4945 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4946 policy = oldpolicy = -1; 4947 task_rq_unlock(rq, p, &rf); 4948 if (pi) 4949 cpuset_read_unlock(); 4950 goto recheck; 4951 } 4952 4953 /* 4954 * If setscheduling to SCHED_DEADLINE (or changing the parameters 4955 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 4956 * is available. 4957 */ 4958 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { 4959 retval = -EBUSY; 4960 goto unlock; 4961 } 4962 4963 p->sched_reset_on_fork = reset_on_fork; 4964 oldprio = p->prio; 4965 4966 if (pi) { 4967 /* 4968 * Take priority boosted tasks into account. If the new 4969 * effective priority is unchanged, we just store the new 4970 * normal parameters and do not touch the scheduler class and 4971 * the runqueue. This will be done when the task deboosts 4972 * itself. 4973 */ 4974 new_effective_prio = rt_effective_prio(p, newprio); 4975 if (new_effective_prio == oldprio) 4976 queue_flags &= ~DEQUEUE_MOVE; 4977 } 4978 4979 queued = task_on_rq_queued(p); 4980 running = task_current(rq, p); 4981 if (queued) 4982 dequeue_task(rq, p, queue_flags); 4983 if (running) 4984 put_prev_task(rq, p); 4985 4986 prev_class = p->sched_class; 4987 4988 __setscheduler(rq, p, attr, pi); 4989 __setscheduler_uclamp(p, attr); 4990 4991 if (queued) { 4992 /* 4993 * We enqueue to tail when the priority of a task is 4994 * increased (user space view).
4995 */ 4996 if (oldprio < p->prio) 4997 queue_flags |= ENQUEUE_HEAD; 4998 4999 enqueue_task(rq, p, queue_flags); 5000 } 5001 if (running) 5002 set_next_task(rq, p); 5003 5004 check_class_changed(rq, p, prev_class, oldprio); 5005 5006 /* Avoid rq from going away on us: */ 5007 preempt_disable(); 5008 task_rq_unlock(rq, p, &rf); 5009 5010 if (pi) { 5011 cpuset_read_unlock(); 5012 rt_mutex_adjust_pi(p); 5013 } 5014 5015 /* Run balance callbacks after we've adjusted the PI chain: */ 5016 balance_callback(rq); 5017 preempt_enable(); 5018 5019 return 0; 5020 5021 unlock: 5022 task_rq_unlock(rq, p, &rf); 5023 if (pi) 5024 cpuset_read_unlock(); 5025 return retval; 5026 } 5027 5028 static int _sched_setscheduler(struct task_struct *p, int policy, 5029 const struct sched_param *param, bool check) 5030 { 5031 struct sched_attr attr = { 5032 .sched_policy = policy, 5033 .sched_priority = param->sched_priority, 5034 .sched_nice = PRIO_TO_NICE(p->static_prio), 5035 }; 5036 5037 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 5038 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 5039 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 5040 policy &= ~SCHED_RESET_ON_FORK; 5041 attr.sched_policy = policy; 5042 } 5043 5044 return __sched_setscheduler(p, &attr, check, true); 5045 } 5046 /** 5047 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5048 * @p: the task in question. 5049 * @policy: new policy. 5050 * @param: structure containing the new RT priority. 5051 * 5052 * Return: 0 on success. An error code otherwise. 5053 * 5054 * NOTE that the task may be already dead. 5055 */ 5056 int sched_setscheduler(struct task_struct *p, int policy, 5057 const struct sched_param *param) 5058 { 5059 return _sched_setscheduler(p, policy, param, true); 5060 } 5061 EXPORT_SYMBOL_GPL(sched_setscheduler); 5062 5063 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 5064 { 5065 return __sched_setscheduler(p, attr, true, true); 5066 } 5067 EXPORT_SYMBOL_GPL(sched_setattr); 5068 5069 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) 5070 { 5071 return __sched_setscheduler(p, attr, false, true); 5072 } 5073 5074 /** 5075 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 5076 * @p: the task in question. 5077 * @policy: new policy. 5078 * @param: structure containing the new RT priority. 5079 * 5080 * Just like sched_setscheduler, only don't bother checking if the 5081 * current context has permission. For example, this is needed in 5082 * stop_machine(): we create temporary high priority worker threads, 5083 * but our caller might not have that capability. 5084 * 5085 * Return: 0 on success. An error code otherwise. 
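 *
 * Illustrative use (not tied to any particular caller) for a kernel
 * thread that wants to run at the highest FIFO priority:
 *
 *	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 *
 *	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);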
5086 */ 5087 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 5088 const struct sched_param *param) 5089 { 5090 return _sched_setscheduler(p, policy, param, false); 5091 } 5092 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); 5093 5094 static int 5095 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5096 { 5097 struct sched_param lparam; 5098 struct task_struct *p; 5099 int retval; 5100 5101 if (!param || pid < 0) 5102 return -EINVAL; 5103 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 5104 return -EFAULT; 5105 5106 rcu_read_lock(); 5107 retval = -ESRCH; 5108 p = find_process_by_pid(pid); 5109 if (likely(p)) 5110 get_task_struct(p); 5111 rcu_read_unlock(); 5112 5113 if (likely(p)) { 5114 retval = sched_setscheduler(p, policy, &lparam); 5115 put_task_struct(p); 5116 } 5117 5118 return retval; 5119 } 5120 5121 /* 5122 * Mimics kernel/events/core.c perf_copy_attr(). 5123 */ 5124 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) 5125 { 5126 u32 size; 5127 int ret; 5128 5129 /* Zero the full structure, so that a short copy will be nice: */ 5130 memset(attr, 0, sizeof(*attr)); 5131 5132 ret = get_user(size, &uattr->size); 5133 if (ret) 5134 return ret; 5135 5136 /* ABI compatibility quirk: */ 5137 if (!size) 5138 size = SCHED_ATTR_SIZE_VER0; 5139 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) 5140 goto err_size; 5141 5142 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); 5143 if (ret) { 5144 if (ret == -E2BIG) 5145 goto err_size; 5146 return ret; 5147 } 5148 5149 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && 5150 size < SCHED_ATTR_SIZE_VER1) 5151 return -EINVAL; 5152 5153 /* 5154 * XXX: Do we want to be lenient like existing syscalls; or do we want 5155 * to be strict and return an error on out-of-bounds values? 5156 */ 5157 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 5158 5159 return 0; 5160 5161 err_size: 5162 put_user(sizeof(*attr), &uattr->size); 5163 return -E2BIG; 5164 } 5165 5166 /** 5167 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 5168 * @pid: the pid in question. 5169 * @policy: new policy. 5170 * @param: structure containing the new RT priority. 5171 * 5172 * Return: 0 on success. An error code otherwise. 5173 */ 5174 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) 5175 { 5176 if (policy < 0) 5177 return -EINVAL; 5178 5179 return do_sched_setscheduler(pid, policy, param); 5180 } 5181 5182 /** 5183 * sys_sched_setparam - set/change the RT priority of a thread 5184 * @pid: the pid in question. 5185 * @param: structure containing the new RT priority. 5186 * 5187 * Return: 0 on success. An error code otherwise. 5188 */ 5189 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 5190 { 5191 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 5192 } 5193 5194 /** 5195 * sys_sched_setattr - same as above, but with extended sched_attr 5196 * @pid: the pid in question. 5197 * @uattr: structure containing the extended parameters. 5198 * @flags: for future extension. 
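 *
 * Return: 0 on success. An error code otherwise.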
5199 */ 5200 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 5201 unsigned int, flags) 5202 { 5203 struct sched_attr attr; 5204 struct task_struct *p; 5205 int retval; 5206 5207 if (!uattr || pid < 0 || flags) 5208 return -EINVAL; 5209 5210 retval = sched_copy_attr(uattr, &attr); 5211 if (retval) 5212 return retval; 5213 5214 if ((int)attr.sched_policy < 0) 5215 return -EINVAL; 5216 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) 5217 attr.sched_policy = SETPARAM_POLICY; 5218 5219 rcu_read_lock(); 5220 retval = -ESRCH; 5221 p = find_process_by_pid(pid); 5222 if (likely(p)) 5223 get_task_struct(p); 5224 rcu_read_unlock(); 5225 5226 if (likely(p)) { 5227 retval = sched_setattr(p, &attr); 5228 put_task_struct(p); 5229 } 5230 5231 return retval; 5232 } 5233 5234 /** 5235 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5236 * @pid: the pid in question. 5237 * 5238 * Return: On success, the policy of the thread. Otherwise, a negative error 5239 * code. 5240 */ 5241 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 5242 { 5243 struct task_struct *p; 5244 int retval; 5245 5246 if (pid < 0) 5247 return -EINVAL; 5248 5249 retval = -ESRCH; 5250 rcu_read_lock(); 5251 p = find_process_by_pid(pid); 5252 if (p) { 5253 retval = security_task_getscheduler(p); 5254 if (!retval) 5255 retval = p->policy 5256 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 5257 } 5258 rcu_read_unlock(); 5259 return retval; 5260 } 5261 5262 /** 5263 * sys_sched_getparam - get the RT priority of a thread 5264 * @pid: the pid in question. 5265 * @param: structure containing the RT priority. 5266 * 5267 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 5268 * code. 5269 */ 5270 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 5271 { 5272 struct sched_param lp = { .sched_priority = 0 }; 5273 struct task_struct *p; 5274 int retval; 5275 5276 if (!param || pid < 0) 5277 return -EINVAL; 5278 5279 rcu_read_lock(); 5280 p = find_process_by_pid(pid); 5281 retval = -ESRCH; 5282 if (!p) 5283 goto out_unlock; 5284 5285 retval = security_task_getscheduler(p); 5286 if (retval) 5287 goto out_unlock; 5288 5289 if (task_has_rt_policy(p)) 5290 lp.sched_priority = p->rt_priority; 5291 rcu_read_unlock(); 5292 5293 /* 5294 * This one might sleep, we cannot do it with a spinlock held ... 5295 */ 5296 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 5297 5298 return retval; 5299 5300 out_unlock: 5301 rcu_read_unlock(); 5302 return retval; 5303 } 5304 5305 /* 5306 * Copy the kernel size attribute structure (which might be larger 5307 * than what user-space knows about) to user-space. 5308 * 5309 * Note that all cases are valid: user-space buffer can be larger or 5310 * smaller than the kernel-space buffer. The usual case is that both 5311 * have the same size. 5312 */ 5313 static int 5314 sched_attr_copy_to_user(struct sched_attr __user *uattr, 5315 struct sched_attr *kattr, 5316 unsigned int usize) 5317 { 5318 unsigned int ksize = sizeof(*kattr); 5319 5320 if (!access_ok(uattr, usize)) 5321 return -EFAULT; 5322 5323 /* 5324 * sched_getattr() ABI forwards and backwards compatibility: 5325 * 5326 * If usize == ksize then we just copy everything to user-space and all is good. 5327 * 5328 * If usize < ksize then we only copy as much as user-space has space for, 5329 * this keeps ABI compatibility as well. We skip the rest. 
5330 * 5331 * If usize > ksize then user-space is using a newer version of the ABI, 5332 * which part the kernel doesn't know about. Just ignore it - tooling can 5333 * detect the kernel's knowledge of attributes from the attr->size value 5334 * which is set to ksize in this case. 5335 */ 5336 kattr->size = min(usize, ksize); 5337 5338 if (copy_to_user(uattr, kattr, kattr->size)) 5339 return -EFAULT; 5340 5341 return 0; 5342 } 5343 5344 /** 5345 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 5346 * @pid: the pid in question. 5347 * @uattr: structure containing the extended parameters. 5348 * @usize: sizeof(attr) for fwd/bwd comp. 5349 * @flags: for future extension. 5350 */ 5351 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 5352 unsigned int, usize, unsigned int, flags) 5353 { 5354 struct sched_attr kattr = { }; 5355 struct task_struct *p; 5356 int retval; 5357 5358 if (!uattr || pid < 0 || usize > PAGE_SIZE || 5359 usize < SCHED_ATTR_SIZE_VER0 || flags) 5360 return -EINVAL; 5361 5362 rcu_read_lock(); 5363 p = find_process_by_pid(pid); 5364 retval = -ESRCH; 5365 if (!p) 5366 goto out_unlock; 5367 5368 retval = security_task_getscheduler(p); 5369 if (retval) 5370 goto out_unlock; 5371 5372 kattr.sched_policy = p->policy; 5373 if (p->sched_reset_on_fork) 5374 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 5375 if (task_has_dl_policy(p)) 5376 __getparam_dl(p, &kattr); 5377 else if (task_has_rt_policy(p)) 5378 kattr.sched_priority = p->rt_priority; 5379 else 5380 kattr.sched_nice = task_nice(p); 5381 5382 #ifdef CONFIG_UCLAMP_TASK 5383 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; 5384 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; 5385 #endif 5386 5387 rcu_read_unlock(); 5388 5389 return sched_attr_copy_to_user(uattr, &kattr, usize); 5390 5391 out_unlock: 5392 rcu_read_unlock(); 5393 return retval; 5394 } 5395 5396 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 5397 { 5398 cpumask_var_t cpus_allowed, new_mask; 5399 struct task_struct *p; 5400 int retval; 5401 5402 rcu_read_lock(); 5403 5404 p = find_process_by_pid(pid); 5405 if (!p) { 5406 rcu_read_unlock(); 5407 return -ESRCH; 5408 } 5409 5410 /* Prevent p going away */ 5411 get_task_struct(p); 5412 rcu_read_unlock(); 5413 5414 if (p->flags & PF_NO_SETAFFINITY) { 5415 retval = -EINVAL; 5416 goto out_put_task; 5417 } 5418 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 5419 retval = -ENOMEM; 5420 goto out_put_task; 5421 } 5422 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 5423 retval = -ENOMEM; 5424 goto out_free_cpus_allowed; 5425 } 5426 retval = -EPERM; 5427 if (!check_same_owner(p)) { 5428 rcu_read_lock(); 5429 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 5430 rcu_read_unlock(); 5431 goto out_free_new_mask; 5432 } 5433 rcu_read_unlock(); 5434 } 5435 5436 retval = security_task_setscheduler(p); 5437 if (retval) 5438 goto out_free_new_mask; 5439 5440 5441 cpuset_cpus_allowed(p, cpus_allowed); 5442 cpumask_and(new_mask, in_mask, cpus_allowed); 5443 5444 /* 5445 * Since bandwidth control happens on root_domain basis, 5446 * if admission test is enabled, we only admit -deadline 5447 * tasks allowed to run on all the CPUs in the task's 5448 * root_domain. 
5449 */ 5450 #ifdef CONFIG_SMP 5451 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 5452 rcu_read_lock(); 5453 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 5454 retval = -EBUSY; 5455 rcu_read_unlock(); 5456 goto out_free_new_mask; 5457 } 5458 rcu_read_unlock(); 5459 } 5460 #endif 5461 again: 5462 retval = __set_cpus_allowed_ptr(p, new_mask, true); 5463 5464 if (!retval) { 5465 cpuset_cpus_allowed(p, cpus_allowed); 5466 if (!cpumask_subset(new_mask, cpus_allowed)) { 5467 /* 5468 * We must have raced with a concurrent cpuset 5469 * update. Just reset the cpus_allowed to the 5470 * cpuset's cpus_allowed 5471 */ 5472 cpumask_copy(new_mask, cpus_allowed); 5473 goto again; 5474 } 5475 } 5476 out_free_new_mask: 5477 free_cpumask_var(new_mask); 5478 out_free_cpus_allowed: 5479 free_cpumask_var(cpus_allowed); 5480 out_put_task: 5481 put_task_struct(p); 5482 return retval; 5483 } 5484 5485 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5486 struct cpumask *new_mask) 5487 { 5488 if (len < cpumask_size()) 5489 cpumask_clear(new_mask); 5490 else if (len > cpumask_size()) 5491 len = cpumask_size(); 5492 5493 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5494 } 5495 5496 /** 5497 * sys_sched_setaffinity - set the CPU affinity of a process 5498 * @pid: pid of the process 5499 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5500 * @user_mask_ptr: user-space pointer to the new CPU mask 5501 * 5502 * Return: 0 on success. An error code otherwise. 5503 */ 5504 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 5505 unsigned long __user *, user_mask_ptr) 5506 { 5507 cpumask_var_t new_mask; 5508 int retval; 5509 5510 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 5511 return -ENOMEM; 5512 5513 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 5514 if (retval == 0) 5515 retval = sched_setaffinity(pid, new_mask); 5516 free_cpumask_var(new_mask); 5517 return retval; 5518 } 5519 5520 long sched_getaffinity(pid_t pid, struct cpumask *mask) 5521 { 5522 struct task_struct *p; 5523 unsigned long flags; 5524 int retval; 5525 5526 rcu_read_lock(); 5527 5528 retval = -ESRCH; 5529 p = find_process_by_pid(pid); 5530 if (!p) 5531 goto out_unlock; 5532 5533 retval = security_task_getscheduler(p); 5534 if (retval) 5535 goto out_unlock; 5536 5537 raw_spin_lock_irqsave(&p->pi_lock, flags); 5538 cpumask_and(mask, &p->cpus_mask, cpu_active_mask); 5539 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5540 5541 out_unlock: 5542 rcu_read_unlock(); 5543 5544 return retval; 5545 } 5546 5547 /** 5548 * sys_sched_getaffinity - get the CPU affinity of a process 5549 * @pid: pid of the process 5550 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5551 * @user_mask_ptr: user-space pointer to hold the current CPU mask 5552 * 5553 * Return: size of CPU mask copied to user_mask_ptr on success. An 5554 * error code otherwise. 
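 *
 * (This is the raw syscall behaviour; the glibc sched_getaffinity()
 * wrapper converts the byte count into 0 on success.)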
5555 */ 5556 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 5557 unsigned long __user *, user_mask_ptr) 5558 { 5559 int ret; 5560 cpumask_var_t mask; 5561 5562 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 5563 return -EINVAL; 5564 if (len & (sizeof(unsigned long)-1)) 5565 return -EINVAL; 5566 5567 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 5568 return -ENOMEM; 5569 5570 ret = sched_getaffinity(pid, mask); 5571 if (ret == 0) { 5572 unsigned int retlen = min(len, cpumask_size()); 5573 5574 if (copy_to_user(user_mask_ptr, mask, retlen)) 5575 ret = -EFAULT; 5576 else 5577 ret = retlen; 5578 } 5579 free_cpumask_var(mask); 5580 5581 return ret; 5582 } 5583 5584 /** 5585 * sys_sched_yield - yield the current processor to other threads. 5586 * 5587 * This function yields the current CPU to other tasks. If there are no 5588 * other threads running on this CPU then this function will return. 5589 * 5590 * Return: 0. 5591 */ 5592 static void do_sched_yield(void) 5593 { 5594 struct rq_flags rf; 5595 struct rq *rq; 5596 5597 rq = this_rq_lock_irq(&rf); 5598 5599 schedstat_inc(rq->yld_count); 5600 current->sched_class->yield_task(rq); 5601 5602 /* 5603 * Since we are going to call schedule() anyway, there's 5604 * no need to preempt or enable interrupts: 5605 */ 5606 preempt_disable(); 5607 rq_unlock(rq, &rf); 5608 sched_preempt_enable_no_resched(); 5609 5610 schedule(); 5611 } 5612 5613 SYSCALL_DEFINE0(sched_yield) 5614 { 5615 do_sched_yield(); 5616 return 0; 5617 } 5618 5619 #ifndef CONFIG_PREEMPTION 5620 int __sched _cond_resched(void) 5621 { 5622 if (should_resched(0)) { 5623 preempt_schedule_common(); 5624 return 1; 5625 } 5626 rcu_all_qs(); 5627 return 0; 5628 } 5629 EXPORT_SYMBOL(_cond_resched); 5630 #endif 5631 5632 /* 5633 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5634 * call schedule, and on return reacquire the lock. 5635 * 5636 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level 5637 * operations here to prevent schedule() from being called twice (once via 5638 * spin_unlock(), once by hand). 5639 */ 5640 int __cond_resched_lock(spinlock_t *lock) 5641 { 5642 int resched = should_resched(PREEMPT_LOCK_OFFSET); 5643 int ret = 0; 5644 5645 lockdep_assert_held(lock); 5646 5647 if (spin_needbreak(lock) || resched) { 5648 spin_unlock(lock); 5649 if (resched) 5650 preempt_schedule_common(); 5651 else 5652 cpu_relax(); 5653 ret = 1; 5654 spin_lock(lock); 5655 } 5656 return ret; 5657 } 5658 EXPORT_SYMBOL(__cond_resched_lock); 5659 5660 /** 5661 * yield - yield the current processor to other threads. 5662 * 5663 * Do not ever use this function, there's a 99% chance you're doing it wrong. 5664 * 5665 * The scheduler is at all times free to pick the calling task as the most 5666 * eligible task to run, if removing the yield() call from your code breaks 5667 * it, its already broken. 5668 * 5669 * Typical broken usage is: 5670 * 5671 * while (!event) 5672 * yield(); 5673 * 5674 * where one assumes that yield() will let 'the other' process run that will 5675 * make event true. If the current task is a SCHED_FIFO task that will never 5676 * happen. Never use yield() as a progress guarantee!! 5677 * 5678 * If you want to use yield() to wait for something, use wait_event(). 5679 * If you want to use yield() to be 'nice' for others, use cond_resched(). 5680 * If you still want to use yield(), do not! 
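 *
 * For the broken loop above that would mean (illustrative, assuming the
 * writer of 'event' also does a wake_up() on a wait queue 'wq'):
 *
 *	wait_event(wq, event);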
5681 */ 5682 void __sched yield(void) 5683 { 5684 set_current_state(TASK_RUNNING); 5685 do_sched_yield(); 5686 } 5687 EXPORT_SYMBOL(yield); 5688 5689 /** 5690 * yield_to - yield the current processor to another thread in 5691 * your thread group, or accelerate that thread toward the 5692 * processor it's on. 5693 * @p: target task 5694 * @preempt: whether task preemption is allowed or not 5695 * 5696 * It's the caller's job to ensure that the target task struct 5697 * can't go away on us before we can do any checks. 5698 * 5699 * Return: 5700 * true (>0) if we indeed boosted the target task. 5701 * false (0) if we failed to boost the target. 5702 * -ESRCH if there's no task to yield to. 5703 */ 5704 int __sched yield_to(struct task_struct *p, bool preempt) 5705 { 5706 struct task_struct *curr = current; 5707 struct rq *rq, *p_rq; 5708 unsigned long flags; 5709 int yielded = 0; 5710 5711 local_irq_save(flags); 5712 rq = this_rq(); 5713 5714 again: 5715 p_rq = task_rq(p); 5716 /* 5717 * If we're the only runnable task on the rq and target rq also 5718 * has only one task, there's absolutely no point in yielding. 5719 */ 5720 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 5721 yielded = -ESRCH; 5722 goto out_irq; 5723 } 5724 5725 double_rq_lock(rq, p_rq); 5726 if (task_rq(p) != p_rq) { 5727 double_rq_unlock(rq, p_rq); 5728 goto again; 5729 } 5730 5731 if (!curr->sched_class->yield_to_task) 5732 goto out_unlock; 5733 5734 if (curr->sched_class != p->sched_class) 5735 goto out_unlock; 5736 5737 if (task_running(p_rq, p) || p->state) 5738 goto out_unlock; 5739 5740 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5741 if (yielded) { 5742 schedstat_inc(rq->yld_count); 5743 /* 5744 * Make p's CPU reschedule; pick_next_entity takes care of 5745 * fairness. 5746 */ 5747 if (preempt && rq != p_rq) 5748 resched_curr(p_rq); 5749 } 5750 5751 out_unlock: 5752 double_rq_unlock(rq, p_rq); 5753 out_irq: 5754 local_irq_restore(flags); 5755 5756 if (yielded > 0) 5757 schedule(); 5758 5759 return yielded; 5760 } 5761 EXPORT_SYMBOL_GPL(yield_to); 5762 5763 int io_schedule_prepare(void) 5764 { 5765 int old_iowait = current->in_iowait; 5766 5767 current->in_iowait = 1; 5768 blk_schedule_flush_plug(current); 5769 5770 return old_iowait; 5771 } 5772 5773 void io_schedule_finish(int token) 5774 { 5775 current->in_iowait = token; 5776 } 5777 5778 /* 5779 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5780 * that process accounting knows that this is a task in IO wait state. 5781 */ 5782 long __sched io_schedule_timeout(long timeout) 5783 { 5784 int token; 5785 long ret; 5786 5787 token = io_schedule_prepare(); 5788 ret = schedule_timeout(timeout); 5789 io_schedule_finish(token); 5790 5791 return ret; 5792 } 5793 EXPORT_SYMBOL(io_schedule_timeout); 5794 5795 void __sched io_schedule(void) 5796 { 5797 int token; 5798 5799 token = io_schedule_prepare(); 5800 schedule(); 5801 io_schedule_finish(token); 5802 } 5803 EXPORT_SYMBOL(io_schedule); 5804 5805 /** 5806 * sys_sched_get_priority_max - return maximum RT priority. 5807 * @policy: scheduling class. 5808 * 5809 * Return: On success, this syscall returns the maximum 5810 * rt_priority that can be used by a given scheduling class. 5811 * On failure, a negative error code is returned. 
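 *
 * (For SCHED_FIFO and SCHED_RR this is MAX_USER_RT_PRIO-1, normally 99;
 * for SCHED_DEADLINE, SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE it is 0.)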
5812 */ 5813 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5814 { 5815 int ret = -EINVAL; 5816 5817 switch (policy) { 5818 case SCHED_FIFO: 5819 case SCHED_RR: 5820 ret = MAX_USER_RT_PRIO-1; 5821 break; 5822 case SCHED_DEADLINE: 5823 case SCHED_NORMAL: 5824 case SCHED_BATCH: 5825 case SCHED_IDLE: 5826 ret = 0; 5827 break; 5828 } 5829 return ret; 5830 } 5831 5832 /** 5833 * sys_sched_get_priority_min - return minimum RT priority. 5834 * @policy: scheduling class. 5835 * 5836 * Return: On success, this syscall returns the minimum 5837 * rt_priority that can be used by a given scheduling class. 5838 * On failure, a negative error code is returned. 5839 */ 5840 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5841 { 5842 int ret = -EINVAL; 5843 5844 switch (policy) { 5845 case SCHED_FIFO: 5846 case SCHED_RR: 5847 ret = 1; 5848 break; 5849 case SCHED_DEADLINE: 5850 case SCHED_NORMAL: 5851 case SCHED_BATCH: 5852 case SCHED_IDLE: 5853 ret = 0; 5854 } 5855 return ret; 5856 } 5857 5858 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) 5859 { 5860 struct task_struct *p; 5861 unsigned int time_slice; 5862 struct rq_flags rf; 5863 struct rq *rq; 5864 int retval; 5865 5866 if (pid < 0) 5867 return -EINVAL; 5868 5869 retval = -ESRCH; 5870 rcu_read_lock(); 5871 p = find_process_by_pid(pid); 5872 if (!p) 5873 goto out_unlock; 5874 5875 retval = security_task_getscheduler(p); 5876 if (retval) 5877 goto out_unlock; 5878 5879 rq = task_rq_lock(p, &rf); 5880 time_slice = 0; 5881 if (p->sched_class->get_rr_interval) 5882 time_slice = p->sched_class->get_rr_interval(rq, p); 5883 task_rq_unlock(rq, p, &rf); 5884 5885 rcu_read_unlock(); 5886 jiffies_to_timespec64(time_slice, t); 5887 return 0; 5888 5889 out_unlock: 5890 rcu_read_unlock(); 5891 return retval; 5892 } 5893 5894 /** 5895 * sys_sched_rr_get_interval - return the default timeslice of a process. 5896 * @pid: pid of the process. 5897 * @interval: userspace pointer to the timeslice value. 5898 * 5899 * this syscall writes the default timeslice value of a given process 5900 * into the user-space timespec buffer. A value of '0' means infinity. 5901 * 5902 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 5903 * an error code. 
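 *
 * Illustrative user-space usage (via the C library wrapper):
 *
 *	struct timespec ts;
 *
 *	sched_rr_get_interval(0, &ts);	// pid 0 means the calling thread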
5904 */ 5905 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5906 struct __kernel_timespec __user *, interval) 5907 { 5908 struct timespec64 t; 5909 int retval = sched_rr_get_interval(pid, &t); 5910 5911 if (retval == 0) 5912 retval = put_timespec64(&t, interval); 5913 5914 return retval; 5915 } 5916 5917 #ifdef CONFIG_COMPAT_32BIT_TIME 5918 SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, 5919 struct old_timespec32 __user *, interval) 5920 { 5921 struct timespec64 t; 5922 int retval = sched_rr_get_interval(pid, &t); 5923 5924 if (retval == 0) 5925 retval = put_old_timespec32(&t, interval); 5926 return retval; 5927 } 5928 #endif 5929 5930 void sched_show_task(struct task_struct *p) 5931 { 5932 unsigned long free = 0; 5933 int ppid; 5934 5935 if (!try_get_task_stack(p)) 5936 return; 5937 5938 printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); 5939 5940 if (p->state == TASK_RUNNING) 5941 printk(KERN_CONT " running task "); 5942 #ifdef CONFIG_DEBUG_STACK_USAGE 5943 free = stack_not_used(p); 5944 #endif 5945 ppid = 0; 5946 rcu_read_lock(); 5947 if (pid_alive(p)) 5948 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 5949 rcu_read_unlock(); 5950 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5951 task_pid_nr(p), ppid, 5952 (unsigned long)task_thread_info(p)->flags); 5953 5954 print_worker_info(KERN_INFO, p); 5955 show_stack(p, NULL); 5956 put_task_stack(p); 5957 } 5958 EXPORT_SYMBOL_GPL(sched_show_task); 5959 5960 static inline bool 5961 state_filter_match(unsigned long state_filter, struct task_struct *p) 5962 { 5963 /* no filter, everything matches */ 5964 if (!state_filter) 5965 return true; 5966 5967 /* filter, but doesn't match */ 5968 if (!(p->state & state_filter)) 5969 return false; 5970 5971 /* 5972 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows 5973 * TASK_KILLABLE). 5974 */ 5975 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) 5976 return false; 5977 5978 return true; 5979 } 5980 5981 5982 void show_state_filter(unsigned long state_filter) 5983 { 5984 struct task_struct *g, *p; 5985 5986 #if BITS_PER_LONG == 32 5987 printk(KERN_INFO 5988 " task PC stack pid father\n"); 5989 #else 5990 printk(KERN_INFO 5991 " task PC stack pid father\n"); 5992 #endif 5993 rcu_read_lock(); 5994 for_each_process_thread(g, p) { 5995 /* 5996 * reset the NMI-timeout, listing all files on a slow 5997 * console might take a lot of time: 5998 * Also, reset softlockup watchdogs on all CPUs, because 5999 * another CPU might be blocked waiting for us to process 6000 * an IPI. 6001 */ 6002 touch_nmi_watchdog(); 6003 touch_all_softlockup_watchdogs(); 6004 if (state_filter_match(state_filter, p)) 6005 sched_show_task(p); 6006 } 6007 6008 #ifdef CONFIG_SCHED_DEBUG 6009 if (!state_filter) 6010 sysrq_sched_debug_show(); 6011 #endif 6012 rcu_read_unlock(); 6013 /* 6014 * Only show locks if all tasks are dumped: 6015 */ 6016 if (!state_filter) 6017 debug_show_all_locks(); 6018 } 6019 6020 /** 6021 * init_idle - set up an idle thread for a given CPU 6022 * @idle: task in question 6023 * @cpu: CPU the idle task belongs to 6024 * 6025 * NOTE: this function does not set the idle thread's NEED_RESCHED 6026 * flag, to make booting more robust. 
6027 */ 6028 void init_idle(struct task_struct *idle, int cpu) 6029 { 6030 struct rq *rq = cpu_rq(cpu); 6031 unsigned long flags; 6032 6033 __sched_fork(0, idle); 6034 6035 raw_spin_lock_irqsave(&idle->pi_lock, flags); 6036 raw_spin_lock(&rq->lock); 6037 6038 idle->state = TASK_RUNNING; 6039 idle->se.exec_start = sched_clock(); 6040 idle->flags |= PF_IDLE; 6041 6042 kasan_unpoison_task_stack(idle); 6043 6044 #ifdef CONFIG_SMP 6045 /* 6046 * It's possible that init_idle() gets called multiple times on a task; 6047 * in that case do_set_cpus_allowed() will not do the right thing. 6048 * 6049 * And since this is boot we can forgo the serialization. 6050 */ 6051 set_cpus_allowed_common(idle, cpumask_of(cpu)); 6052 #endif 6053 /* 6054 * We're having a chicken-and-egg problem: even though we are 6055 * holding rq->lock, the CPU isn't yet set to this CPU so the 6056 * lockdep check in task_group() will fail. 6057 * 6058 * Similar to the sched_fork() case; alternatively we could 6059 * use task_rq_lock() here and obtain the other rq->lock. 6060 * 6061 * Silence PROVE_RCU 6062 */ 6063 rcu_read_lock(); 6064 __set_task_cpu(idle, cpu); 6065 rcu_read_unlock(); 6066 6067 rq->idle = idle; 6068 rcu_assign_pointer(rq->curr, idle); 6069 idle->on_rq = TASK_ON_RQ_QUEUED; 6070 #ifdef CONFIG_SMP 6071 idle->on_cpu = 1; 6072 #endif 6073 raw_spin_unlock(&rq->lock); 6074 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 6075 6076 /* Set the preempt count _outside_ the spinlocks! */ 6077 init_idle_preempt_count(idle, cpu); 6078 6079 /* 6080 * The idle tasks have their own, simple scheduling class: 6081 */ 6082 idle->sched_class = &idle_sched_class; 6083 ftrace_graph_init_idle_task(idle, cpu); 6084 vtime_init_idle(idle, cpu); 6085 #ifdef CONFIG_SMP 6086 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 6087 #endif 6088 } 6089 6090 #ifdef CONFIG_SMP 6091 6092 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 6093 const struct cpumask *trial) 6094 { 6095 int ret = 1; 6096 6097 if (!cpumask_weight(cur)) 6098 return ret; 6099 6100 ret = dl_cpuset_cpumask_can_shrink(cur, trial); 6101 6102 return ret; 6103 } 6104 6105 int task_can_attach(struct task_struct *p, 6106 const struct cpumask *cs_cpus_allowed) 6107 { 6108 int ret = 0; 6109 6110 /* 6111 * Kthreads which disallow setaffinity shouldn't be moved 6112 * to a new cpuset; we don't want to change their CPU 6113 * affinity and isolating such threads by their set of 6114 * allowed nodes is unnecessary. Thus, cpusets are not 6115 * applicable for such threads. This prevents checking for 6116 * success of set_cpus_allowed_ptr() on all attached tasks 6117 * before cpus_mask may be changed.
6118 */ 6119 if (p->flags & PF_NO_SETAFFINITY) { 6120 ret = -EINVAL; 6121 goto out; 6122 } 6123 6124 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 6125 cs_cpus_allowed)) 6126 ret = dl_task_can_attach(p, cs_cpus_allowed); 6127 6128 out: 6129 return ret; 6130 } 6131 6132 bool sched_smp_initialized __read_mostly; 6133 6134 #ifdef CONFIG_NUMA_BALANCING 6135 /* Migrate current task p to target_cpu */ 6136 int migrate_task_to(struct task_struct *p, int target_cpu) 6137 { 6138 struct migration_arg arg = { p, target_cpu }; 6139 int curr_cpu = task_cpu(p); 6140 6141 if (curr_cpu == target_cpu) 6142 return 0; 6143 6144 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) 6145 return -EINVAL; 6146 6147 /* TODO: This is not properly updating schedstats */ 6148 6149 trace_sched_move_numa(p, curr_cpu, target_cpu); 6150 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 6151 } 6152 6153 /* 6154 * Requeue a task on a given node and accurately track the number of NUMA 6155 * tasks on the runqueues 6156 */ 6157 void sched_setnuma(struct task_struct *p, int nid) 6158 { 6159 bool queued, running; 6160 struct rq_flags rf; 6161 struct rq *rq; 6162 6163 rq = task_rq_lock(p, &rf); 6164 queued = task_on_rq_queued(p); 6165 running = task_current(rq, p); 6166 6167 if (queued) 6168 dequeue_task(rq, p, DEQUEUE_SAVE); 6169 if (running) 6170 put_prev_task(rq, p); 6171 6172 p->numa_preferred_nid = nid; 6173 6174 if (queued) 6175 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 6176 if (running) 6177 set_next_task(rq, p); 6178 task_rq_unlock(rq, p, &rf); 6179 } 6180 #endif /* CONFIG_NUMA_BALANCING */ 6181 6182 #ifdef CONFIG_HOTPLUG_CPU 6183 /* 6184 * Ensure that the idle task is using init_mm right before its CPU goes 6185 * offline. 6186 */ 6187 void idle_task_exit(void) 6188 { 6189 struct mm_struct *mm = current->active_mm; 6190 6191 BUG_ON(cpu_online(smp_processor_id())); 6192 6193 if (mm != &init_mm) { 6194 switch_mm(mm, &init_mm, current); 6195 current->active_mm = &init_mm; 6196 finish_arch_post_lock_switch(); 6197 } 6198 mmdrop(mm); 6199 } 6200 6201 /* 6202 * Since this CPU is going 'away' for a while, fold any nr_active delta 6203 * we might have. Assumes we're called after migrate_tasks() so that the 6204 * nr_active count is stable. We need to take the teardown thread which 6205 * is calling this into account, so we hand in adjust = 1 to the load 6206 * calculation. 6207 * 6208 * Also see the comment "Global load-average calculations". 6209 */ 6210 static void calc_load_migrate(struct rq *rq) 6211 { 6212 long delta = calc_load_fold_active(rq, 1); 6213 if (delta) 6214 atomic_long_add(delta, &calc_load_tasks); 6215 } 6216 6217 static struct task_struct *__pick_migrate_task(struct rq *rq) 6218 { 6219 const struct sched_class *class; 6220 struct task_struct *next; 6221 6222 for_each_class(class) { 6223 next = class->pick_next_task(rq); 6224 if (next) { 6225 next->sched_class->put_prev_task(rq, next); 6226 return next; 6227 } 6228 } 6229 6230 /* The idle class should always have a runnable task */ 6231 BUG(); 6232 } 6233 6234 /* 6235 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6236 * try_to_wake_up()->select_task_rq(). 6237 * 6238 * Called with rq->lock held even though we'er in stop_machine() and 6239 * there's no concurrency possible, we hold the required locks anyway 6240 * because of lock validation efforts. 
6241 */ 6242 static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) 6243 { 6244 struct rq *rq = dead_rq; 6245 struct task_struct *next, *stop = rq->stop; 6246 struct rq_flags orf = *rf; 6247 int dest_cpu; 6248 6249 /* 6250 * Fudge the rq selection such that the below task selection loop 6251 * doesn't get stuck on the currently eligible stop task. 6252 * 6253 * We're currently inside stop_machine() and the rq is either stuck 6254 * in the stop_machine_cpu_stop() loop, or we're executing this code, 6255 * either way we should never end up calling schedule() until we're 6256 * done here. 6257 */ 6258 rq->stop = NULL; 6259 6260 /* 6261 * put_prev_task() and pick_next_task() sched 6262 * class method both need to have an up-to-date 6263 * value of rq->clock[_task] 6264 */ 6265 update_rq_clock(rq); 6266 6267 for (;;) { 6268 /* 6269 * There's this thread running, bail when that's the only 6270 * remaining thread: 6271 */ 6272 if (rq->nr_running == 1) 6273 break; 6274 6275 next = __pick_migrate_task(rq); 6276 6277 /* 6278 * Rules for changing task_struct::cpus_mask are holding 6279 * both pi_lock and rq->lock, such that holding either 6280 * stabilizes the mask. 6281 * 6282 * Drop rq->lock is not quite as disastrous as it usually is 6283 * because !cpu_active at this point, which means load-balance 6284 * will not interfere. Also, stop-machine. 6285 */ 6286 rq_unlock(rq, rf); 6287 raw_spin_lock(&next->pi_lock); 6288 rq_relock(rq, rf); 6289 6290 /* 6291 * Since we're inside stop-machine, _nothing_ should have 6292 * changed the task, WARN if weird stuff happened, because in 6293 * that case the above rq->lock drop is a fail too. 6294 */ 6295 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { 6296 raw_spin_unlock(&next->pi_lock); 6297 continue; 6298 } 6299 6300 /* Find suitable destination for @next, with force if needed. */ 6301 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 6302 rq = __migrate_task(rq, rf, next, dest_cpu); 6303 if (rq != dead_rq) { 6304 rq_unlock(rq, rf); 6305 rq = dead_rq; 6306 *rf = orf; 6307 rq_relock(rq, rf); 6308 } 6309 raw_spin_unlock(&next->pi_lock); 6310 } 6311 6312 rq->stop = stop; 6313 } 6314 #endif /* CONFIG_HOTPLUG_CPU */ 6315 6316 void set_rq_online(struct rq *rq) 6317 { 6318 if (!rq->online) { 6319 const struct sched_class *class; 6320 6321 cpumask_set_cpu(rq->cpu, rq->rd->online); 6322 rq->online = 1; 6323 6324 for_each_class(class) { 6325 if (class->rq_online) 6326 class->rq_online(rq); 6327 } 6328 } 6329 } 6330 6331 void set_rq_offline(struct rq *rq) 6332 { 6333 if (rq->online) { 6334 const struct sched_class *class; 6335 6336 for_each_class(class) { 6337 if (class->rq_offline) 6338 class->rq_offline(rq); 6339 } 6340 6341 cpumask_clear_cpu(rq->cpu, rq->rd->online); 6342 rq->online = 0; 6343 } 6344 } 6345 6346 /* 6347 * used to mark begin/end of suspend/resume: 6348 */ 6349 static int num_cpus_frozen; 6350 6351 /* 6352 * Update cpusets according to cpu_active mask. If cpusets are 6353 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6354 * around partition_sched_domains(). 6355 * 6356 * If we come here as part of a suspend/resume, don't touch cpusets because we 6357 * want to restore it back to its original state upon resume anyway. 6358 */ 6359 static void cpuset_cpu_active(void) 6360 { 6361 if (cpuhp_tasks_frozen) { 6362 /* 6363 * num_cpus_frozen tracks how many CPUs are involved in suspend 6364 * resume sequence. 
As long as this is not the last online 6365 * operation in the resume sequence, just build a single sched 6366 * domain, ignoring cpusets. 6367 */ 6368 partition_sched_domains(1, NULL, NULL); 6369 if (--num_cpus_frozen) 6370 return; 6371 /* 6372 * This is the last CPU online operation. So fall through and 6373 * restore the original sched domains by considering the 6374 * cpuset configurations. 6375 */ 6376 cpuset_force_rebuild(); 6377 } 6378 cpuset_update_active_cpus(); 6379 } 6380 6381 static int cpuset_cpu_inactive(unsigned int cpu) 6382 { 6383 if (!cpuhp_tasks_frozen) { 6384 if (dl_cpu_busy(cpu)) 6385 return -EBUSY; 6386 cpuset_update_active_cpus(); 6387 } else { 6388 num_cpus_frozen++; 6389 partition_sched_domains(1, NULL, NULL); 6390 } 6391 return 0; 6392 } 6393 6394 int sched_cpu_activate(unsigned int cpu) 6395 { 6396 struct rq *rq = cpu_rq(cpu); 6397 struct rq_flags rf; 6398 6399 #ifdef CONFIG_SCHED_SMT 6400 /* 6401 * When going up, increment the number of cores with SMT present. 6402 */ 6403 if (cpumask_weight(cpu_smt_mask(cpu)) == 2) 6404 static_branch_inc_cpuslocked(&sched_smt_present); 6405 #endif 6406 set_cpu_active(cpu, true); 6407 6408 if (sched_smp_initialized) { 6409 sched_domains_numa_masks_set(cpu); 6410 cpuset_cpu_active(); 6411 } 6412 6413 /* 6414 * Put the rq online, if not already. This happens: 6415 * 6416 * 1) In the early boot process, because we build the real domains 6417 * after all CPUs have been brought up. 6418 * 6419 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the 6420 * domains. 6421 */ 6422 rq_lock_irqsave(rq, &rf); 6423 if (rq->rd) { 6424 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6425 set_rq_online(rq); 6426 } 6427 rq_unlock_irqrestore(rq, &rf); 6428 6429 return 0; 6430 } 6431 6432 int sched_cpu_deactivate(unsigned int cpu) 6433 { 6434 int ret; 6435 6436 set_cpu_active(cpu, false); 6437 /* 6438 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU 6439 * users of this state to go away such that all new such users will 6440 * observe it. 6441 * 6442 * Do sync before park smpboot threads to take care the rcu boost case. 6443 */ 6444 synchronize_rcu(); 6445 6446 #ifdef CONFIG_SCHED_SMT 6447 /* 6448 * When going down, decrement the number of cores with SMT present. 
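 * The key is only decremented when taking this CPU down leaves its core
 * with a single online sibling, mirroring the increment in
 * sched_cpu_activate() when the second sibling comes up.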
6449 */ 6450 if (cpumask_weight(cpu_smt_mask(cpu)) == 2) 6451 static_branch_dec_cpuslocked(&sched_smt_present); 6452 #endif 6453 6454 if (!sched_smp_initialized) 6455 return 0; 6456 6457 ret = cpuset_cpu_inactive(cpu); 6458 if (ret) { 6459 set_cpu_active(cpu, true); 6460 return ret; 6461 } 6462 sched_domains_numa_masks_clear(cpu); 6463 return 0; 6464 } 6465 6466 static void sched_rq_cpu_starting(unsigned int cpu) 6467 { 6468 struct rq *rq = cpu_rq(cpu); 6469 6470 rq->calc_load_update = calc_load_update; 6471 update_max_interval(); 6472 } 6473 6474 int sched_cpu_starting(unsigned int cpu) 6475 { 6476 sched_rq_cpu_starting(cpu); 6477 sched_tick_start(cpu); 6478 return 0; 6479 } 6480 6481 #ifdef CONFIG_HOTPLUG_CPU 6482 int sched_cpu_dying(unsigned int cpu) 6483 { 6484 struct rq *rq = cpu_rq(cpu); 6485 struct rq_flags rf; 6486 6487 /* Handle pending wakeups and then migrate everything off */ 6488 sched_ttwu_pending(); 6489 sched_tick_stop(cpu); 6490 6491 rq_lock_irqsave(rq, &rf); 6492 if (rq->rd) { 6493 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6494 set_rq_offline(rq); 6495 } 6496 migrate_tasks(rq, &rf); 6497 BUG_ON(rq->nr_running != 1); 6498 rq_unlock_irqrestore(rq, &rf); 6499 6500 calc_load_migrate(rq); 6501 update_max_interval(); 6502 nohz_balance_exit_idle(rq); 6503 hrtick_clear(rq); 6504 return 0; 6505 } 6506 #endif 6507 6508 void __init sched_init_smp(void) 6509 { 6510 sched_init_numa(); 6511 6512 /* 6513 * There's no userspace yet to cause hotplug operations; hence all the 6514 * CPU masks are stable and all blatant races in the below code cannot 6515 * happen. 6516 */ 6517 mutex_lock(&sched_domains_mutex); 6518 sched_init_domains(cpu_active_mask); 6519 mutex_unlock(&sched_domains_mutex); 6520 6521 /* Move init over to a non-isolated CPU */ 6522 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) 6523 BUG(); 6524 sched_init_granularity(); 6525 6526 init_sched_rt_class(); 6527 init_sched_dl_class(); 6528 6529 sched_smp_initialized = true; 6530 } 6531 6532 static int __init migration_init(void) 6533 { 6534 sched_cpu_starting(smp_processor_id()); 6535 return 0; 6536 } 6537 early_initcall(migration_init); 6538 6539 #else 6540 void __init sched_init_smp(void) 6541 { 6542 sched_init_granularity(); 6543 } 6544 #endif /* CONFIG_SMP */ 6545 6546 int in_sched_functions(unsigned long addr) 6547 { 6548 return in_lock_functions(addr) || 6549 (addr >= (unsigned long)__sched_text_start 6550 && addr < (unsigned long)__sched_text_end); 6551 } 6552 6553 #ifdef CONFIG_CGROUP_SCHED 6554 /* 6555 * Default task group. 6556 * Every task in system belongs to this group at bootup. 
6557 */ 6558 struct task_group root_task_group; 6559 LIST_HEAD(task_groups); 6560 6561 /* Cacheline aligned slab cache for task_group */ 6562 static struct kmem_cache *task_group_cache __read_mostly; 6563 #endif 6564 6565 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6566 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); 6567 6568 void __init sched_init(void) 6569 { 6570 unsigned long ptr = 0; 6571 int i; 6572 6573 wait_bit_init(); 6574 6575 #ifdef CONFIG_FAIR_GROUP_SCHED 6576 ptr += 2 * nr_cpu_ids * sizeof(void **); 6577 #endif 6578 #ifdef CONFIG_RT_GROUP_SCHED 6579 ptr += 2 * nr_cpu_ids * sizeof(void **); 6580 #endif 6581 if (ptr) { 6582 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); 6583 6584 #ifdef CONFIG_FAIR_GROUP_SCHED 6585 root_task_group.se = (struct sched_entity **)ptr; 6586 ptr += nr_cpu_ids * sizeof(void **); 6587 6588 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6589 ptr += nr_cpu_ids * sizeof(void **); 6590 6591 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6592 #ifdef CONFIG_RT_GROUP_SCHED 6593 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6594 ptr += nr_cpu_ids * sizeof(void **); 6595 6596 root_task_group.rt_rq = (struct rt_rq **)ptr; 6597 ptr += nr_cpu_ids * sizeof(void **); 6598 6599 #endif /* CONFIG_RT_GROUP_SCHED */ 6600 } 6601 #ifdef CONFIG_CPUMASK_OFFSTACK 6602 for_each_possible_cpu(i) { 6603 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 6604 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 6605 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( 6606 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 6607 } 6608 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6609 6610 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); 6611 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); 6612 6613 #ifdef CONFIG_SMP 6614 init_defrootdomain(); 6615 #endif 6616 6617 #ifdef CONFIG_RT_GROUP_SCHED 6618 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6619 global_rt_period(), global_rt_runtime()); 6620 #endif /* CONFIG_RT_GROUP_SCHED */ 6621 6622 #ifdef CONFIG_CGROUP_SCHED 6623 task_group_cache = KMEM_CACHE(task_group, 0); 6624 6625 list_add(&root_task_group.list, &task_groups); 6626 INIT_LIST_HEAD(&root_task_group.children); 6627 INIT_LIST_HEAD(&root_task_group.siblings); 6628 autogroup_init(&init_task); 6629 #endif /* CONFIG_CGROUP_SCHED */ 6630 6631 for_each_possible_cpu(i) { 6632 struct rq *rq; 6633 6634 rq = cpu_rq(i); 6635 raw_spin_lock_init(&rq->lock); 6636 rq->nr_running = 0; 6637 rq->calc_load_active = 0; 6638 rq->calc_load_update = jiffies + LOAD_FREQ; 6639 init_cfs_rq(&rq->cfs); 6640 init_rt_rq(&rq->rt); 6641 init_dl_rq(&rq->dl); 6642 #ifdef CONFIG_FAIR_GROUP_SCHED 6643 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6644 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6645 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; 6646 /* 6647 * How much CPU bandwidth does root_task_group get? 6648 * 6649 * In case of task-groups formed thr' the cgroup filesystem, it 6650 * gets 100% of the CPU resources in the system. This overall 6651 * system CPU resource is divided among the tasks of 6652 * root_task_group and its child task-groups in a fair manner, 6653 * based on each entity's (task or task-group's) weight 6654 * (se->load.weight). 
6655 * 6656 * In other words, if root_task_group has 10 tasks of weight 6657 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6658 * then A0's share of the CPU resource is: 6659 * 6660 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6661 * 6662 * We achieve this by letting root_task_group's tasks sit 6663 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6664 */ 6665 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6666 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6667 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6668 6669 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6670 #ifdef CONFIG_RT_GROUP_SCHED 6671 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6672 #endif 6673 #ifdef CONFIG_SMP 6674 rq->sd = NULL; 6675 rq->rd = NULL; 6676 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 6677 rq->balance_callback = NULL; 6678 rq->active_balance = 0; 6679 rq->next_balance = jiffies; 6680 rq->push_cpu = 0; 6681 rq->cpu = i; 6682 rq->online = 0; 6683 rq->idle_stamp = 0; 6684 rq->avg_idle = 2*sysctl_sched_migration_cost; 6685 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6686 6687 INIT_LIST_HEAD(&rq->cfs_tasks); 6688 6689 rq_attach_root(rq, &def_root_domain); 6690 #ifdef CONFIG_NO_HZ_COMMON 6691 rq->last_blocked_load_update_tick = jiffies; 6692 atomic_set(&rq->nohz_flags, 0); 6693 #endif 6694 #endif /* CONFIG_SMP */ 6695 hrtick_rq_init(rq); 6696 atomic_set(&rq->nr_iowait, 0); 6697 } 6698 6699 set_load_weight(&init_task, false); 6700 6701 /* 6702 * The boot idle thread does lazy MMU switching as well: 6703 */ 6704 mmgrab(&init_mm); 6705 enter_lazy_tlb(&init_mm, current); 6706 6707 /* 6708 * Make us the idle thread. Technically, schedule() should not be 6709 * called from this thread, however somewhere below it might be, 6710 * but because we are the idle thread, we just pick up running again 6711 * when this runqueue becomes "idle". 6712 */ 6713 init_idle(current, smp_processor_id()); 6714 6715 calc_load_update = jiffies + LOAD_FREQ; 6716 6717 #ifdef CONFIG_SMP 6718 idle_thread_set_boot_cpu(); 6719 #endif 6720 init_sched_fair_class(); 6721 6722 init_schedstats(); 6723 6724 psi_init(); 6725 6726 init_uclamp(); 6727 6728 scheduler_running = 1; 6729 } 6730 6731 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6732 static inline int preempt_count_equals(int preempt_offset) 6733 { 6734 int nested = preempt_count() + rcu_preempt_depth(); 6735 6736 return (nested == preempt_offset); 6737 } 6738 6739 void __might_sleep(const char *file, int line, int preempt_offset) 6740 { 6741 /* 6742 * Blocking primitives will set (and therefore destroy) current->state, 6743 * since we will exit with TASK_RUNNING make sure we enter with it, 6744 * otherwise we will destroy state. 
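 * current->task_state_change records the instruction pointer of the last
 * set_current_state()/__set_current_state() when CONFIG_DEBUG_ATOMIC_SLEEP
 * is enabled, which is what the warning below reports.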
6745 */ 6746 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 6747 "do not call blocking ops when !TASK_RUNNING; " 6748 "state=%lx set at [<%p>] %pS\n", 6749 current->state, 6750 (void *)current->task_state_change, 6751 (void *)current->task_state_change); 6752 6753 ___might_sleep(file, line, preempt_offset); 6754 } 6755 EXPORT_SYMBOL(__might_sleep); 6756 6757 void ___might_sleep(const char *file, int line, int preempt_offset) 6758 { 6759 /* Ratelimiting timestamp: */ 6760 static unsigned long prev_jiffy; 6761 6762 unsigned long preempt_disable_ip; 6763 6764 /* WARN_ON_ONCE() by default, no rate limit required: */ 6765 rcu_sleep_check(); 6766 6767 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6768 !is_idle_task(current) && !current->non_block_count) || 6769 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || 6770 oops_in_progress) 6771 return; 6772 6773 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6774 return; 6775 prev_jiffy = jiffies; 6776 6777 /* Save this before calling printk(), since that will clobber it: */ 6778 preempt_disable_ip = get_preempt_disable_ip(current); 6779 6780 printk(KERN_ERR 6781 "BUG: sleeping function called from invalid context at %s:%d\n", 6782 file, line); 6783 printk(KERN_ERR 6784 "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", 6785 in_atomic(), irqs_disabled(), current->non_block_count, 6786 current->pid, current->comm); 6787 6788 if (task_stack_end_corrupted(current)) 6789 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 6790 6791 debug_show_held_locks(current); 6792 if (irqs_disabled()) 6793 print_irqtrace_events(current); 6794 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) 6795 && !preempt_count_equals(preempt_offset)) { 6796 pr_err("Preemption disabled at:"); 6797 print_ip_sym(preempt_disable_ip); 6798 pr_cont("\n"); 6799 } 6800 dump_stack(); 6801 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 6802 } 6803 EXPORT_SYMBOL(___might_sleep); 6804 6805 void __cant_sleep(const char *file, int line, int preempt_offset) 6806 { 6807 static unsigned long prev_jiffy; 6808 6809 if (irqs_disabled()) 6810 return; 6811 6812 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) 6813 return; 6814 6815 if (preempt_count() > preempt_offset) 6816 return; 6817 6818 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6819 return; 6820 prev_jiffy = jiffies; 6821 6822 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); 6823 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6824 in_atomic(), irqs_disabled(), 6825 current->pid, current->comm); 6826 6827 debug_show_held_locks(current); 6828 dump_stack(); 6829 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 6830 } 6831 EXPORT_SYMBOL_GPL(__cant_sleep); 6832 #endif 6833 6834 #ifdef CONFIG_MAGIC_SYSRQ 6835 void normalize_rt_tasks(void) 6836 { 6837 struct task_struct *g, *p; 6838 struct sched_attr attr = { 6839 .sched_policy = SCHED_NORMAL, 6840 }; 6841 6842 read_lock(&tasklist_lock); 6843 for_each_process_thread(g, p) { 6844 /* 6845 * Only normalize user tasks: 6846 */ 6847 if (p->flags & PF_KTHREAD) 6848 continue; 6849 6850 p->se.exec_start = 0; 6851 schedstat_set(p->se.statistics.wait_start, 0); 6852 schedstat_set(p->se.statistics.sleep_start, 0); 6853 schedstat_set(p->se.statistics.block_start, 0); 6854 6855 if (!dl_task(p) && !rt_task(p)) { 6856 /* 6857 * Renice negative nice level userspace 6858 * tasks back to 0: 6859 */ 6860 if (task_nice(p) < 0) 6861 set_user_nice(p, 0); 6862 continue; 6863 } 6864 6865 
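		/*
		 * Demote the remaining RT/DL task to SCHED_NORMAL. The last
		 * two arguments ('user' and 'pi') are false: this is an
		 * in-kernel sysrq reset rather than a request from user
		 * space, and rt_mutex priority inheritance is not
		 * re-evaluated here.
		 */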
__sched_setscheduler(p, &attr, false, false); 6866 } 6867 read_unlock(&tasklist_lock); 6868 } 6869 6870 #endif /* CONFIG_MAGIC_SYSRQ */ 6871 6872 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 6873 /* 6874 * These functions are only useful for the IA64 MCA handling, or kdb. 6875 * 6876 * They can only be called when the whole system has been 6877 * stopped - every CPU needs to be quiescent, and no scheduling 6878 * activity can take place. Using them for anything else would 6879 * be a serious bug, and as a result, they aren't even visible 6880 * under any other configuration. 6881 */ 6882 6883 /** 6884 * curr_task - return the current task for a given CPU. 6885 * @cpu: the processor in question. 6886 * 6887 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6888 * 6889 * Return: The current task for @cpu. 6890 */ 6891 struct task_struct *curr_task(int cpu) 6892 { 6893 return cpu_curr(cpu); 6894 } 6895 6896 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 6897 6898 #ifdef CONFIG_IA64 6899 /** 6900 * ia64_set_curr_task - set the current task for a given CPU. 6901 * @cpu: the processor in question. 6902 * @p: the task pointer to set. 6903 * 6904 * Description: This function must only be used when non-maskable interrupts 6905 * are serviced on a separate stack. It allows the architecture to switch the 6906 * notion of the current task on a CPU in a non-blocking manner. This function 6907 * must be called with all CPU's synchronized, and interrupts disabled, the 6908 * and caller must save the original value of the current task (see 6909 * curr_task() above) and restore that value before reenabling interrupts and 6910 * re-starting the system. 6911 * 6912 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6913 */ 6914 void ia64_set_curr_task(int cpu, struct task_struct *p) 6915 { 6916 cpu_curr(cpu) = p; 6917 } 6918 6919 #endif 6920 6921 #ifdef CONFIG_CGROUP_SCHED 6922 /* task_group_lock serializes the addition/removal of task groups */ 6923 static DEFINE_SPINLOCK(task_group_lock); 6924 6925 static inline void alloc_uclamp_sched_group(struct task_group *tg, 6926 struct task_group *parent) 6927 { 6928 #ifdef CONFIG_UCLAMP_TASK_GROUP 6929 enum uclamp_id clamp_id; 6930 6931 for_each_clamp_id(clamp_id) { 6932 uclamp_se_set(&tg->uclamp_req[clamp_id], 6933 uclamp_none(clamp_id), false); 6934 tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; 6935 } 6936 #endif 6937 } 6938 6939 static void sched_free_group(struct task_group *tg) 6940 { 6941 free_fair_sched_group(tg); 6942 free_rt_sched_group(tg); 6943 autogroup_free(tg); 6944 kmem_cache_free(task_group_cache, tg); 6945 } 6946 6947 /* allocate runqueue etc for a new task group */ 6948 struct task_group *sched_create_group(struct task_group *parent) 6949 { 6950 struct task_group *tg; 6951 6952 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); 6953 if (!tg) 6954 return ERR_PTR(-ENOMEM); 6955 6956 if (!alloc_fair_sched_group(tg, parent)) 6957 goto err; 6958 6959 if (!alloc_rt_sched_group(tg, parent)) 6960 goto err; 6961 6962 alloc_uclamp_sched_group(tg, parent); 6963 6964 return tg; 6965 6966 err: 6967 sched_free_group(tg); 6968 return ERR_PTR(-ENOMEM); 6969 } 6970 6971 void sched_online_group(struct task_group *tg, struct task_group *parent) 6972 { 6973 unsigned long flags; 6974 6975 spin_lock_irqsave(&task_group_lock, flags); 6976 list_add_rcu(&tg->list, &task_groups); 6977 6978 /* Root should already exist: */ 6979 WARN_ON(!parent); 6980 6981 tg->parent = parent; 6982 INIT_LIST_HEAD(&tg->children); 6983 
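	/*
	 * Link the new group into the hierarchy under task_group_lock;
	 * the RCU list primitives let readers walk the group lists
	 * without taking the lock.
	 */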
list_add_rcu(&tg->siblings, &parent->children); 6984 spin_unlock_irqrestore(&task_group_lock, flags); 6985 6986 online_fair_sched_group(tg); 6987 } 6988 6989 /* rcu callback to free various structures associated with a task group */ 6990 static void sched_free_group_rcu(struct rcu_head *rhp) 6991 { 6992 /* Now it should be safe to free those cfs_rqs: */ 6993 sched_free_group(container_of(rhp, struct task_group, rcu)); 6994 } 6995 6996 void sched_destroy_group(struct task_group *tg) 6997 { 6998 /* Wait for possible concurrent references to cfs_rqs complete: */ 6999 call_rcu(&tg->rcu, sched_free_group_rcu); 7000 } 7001 7002 void sched_offline_group(struct task_group *tg) 7003 { 7004 unsigned long flags; 7005 7006 /* End participation in shares distribution: */ 7007 unregister_fair_sched_group(tg); 7008 7009 spin_lock_irqsave(&task_group_lock, flags); 7010 list_del_rcu(&tg->list); 7011 list_del_rcu(&tg->siblings); 7012 spin_unlock_irqrestore(&task_group_lock, flags); 7013 } 7014 7015 static void sched_change_group(struct task_struct *tsk, int type) 7016 { 7017 struct task_group *tg; 7018 7019 /* 7020 * All callers are synchronized by task_rq_lock(); we do not use RCU 7021 * which is pointless here. Thus, we pass "true" to task_css_check() 7022 * to prevent lockdep warnings. 7023 */ 7024 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 7025 struct task_group, css); 7026 tg = autogroup_task_group(tsk, tg); 7027 tsk->sched_task_group = tg; 7028 7029 #ifdef CONFIG_FAIR_GROUP_SCHED 7030 if (tsk->sched_class->task_change_group) 7031 tsk->sched_class->task_change_group(tsk, type); 7032 else 7033 #endif 7034 set_task_rq(tsk, task_cpu(tsk)); 7035 } 7036 7037 /* 7038 * Change task's runqueue when it moves between groups. 7039 * 7040 * The caller of this function should have put the task in its new group by 7041 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect 7042 * its new group. 7043 */ 7044 void sched_move_task(struct task_struct *tsk) 7045 { 7046 int queued, running, queue_flags = 7047 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7048 struct rq_flags rf; 7049 struct rq *rq; 7050 7051 rq = task_rq_lock(tsk, &rf); 7052 update_rq_clock(rq); 7053 7054 running = task_current(rq, tsk); 7055 queued = task_on_rq_queued(tsk); 7056 7057 if (queued) 7058 dequeue_task(rq, tsk, queue_flags); 7059 if (running) 7060 put_prev_task(rq, tsk); 7061 7062 sched_change_group(tsk, TASK_MOVE_GROUP); 7063 7064 if (queued) 7065 enqueue_task(rq, tsk, queue_flags); 7066 if (running) { 7067 set_next_task(rq, tsk); 7068 /* 7069 * After changing group, the running task may have joined a 7070 * throttled one but it's still the running task. Trigger a 7071 * resched to make sure that task can still run. 7072 */ 7073 resched_curr(rq); 7074 } 7075 7076 task_rq_unlock(rq, tsk, &rf); 7077 } 7078 7079 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7080 { 7081 return css ? 
container_of(css, struct task_group, css) : NULL; 7082 } 7083 7084 static struct cgroup_subsys_state * 7085 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 7086 { 7087 struct task_group *parent = css_tg(parent_css); 7088 struct task_group *tg; 7089 7090 if (!parent) { 7091 /* This is early initialization for the top cgroup */ 7092 return &root_task_group.css; 7093 } 7094 7095 tg = sched_create_group(parent); 7096 if (IS_ERR(tg)) 7097 return ERR_PTR(-ENOMEM); 7098 7099 return &tg->css; 7100 } 7101 7102 /* Expose task group only after completing cgroup initialization */ 7103 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7104 { 7105 struct task_group *tg = css_tg(css); 7106 struct task_group *parent = css_tg(css->parent); 7107 7108 if (parent) 7109 sched_online_group(tg, parent); 7110 7111 #ifdef CONFIG_UCLAMP_TASK_GROUP 7112 /* Propagate the effective uclamp value for the new group */ 7113 cpu_util_update_eff(css); 7114 #endif 7115 7116 return 0; 7117 } 7118 7119 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) 7120 { 7121 struct task_group *tg = css_tg(css); 7122 7123 sched_offline_group(tg); 7124 } 7125 7126 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 7127 { 7128 struct task_group *tg = css_tg(css); 7129 7130 /* 7131 * Relies on the RCU grace period between css_released() and this. 7132 */ 7133 sched_free_group(tg); 7134 } 7135 7136 /* 7137 * This is called before wake_up_new_task(), therefore we really only 7138 * have to set its group bits, all the other stuff does not apply. 7139 */ 7140 static void cpu_cgroup_fork(struct task_struct *task) 7141 { 7142 struct rq_flags rf; 7143 struct rq *rq; 7144 7145 rq = task_rq_lock(task, &rf); 7146 7147 update_rq_clock(rq); 7148 sched_change_group(task, TASK_SET_GROUP); 7149 7150 task_rq_unlock(rq, task, &rf); 7151 } 7152 7153 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 7154 { 7155 struct task_struct *task; 7156 struct cgroup_subsys_state *css; 7157 int ret = 0; 7158 7159 cgroup_taskset_for_each(task, css, tset) { 7160 #ifdef CONFIG_RT_GROUP_SCHED 7161 if (!sched_rt_can_attach(css_tg(css), task)) 7162 return -EINVAL; 7163 #endif 7164 /* 7165 * Serialize against wake_up_new_task() such that if its 7166 * running, we're sure to observe its full state. 7167 */ 7168 raw_spin_lock_irq(&task->pi_lock); 7169 /* 7170 * Avoid calling sched_move_task() before wake_up_new_task() 7171 * has happened. This would lead to problems with PELT, due to 7172 * move wanting to detach+attach while we're not attached yet. 7173 */ 7174 if (task->state == TASK_NEW) 7175 ret = -EINVAL; 7176 raw_spin_unlock_irq(&task->pi_lock); 7177 7178 if (ret) 7179 break; 7180 } 7181 return ret; 7182 } 7183 7184 static void cpu_cgroup_attach(struct cgroup_taskset *tset) 7185 { 7186 struct task_struct *task; 7187 struct cgroup_subsys_state *css; 7188 7189 cgroup_taskset_for_each(task, css, tset) 7190 sched_move_task(task); 7191 } 7192 7193 #ifdef CONFIG_UCLAMP_TASK_GROUP 7194 static void cpu_util_update_eff(struct cgroup_subsys_state *css) 7195 { 7196 struct cgroup_subsys_state *top_css = css; 7197 struct uclamp_se *uc_parent = NULL; 7198 struct uclamp_se *uc_se = NULL; 7199 unsigned int eff[UCLAMP_CNT]; 7200 enum uclamp_id clamp_id; 7201 unsigned int clamps; 7202 7203 css_for_each_descendant_pre(css, top_css) { 7204 uc_parent = css_tg(css)->parent 7205 ? 
css_tg(css)->parent->uclamp : NULL; 7206 7207 for_each_clamp_id(clamp_id) { 7208 /* Assume effective clamps matches requested clamps */ 7209 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; 7210 /* Cap effective clamps with parent's effective clamps */ 7211 if (uc_parent && 7212 eff[clamp_id] > uc_parent[clamp_id].value) { 7213 eff[clamp_id] = uc_parent[clamp_id].value; 7214 } 7215 } 7216 /* Ensure protection is always capped by limit */ 7217 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]); 7218 7219 /* Propagate most restrictive effective clamps */ 7220 clamps = 0x0; 7221 uc_se = css_tg(css)->uclamp; 7222 for_each_clamp_id(clamp_id) { 7223 if (eff[clamp_id] == uc_se[clamp_id].value) 7224 continue; 7225 uc_se[clamp_id].value = eff[clamp_id]; 7226 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); 7227 clamps |= (0x1 << clamp_id); 7228 } 7229 if (!clamps) { 7230 css = css_rightmost_descendant(css); 7231 continue; 7232 } 7233 7234 /* Immediately update descendants RUNNABLE tasks */ 7235 uclamp_update_active_tasks(css, clamps); 7236 } 7237 } 7238 7239 /* 7240 * Integer 10^N with a given N exponent by casting to integer the literal "1eN" 7241 * C expression. Since there is no way to convert a macro argument (N) into a 7242 * character constant, use two levels of macros. 7243 */ 7244 #define _POW10(exp) ((unsigned int)1e##exp) 7245 #define POW10(exp) _POW10(exp) 7246 7247 struct uclamp_request { 7248 #define UCLAMP_PERCENT_SHIFT 2 7249 #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT)) 7250 s64 percent; 7251 u64 util; 7252 int ret; 7253 }; 7254 7255 static inline struct uclamp_request 7256 capacity_from_percent(char *buf) 7257 { 7258 struct uclamp_request req = { 7259 .percent = UCLAMP_PERCENT_SCALE, 7260 .util = SCHED_CAPACITY_SCALE, 7261 .ret = 0, 7262 }; 7263 7264 buf = strim(buf); 7265 if (strcmp(buf, "max")) { 7266 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, 7267 &req.percent); 7268 if (req.ret) 7269 return req; 7270 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { 7271 req.ret = -ERANGE; 7272 return req; 7273 } 7274 7275 req.util = req.percent << SCHED_CAPACITY_SHIFT; 7276 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE); 7277 } 7278 7279 return req; 7280 } 7281 7282 static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, 7283 size_t nbytes, loff_t off, 7284 enum uclamp_id clamp_id) 7285 { 7286 struct uclamp_request req; 7287 struct task_group *tg; 7288 7289 req = capacity_from_percent(buf); 7290 if (req.ret) 7291 return req.ret; 7292 7293 mutex_lock(&uclamp_mutex); 7294 rcu_read_lock(); 7295 7296 tg = css_tg(of_css(of)); 7297 if (tg->uclamp_req[clamp_id].value != req.util) 7298 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); 7299 7300 /* 7301 * Because of not recoverable conversion rounding we keep track of the 7302 * exact requested value 7303 */ 7304 tg->uclamp_pct[clamp_id] = req.percent; 7305 7306 /* Update effective clamps to track the most restrictive value */ 7307 cpu_util_update_eff(of_css(of)); 7308 7309 rcu_read_unlock(); 7310 mutex_unlock(&uclamp_mutex); 7311 7312 return nbytes; 7313 } 7314 7315 static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, 7316 char *buf, size_t nbytes, 7317 loff_t off) 7318 { 7319 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN); 7320 } 7321 7322 static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, 7323 char *buf, size_t nbytes, 7324 loff_t off) 7325 { 7326 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX); 7327 } 7328 
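/*
 * Worked example (illustrative only, not used by the code): with
 * UCLAMP_PERCENT_SHIFT == 2 and SCHED_CAPACITY_SHIFT == 10, writing the
 * string "75.25" to cpu.uclamp.min travels through capacity_from_percent()
 * above as follows:
 *
 *	cgroup_parse_float("75.25", 2, &percent)   ->  percent = 7525
 *	util = 7525 << SCHED_CAPACITY_SHIFT        ->  7705600
 *	util = DIV_ROUND_CLOSEST_ULL(7705600, UCLAMP_PERCENT_SCALE)
 *	                                           ->  771 (~75.25% of 1024)
 *
 * The exact percentage (7525) is kept in tg->uclamp_pct[] so that
 * cpu_uclamp_print() below can report "75.25" back to user space, even
 * though the utilization value itself was rounded.
 */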
7329 static inline void cpu_uclamp_print(struct seq_file *sf, 7330 enum uclamp_id clamp_id) 7331 { 7332 struct task_group *tg; 7333 u64 util_clamp; 7334 u64 percent; 7335 u32 rem; 7336 7337 rcu_read_lock(); 7338 tg = css_tg(seq_css(sf)); 7339 util_clamp = tg->uclamp_req[clamp_id].value; 7340 rcu_read_unlock(); 7341 7342 if (util_clamp == SCHED_CAPACITY_SCALE) { 7343 seq_puts(sf, "max\n"); 7344 return; 7345 } 7346 7347 percent = tg->uclamp_pct[clamp_id]; 7348 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem); 7349 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem); 7350 } 7351 7352 static int cpu_uclamp_min_show(struct seq_file *sf, void *v) 7353 { 7354 cpu_uclamp_print(sf, UCLAMP_MIN); 7355 return 0; 7356 } 7357 7358 static int cpu_uclamp_max_show(struct seq_file *sf, void *v) 7359 { 7360 cpu_uclamp_print(sf, UCLAMP_MAX); 7361 return 0; 7362 } 7363 #endif /* CONFIG_UCLAMP_TASK_GROUP */ 7364 7365 #ifdef CONFIG_FAIR_GROUP_SCHED 7366 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 7367 struct cftype *cftype, u64 shareval) 7368 { 7369 if (shareval > scale_load_down(ULONG_MAX)) 7370 shareval = MAX_SHARES; 7371 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 7372 } 7373 7374 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 7375 struct cftype *cft) 7376 { 7377 struct task_group *tg = css_tg(css); 7378 7379 return (u64) scale_load_down(tg->shares); 7380 } 7381 7382 #ifdef CONFIG_CFS_BANDWIDTH 7383 static DEFINE_MUTEX(cfs_constraints_mutex); 7384 7385 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7386 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7387 7388 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7389 7390 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7391 { 7392 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7393 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7394 7395 if (tg == &root_task_group) 7396 return -EINVAL; 7397 7398 /* 7399 * Ensure we have at least some amount of bandwidth every period. This is 7400 * to prevent reaching a state of large arrears when throttled via 7401 * entity_tick() resulting in prolonged exit starvation. 7402 */ 7403 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7404 return -EINVAL; 7405 7406 /* 7407 * Likewise, bound things on the other side by preventing insane quota 7408 * periods. This also allows us to normalize in computing quota 7409 * feasibility. 7410 */ 7411 if (period > max_cfs_quota_period) 7412 return -EINVAL; 7413 7414 /* 7415 * Prevent race between setting of cfs_rq->runtime_enabled and 7416 * unthrottle_offline_cfs_rqs().
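 * get_online_cpus() keeps the set of online CPUs stable while the
 * per-CPU runtime_enabled flags are updated in the loop below.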
7417 */ 7418 get_online_cpus(); 7419 mutex_lock(&cfs_constraints_mutex); 7420 ret = __cfs_schedulable(tg, period, quota); 7421 if (ret) 7422 goto out_unlock; 7423 7424 runtime_enabled = quota != RUNTIME_INF; 7425 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7426 /* 7427 * If we need to toggle cfs_bandwidth_used, off->on must occur 7428 * before making related changes, and on->off must occur afterwards 7429 */ 7430 if (runtime_enabled && !runtime_was_enabled) 7431 cfs_bandwidth_usage_inc(); 7432 raw_spin_lock_irq(&cfs_b->lock); 7433 cfs_b->period = ns_to_ktime(period); 7434 cfs_b->quota = quota; 7435 7436 __refill_cfs_bandwidth_runtime(cfs_b); 7437 7438 /* Restart the period timer (if active) to handle new period expiry: */ 7439 if (runtime_enabled) 7440 start_cfs_bandwidth(cfs_b); 7441 7442 raw_spin_unlock_irq(&cfs_b->lock); 7443 7444 for_each_online_cpu(i) { 7445 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7446 struct rq *rq = cfs_rq->rq; 7447 struct rq_flags rf; 7448 7449 rq_lock_irq(rq, &rf); 7450 cfs_rq->runtime_enabled = runtime_enabled; 7451 cfs_rq->runtime_remaining = 0; 7452 7453 if (cfs_rq->throttled) 7454 unthrottle_cfs_rq(cfs_rq); 7455 rq_unlock_irq(rq, &rf); 7456 } 7457 if (runtime_was_enabled && !runtime_enabled) 7458 cfs_bandwidth_usage_dec(); 7459 out_unlock: 7460 mutex_unlock(&cfs_constraints_mutex); 7461 put_online_cpus(); 7462 7463 return ret; 7464 } 7465 7466 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7467 { 7468 u64 quota, period; 7469 7470 period = ktime_to_ns(tg->cfs_bandwidth.period); 7471 if (cfs_quota_us < 0) 7472 quota = RUNTIME_INF; 7473 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) 7474 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7475 else 7476 return -EINVAL; 7477 7478 return tg_set_cfs_bandwidth(tg, period, quota); 7479 } 7480 7481 static long tg_get_cfs_quota(struct task_group *tg) 7482 { 7483 u64 quota_us; 7484 7485 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7486 return -1; 7487 7488 quota_us = tg->cfs_bandwidth.quota; 7489 do_div(quota_us, NSEC_PER_USEC); 7490 7491 return quota_us; 7492 } 7493 7494 static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7495 { 7496 u64 quota, period; 7497 7498 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) 7499 return -EINVAL; 7500 7501 period = (u64)cfs_period_us * NSEC_PER_USEC; 7502 quota = tg->cfs_bandwidth.quota; 7503 7504 return tg_set_cfs_bandwidth(tg, period, quota); 7505 } 7506 7507 static long tg_get_cfs_period(struct task_group *tg) 7508 { 7509 u64 cfs_period_us; 7510 7511 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7512 do_div(cfs_period_us, NSEC_PER_USEC); 7513 7514 return cfs_period_us; 7515 } 7516 7517 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 7518 struct cftype *cft) 7519 { 7520 return tg_get_cfs_quota(css_tg(css)); 7521 } 7522 7523 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 7524 struct cftype *cftype, s64 cfs_quota_us) 7525 { 7526 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 7527 } 7528 7529 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 7530 struct cftype *cft) 7531 { 7532 return tg_get_cfs_period(css_tg(css)); 7533 } 7534 7535 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 7536 struct cftype *cftype, u64 cfs_period_us) 7537 { 7538 return tg_set_cfs_period(css_tg(css), cfs_period_us); 7539 } 7540 7541 struct cfs_schedulable_data { 7542 struct task_group *tg; 7543 u64 period, quota; 7544 }; 7545 7546 /* 7547 * normalize group quota/period to 
be quota/max_period 7548 * note: units are usecs 7549 */ 7550 static u64 normalize_cfs_quota(struct task_group *tg, 7551 struct cfs_schedulable_data *d) 7552 { 7553 u64 quota, period; 7554 7555 if (tg == d->tg) { 7556 period = d->period; 7557 quota = d->quota; 7558 } else { 7559 period = tg_get_cfs_period(tg); 7560 quota = tg_get_cfs_quota(tg); 7561 } 7562 7563 /* note: these should typically be equivalent */ 7564 if (quota == RUNTIME_INF || quota == -1) 7565 return RUNTIME_INF; 7566 7567 return to_ratio(period, quota); 7568 } 7569 7570 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7571 { 7572 struct cfs_schedulable_data *d = data; 7573 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7574 s64 quota = 0, parent_quota = -1; 7575 7576 if (!tg->parent) { 7577 quota = RUNTIME_INF; 7578 } else { 7579 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7580 7581 quota = normalize_cfs_quota(tg, d); 7582 parent_quota = parent_b->hierarchical_quota; 7583 7584 /* 7585 * Ensure max(child_quota) <= parent_quota. On cgroup2, 7586 * always take the min. On cgroup1, only inherit when no 7587 * limit is set: 7588 */ 7589 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { 7590 quota = min(quota, parent_quota); 7591 } else { 7592 if (quota == RUNTIME_INF) 7593 quota = parent_quota; 7594 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7595 return -EINVAL; 7596 } 7597 } 7598 cfs_b->hierarchical_quota = quota; 7599 7600 return 0; 7601 } 7602 7603 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7604 { 7605 int ret; 7606 struct cfs_schedulable_data data = { 7607 .tg = tg, 7608 .period = period, 7609 .quota = quota, 7610 }; 7611 7612 if (quota != RUNTIME_INF) { 7613 do_div(data.period, NSEC_PER_USEC); 7614 do_div(data.quota, NSEC_PER_USEC); 7615 } 7616 7617 rcu_read_lock(); 7618 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7619 rcu_read_unlock(); 7620 7621 return ret; 7622 } 7623 7624 static int cpu_cfs_stat_show(struct seq_file *sf, void *v) 7625 { 7626 struct task_group *tg = css_tg(seq_css(sf)); 7627 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7628 7629 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 7630 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 7631 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 7632 7633 if (schedstat_enabled() && tg != &root_task_group) { 7634 u64 ws = 0; 7635 int i; 7636 7637 for_each_possible_cpu(i) 7638 ws += schedstat_val(tg->se[i]->statistics.wait_sum); 7639 7640 seq_printf(sf, "wait_sum %llu\n", ws); 7641 } 7642 7643 return 0; 7644 } 7645 #endif /* CONFIG_CFS_BANDWIDTH */ 7646 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7647 7648 #ifdef CONFIG_RT_GROUP_SCHED 7649 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 7650 struct cftype *cft, s64 val) 7651 { 7652 return sched_group_set_rt_runtime(css_tg(css), val); 7653 } 7654 7655 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 7656 struct cftype *cft) 7657 { 7658 return sched_group_rt_runtime(css_tg(css)); 7659 } 7660 7661 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 7662 struct cftype *cftype, u64 rt_period_us) 7663 { 7664 return sched_group_set_rt_period(css_tg(css), rt_period_us); 7665 } 7666 7667 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 7668 struct cftype *cft) 7669 { 7670 return sched_group_rt_period(css_tg(css)); 7671 } 7672 #endif /* CONFIG_RT_GROUP_SCHED */ 7673 7674 static struct cftype cpu_legacy_files[] = { 7675 #ifdef 
CONFIG_FAIR_GROUP_SCHED 7676 { 7677 .name = "shares", 7678 .read_u64 = cpu_shares_read_u64, 7679 .write_u64 = cpu_shares_write_u64, 7680 }, 7681 #endif 7682 #ifdef CONFIG_CFS_BANDWIDTH 7683 { 7684 .name = "cfs_quota_us", 7685 .read_s64 = cpu_cfs_quota_read_s64, 7686 .write_s64 = cpu_cfs_quota_write_s64, 7687 }, 7688 { 7689 .name = "cfs_period_us", 7690 .read_u64 = cpu_cfs_period_read_u64, 7691 .write_u64 = cpu_cfs_period_write_u64, 7692 }, 7693 { 7694 .name = "stat", 7695 .seq_show = cpu_cfs_stat_show, 7696 }, 7697 #endif 7698 #ifdef CONFIG_RT_GROUP_SCHED 7699 { 7700 .name = "rt_runtime_us", 7701 .read_s64 = cpu_rt_runtime_read, 7702 .write_s64 = cpu_rt_runtime_write, 7703 }, 7704 { 7705 .name = "rt_period_us", 7706 .read_u64 = cpu_rt_period_read_uint, 7707 .write_u64 = cpu_rt_period_write_uint, 7708 }, 7709 #endif 7710 #ifdef CONFIG_UCLAMP_TASK_GROUP 7711 { 7712 .name = "uclamp.min", 7713 .flags = CFTYPE_NOT_ON_ROOT, 7714 .seq_show = cpu_uclamp_min_show, 7715 .write = cpu_uclamp_min_write, 7716 }, 7717 { 7718 .name = "uclamp.max", 7719 .flags = CFTYPE_NOT_ON_ROOT, 7720 .seq_show = cpu_uclamp_max_show, 7721 .write = cpu_uclamp_max_write, 7722 }, 7723 #endif 7724 { } /* Terminate */ 7725 }; 7726 7727 static int cpu_extra_stat_show(struct seq_file *sf, 7728 struct cgroup_subsys_state *css) 7729 { 7730 #ifdef CONFIG_CFS_BANDWIDTH 7731 { 7732 struct task_group *tg = css_tg(css); 7733 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7734 u64 throttled_usec; 7735 7736 throttled_usec = cfs_b->throttled_time; 7737 do_div(throttled_usec, NSEC_PER_USEC); 7738 7739 seq_printf(sf, "nr_periods %d\n" 7740 "nr_throttled %d\n" 7741 "throttled_usec %llu\n", 7742 cfs_b->nr_periods, cfs_b->nr_throttled, 7743 throttled_usec); 7744 } 7745 #endif 7746 return 0; 7747 } 7748 7749 #ifdef CONFIG_FAIR_GROUP_SCHED 7750 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, 7751 struct cftype *cft) 7752 { 7753 struct task_group *tg = css_tg(css); 7754 u64 weight = scale_load_down(tg->shares); 7755 7756 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); 7757 } 7758 7759 static int cpu_weight_write_u64(struct cgroup_subsys_state *css, 7760 struct cftype *cft, u64 weight) 7761 { 7762 /* 7763 * cgroup weight knobs should use the common MIN, DFL and MAX 7764 * values which are 1, 100 and 10000 respectively. While it loses 7765 * a bit of range on both ends, it maps pretty well onto the shares 7766 * value used by scheduler and the round-trip conversions preserve 7767 * the original value over the entire range. 
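 * For example, the default cgroup weight of 100 maps to the default
 * shares value of 1024 below, and cpu_weight_read_u64() converts 1024
 * back to exactly 100.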
7768 */ 7769 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) 7770 return -ERANGE; 7771 7772 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); 7773 7774 return sched_group_set_shares(css_tg(css), scale_load(weight)); 7775 } 7776 7777 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, 7778 struct cftype *cft) 7779 { 7780 unsigned long weight = scale_load_down(css_tg(css)->shares); 7781 int last_delta = INT_MAX; 7782 int prio, delta; 7783 7784 /* find the closest nice value to the current weight */ 7785 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { 7786 delta = abs(sched_prio_to_weight[prio] - weight); 7787 if (delta >= last_delta) 7788 break; 7789 last_delta = delta; 7790 } 7791 7792 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); 7793 } 7794 7795 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, 7796 struct cftype *cft, s64 nice) 7797 { 7798 unsigned long weight; 7799 int idx; 7800 7801 if (nice < MIN_NICE || nice > MAX_NICE) 7802 return -ERANGE; 7803 7804 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; 7805 idx = array_index_nospec(idx, 40); 7806 weight = sched_prio_to_weight[idx]; 7807 7808 return sched_group_set_shares(css_tg(css), scale_load(weight)); 7809 } 7810 #endif 7811 7812 static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, 7813 long period, long quota) 7814 { 7815 if (quota < 0) 7816 seq_puts(sf, "max"); 7817 else 7818 seq_printf(sf, "%ld", quota); 7819 7820 seq_printf(sf, " %ld\n", period); 7821 } 7822 7823 /* caller should put the current value in *@periodp before calling */ 7824 static int __maybe_unused cpu_period_quota_parse(char *buf, 7825 u64 *periodp, u64 *quotap) 7826 { 7827 char tok[21]; /* U64_MAX */ 7828 7829 if (sscanf(buf, "%20s %llu", tok, periodp) < 1) 7830 return -EINVAL; 7831 7832 *periodp *= NSEC_PER_USEC; 7833 7834 if (sscanf(tok, "%llu", quotap)) 7835 *quotap *= NSEC_PER_USEC; 7836 else if (!strcmp(tok, "max")) 7837 *quotap = RUNTIME_INF; 7838 else 7839 return -EINVAL; 7840 7841 return 0; 7842 } 7843 7844 #ifdef CONFIG_CFS_BANDWIDTH 7845 static int cpu_max_show(struct seq_file *sf, void *v) 7846 { 7847 struct task_group *tg = css_tg(seq_css(sf)); 7848 7849 cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); 7850 return 0; 7851 } 7852 7853 static ssize_t cpu_max_write(struct kernfs_open_file *of, 7854 char *buf, size_t nbytes, loff_t off) 7855 { 7856 struct task_group *tg = css_tg(of_css(of)); 7857 u64 period = tg_get_cfs_period(tg); 7858 u64 quota; 7859 int ret; 7860 7861 ret = cpu_period_quota_parse(buf, &period, &quota); 7862 if (!ret) 7863 ret = tg_set_cfs_bandwidth(tg, period, quota); 7864 return ret ?: nbytes; 7865 } 7866 #endif 7867 7868 static struct cftype cpu_files[] = { 7869 #ifdef CONFIG_FAIR_GROUP_SCHED 7870 { 7871 .name = "weight", 7872 .flags = CFTYPE_NOT_ON_ROOT, 7873 .read_u64 = cpu_weight_read_u64, 7874 .write_u64 = cpu_weight_write_u64, 7875 }, 7876 { 7877 .name = "weight.nice", 7878 .flags = CFTYPE_NOT_ON_ROOT, 7879 .read_s64 = cpu_weight_nice_read_s64, 7880 .write_s64 = cpu_weight_nice_write_s64, 7881 }, 7882 #endif 7883 #ifdef CONFIG_CFS_BANDWIDTH 7884 { 7885 .name = "max", 7886 .flags = CFTYPE_NOT_ON_ROOT, 7887 .seq_show = cpu_max_show, 7888 .write = cpu_max_write, 7889 }, 7890 #endif 7891 #ifdef CONFIG_UCLAMP_TASK_GROUP 7892 { 7893 .name = "uclamp.min", 7894 .flags = CFTYPE_NOT_ON_ROOT, 7895 .seq_show = cpu_uclamp_min_show, 7896 .write = cpu_uclamp_min_write, 7897 }, 7898 { 7899 .name = "uclamp.max", 7900 .flags =
CFTYPE_NOT_ON_ROOT, 7901 .seq_show = cpu_uclamp_max_show, 7902 .write = cpu_uclamp_max_write, 7903 }, 7904 #endif 7905 { } /* terminate */ 7906 }; 7907 7908 struct cgroup_subsys cpu_cgrp_subsys = { 7909 .css_alloc = cpu_cgroup_css_alloc, 7910 .css_online = cpu_cgroup_css_online, 7911 .css_released = cpu_cgroup_css_released, 7912 .css_free = cpu_cgroup_css_free, 7913 .css_extra_stat_show = cpu_extra_stat_show, 7914 .fork = cpu_cgroup_fork, 7915 .can_attach = cpu_cgroup_can_attach, 7916 .attach = cpu_cgroup_attach, 7917 .legacy_cftypes = cpu_legacy_files, 7918 .dfl_cftypes = cpu_files, 7919 .early_init = true, 7920 .threaded = true, 7921 }; 7922 7923 #endif /* CONFIG_CGROUP_SCHED */ 7924 7925 void dump_cpu_task(int cpu) 7926 { 7927 pr_info("Task dump for CPU %d:\n", cpu); 7928 sched_show_task(cpu_curr(cpu)); 7929 } 7930 7931 /* 7932 * Nice levels are multiplicative, with a gentle 10% change for every 7933 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 7934 * nice 1, it will get ~10% less CPU time than another CPU-bound task 7935 * that remained on nice 0. 7936 * 7937 * The "10% effect" is relative and cumulative: from _any_ nice level, 7938 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 7939 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 7940 * If a task goes up by ~10% and another task goes down by ~10% then 7941 * the relative distance between them is ~25%.) 7942 */ 7943 const int sched_prio_to_weight[40] = { 7944 /* -20 */ 88761, 71755, 56483, 46273, 36291, 7945 /* -15 */ 29154, 23254, 18705, 14949, 11916, 7946 /* -10 */ 9548, 7620, 6100, 4904, 3906, 7947 /* -5 */ 3121, 2501, 1991, 1586, 1277, 7948 /* 0 */ 1024, 820, 655, 526, 423, 7949 /* 5 */ 335, 272, 215, 172, 137, 7950 /* 10 */ 110, 87, 70, 56, 45, 7951 /* 15 */ 36, 29, 23, 18, 15, 7952 }; 7953 7954 /* 7955 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. 7956 * 7957 * In cases where the weight does not change often, we can use the 7958 * precalculated inverse to speed up arithmetics by turning divisions 7959 * into multiplications: 7960 */ 7961 const u32 sched_prio_to_wmult[40] = { 7962 /* -20 */ 48388, 59856, 76040, 92818, 118348, 7963 /* -15 */ 147320, 184698, 229616, 287308, 360437, 7964 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 7965 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 7966 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 7967 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 7968 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7969 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7970 }; 7971 7972 #undef CREATE_TRACE_POINTS 7973
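/*
 * Illustrative sketch (not part of the kernel build): a small userspace
 * program checking the two properties documented above for a few entries
 * copied from sched_prio_to_weight[] and sched_prio_to_wmult[] -- adjacent
 * nice levels differ by roughly a factor of 1.25, and each wmult entry is
 * approximately 2^32 divided by the corresponding weight.
 *
 *	#include <stdio.h>
 *
 *	// Entries for nice 0..4, copied from the tables above.
 *	static const int weight[5] = { 1024, 820, 655, 526, 423 };
 *	static const unsigned int wmult[5] = {
 *		4194304, 5237765, 6557202, 8165337, 10153587
 *	};
 *
 *	int main(void)
 *	{
 *		int i;
 *
 *		// Each step down in priority costs ~20% weight (ratio ~1.25).
 *		for (i = 0; i + 1 < 5; i++)
 *			printf("nice %d -> %d: ratio %.3f\n", i, i + 1,
 *			       (double)weight[i] / weight[i + 1]);
 *
 *		// wmult[i] is the precomputed inverse 2^32 / weight[i].
 *		for (i = 0; i < 5; i++)
 *			printf("nice %d: 2^32/weight = %.0f, wmult = %u\n",
 *			       i, 4294967296.0 / weight[i], wmult[i]);
 *		return 0;
 *	}
 */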