/*
 * kernel/sched/core.c
 *
 * Core kernel scheduler code and related syscalls
 *
 * Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/sched.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>

#include <linux/blkdev.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * Period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * Period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * Part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock, the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock, the acquire will
		 * pair with the WMB to ensure we must then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight misattribution of {soft,}irq
	 * time; a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	rq->clock_update_flags |= RQCF_UPDATED;
#endif
	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}


#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
		_old;							\
	})

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task; if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_q().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU. This is good for power-savings.
 *
 * We don't do a similar optimization for a completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be up to date wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!is_housekeeping_cpu(cpu))
		cpu = housekeeping_any_cpu();
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU. If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run the Idle Load Balance on this CPU for this time, so
	 * cancel it and clear NOHZ_BALANCE_KICK.
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick: */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there is more than one RR task, we need the tick to effect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there are no RR tasks, but there are FIFO tasks, we can skip the
	 * tick: there is no forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL, RR, or FIFO tasks, there must only be CFS tasks
	 * left; if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (idle_policy(p->policy)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(sched_prio_to_weight[prio]);
	load->inv_weight = sched_prio_to_wmult[prio];
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get
		 * confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are an RT task or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * This means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back-to-back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	dequeue_task(rq, p, 0);
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);

	rq = cpu_rq(new_cpu);

	raw_spin_lock(&rq->lock);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
	if (unlikely(!cpu_active(dest_cpu)))
		return rq;

	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		return rq;

	rq = move_queued_task(rq, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&rq->lock);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock; if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);
	if (running)
		set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);

	if (p->flags & PF_KTHREAD) {
		/*
		 * Kernel threads are allowed on online && !active CPUs
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	if (p->flags & PF_KTHREAD) {
		/*
		 * For kernel threads that do indeed end up on online &&
		 * !active we want to ensure they are strict per-CPU threads.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
	if (task_running(rq, p) || p->state == TASK_WAKING) {
		struct migration_arg arg = { p, dest_cpu };

		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	} else if (task_on_rq_queued(p)) {
		/*
		 * OK, since we're going to drop the lock immediately
		 * afterwards anyway.
		 */
		rq_unpin_lock(rq, &rf);
		rq = move_queued_task(rq, p, dest_cpu);
		rq_repin_lock(rq, &rf);
	}
out:
	task_rq_unlock(rq, p, &rf);

	return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(p->state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p);
		p->se.nr_migrations++;
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		p->on_rq = TASK_ON_RQ_MIGRATING;
		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		p->on_rq = TASK_ON_RQ_QUEUED;
		check_preempt_curr(dst_rq, p, 0);
	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;
	int ret = -EAGAIN;

	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
		return -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	double_raw_lock(&arg->src_task->pi_lock,
			&arg->dst_task->pi_lock);
	double_rq_lock(src_rq, dst_rq);

	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		goto unlock;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		goto unlock;

	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
		goto unlock;

	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
		goto unlock;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	ret = 0;

unlock:
	double_rq_unlock(src_rq, dst_rq);
	raw_spin_unlock(&arg->dst_task->pi_lock);
	raw_spin_unlock(&arg->src_task->pi_lock);

	return ret;
}

/*
 * Cross migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = task_cpu(cur),
		.dst_task = p,
		.dst_cpu = task_cpu(p),
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
		goto out;

	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
 * not expected to change. If it changes, i.e. @p might have woken up,
 * then return zero. When we succeed in waiting for @p to be off its CPU,
 * we return a positive number (its total switch count). If a second call
 * a short while later returns the same number, the caller can be sure that
 * @p has remained unscheduled the whole time.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
	int running, queued;
	struct rq_flags rf;
	unsigned long ncsw;
	struct rq *rq;

	for (;;) {
		/*
		 * We do the initial early heuristics without holding
		 * any task-queue locks at all. We'll only try to get
		 * the runqueue lock when things look like they will
		 * work out!
		 */
		rq = task_rq(p);

		/*
		 * If the task is actively running on another CPU
		 * still, just relax and busy-wait without holding
		 * any locks.
		 *
		 * NOTE! Since we don't hold any locks, it's not
		 * even sure that "rq" stays as the right runqueue!
		 * But we don't care, since "task_running()" will
		 * return false if the runqueue has changed and p
		 * is actually now running somewhere else!
		 */
		while (task_running(rq, p)) {
			if (match_state && unlikely(p->state != match_state))
				return 0;
			cpu_relax();
		}

		/*
		 * Ok, time to look more closely! We need the rq
		 * lock now, to be *sure*. If we're wrong, we'll
		 * just go back and repeat.
		 */
		rq = task_rq_lock(p, &rf);
		trace_sched_wait_task(p);
		running = task_running(rq, p);
		queued = task_on_rq_queued(p);
		ncsw = 0;
		if (!match_state || p->state == match_state)
			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
		task_rq_unlock(rq, p, &rf);

		/*
		 * If it changed from the expected state, bail out now.
		 */
		if (unlikely(!ncsw))
			break;

		/*
		 * Was it really running after all now that we
		 * checked with the proper locks actually held?
		 *
		 * Oops. Go back and try again..
		 */
		if (unlikely(running)) {
			cpu_relax();
			continue;
		}

		/*
		 * It's not enough that it's not actively running,
		 * it must be off the runqueue _entirely_, and not
		 * preempted!
		 *
		 * So if it was still runnable (but just not actively
		 * running right now), it's preempted, and we should
		 * yield - it could be a while.
		 */
		if (unlikely(queued)) {
			ktime_t to = NSEC_PER_SEC / HZ;

			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
			continue;
		}

		/*
		 * Ahh, all good. It wasn't running, and it wasn't
		 * runnable, which means that it will never become
		 * running in the future either. We're all done!
		 */
		break;
	}

	return ncsw;
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

/*
 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 *
 * A few notes on cpu_active vs cpu_online:
 *
 *  - cpu_active must be a subset of cpu_online
 *
 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *    see __set_cpus_allowed_ptr(). At this point the newly online
 *    CPU isn't yet part of the sched domains, and balancing will not
 *    see it.
 *
 *  - on CPU-down we clear cpu_active() to mask the sched domains and
 *    keep the load balancer from placing new tasks on the to-be-removed
 *    CPU.
 *    Existing tasks will remain running there and will be taken
 *    off.
 *
 * This means that fallback selection must not select !active CPUs.
 * And can assume that any active CPU must be online. Conversely
 * select_task_rq() below may allow selection of !active CPUs in order
 * to satisfy the above rules.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the CPU is on has been offlined, cpu_to_node()
	 * will return -1. There is no CPU on the node, and we should
	 * select a CPU on another node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_active(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
				continue;
			if (!cpu_online(dest_cpu))
				continue;
			goto out;
		}

		/* No more Mr. Nice Guy. */
		switch (state) {
		case cpuset:
			if (IS_ENABLED(CONFIG_CPUSETS)) {
				cpuset_cpus_allowed_fallback(p);
				state = possible;
				break;
			}
			/* Fall-through */
		case possible:
			do_set_cpus_allowed(p, cpu_possible_mask);
			state = fail;
			break;

		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (tsk_nr_cpus_allowed(p) > 1)
		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
	else
		cpu = cpumask_any(tsk_cpus_allowed(p));

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
	 * CPU.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;
	*avg += diff >> 3;
}

#else

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 const struct cpumask *new_mask, bool check)
{
	return set_cpus_allowed_ptr(p, new_mask);
}

#endif /* CONFIG_SMP */

static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq;

	if (!schedstat_enabled())
		return;

	rq = this_rq();

#ifdef CONFIG_SMP
	if (cpu == rq->cpu) {
		schedstat_inc(rq->ttwu_local);
		schedstat_inc(p->se.statistics.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		schedstat_inc(p->se.statistics.nr_wakeups_remote);
		rcu_read_lock();
		for_each_domain(rq->cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd->ttwu_wake_remote);
				break;
			}
		}
		rcu_read_unlock();
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */

	schedstat_inc(rq->ttwu_count);
	schedstat_inc(p->se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p->se.statistics.nr_wakeups_sync);
}

static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
	activate_task(rq, p, en_flags);
	p->on_rq = TASK_ON_RQ_QUEUED;

	/* If a worker is waking up, notify the workqueue: */
	if (p->flags & PF_WQ_WORKER)
		wq_worker_waking_up(p, cpu_of(rq));
}

/*
 * Mark the task runnable and perform wakeup-preemption.
 */
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
			   struct rq_flags *rf)
{
	check_preempt_curr(rq, p, wake_flags);
	p->state = TASK_RUNNING;
	trace_sched_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Our task @p is fully woken up and running; so it's safe to
		 * drop the rq->lock, hereafter rq is only used for statistics.
		 */
		rq_unpin_lock(rq, rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, rf);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->idle_stamp = 0;
	}
#endif
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		 struct rq_flags *rf)
{
	int en_flags = ENQUEUE_WAKEUP;

	lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;

	if (wake_flags & WF_MIGRATED)
		en_flags |= ENQUEUE_MIGRATED;
#endif

	ttwu_activate(rq, p, en_flags);
	ttwu_do_wakeup(rq, p, wake_flags, rf);
}

/*
 * Called in case the task @p isn't fully descheduled from its runqueue;
 * in this case we must do a remote wakeup.
 * It's a 'light' wakeup though: since the task is still ->on_rq, all we
 * need to do is flip p->state to TASK_RUNNING.
 */
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p, &rf);
	if (task_on_rq_queued(p)) {
		/* check_preempt_curr() may use rq clock: */
		update_rq_clock(rq);
		ttwu_do_wakeup(rq, p, wake_flags, &rf);
		ret = 1;
	}
	__task_rq_unlock(rq, &rf);

	return ret;
}

#ifdef CONFIG_SMP
void sched_ttwu_pending(void)
{
	struct rq *rq = this_rq();
	struct llist_node *llist = llist_del_all(&rq->wake_list);
	struct task_struct *p;
	unsigned long flags;
	struct rq_flags rf;

	if (!llist)
		return;

	raw_spin_lock_irqsave(&rq->lock, flags);
	rq_pin_lock(rq, &rf);

	while (llist) {
		int wake_flags = 0;

		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);

		if (p->sched_remote_wakeup)
			wake_flags = WF_MIGRATED;

		ttwu_do_activate(rq, p, wake_flags, &rf);
	}

	rq_unpin_lock(rq, &rf);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

void scheduler_ipi(void)
{
	/*
	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
	 * TIF_NEED_RESCHED remotely (for the first time) will also send
	 * this IPI.
	 */
	preempt_fold_need_resched();

	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
		return;

	/*
	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
	 * traditionally all their work was done from the interrupt return
	 * path. Now that we actually do some work, we need to make sure
	 * we do call them.
	 *
	 * Some archs already do call them, luckily irq_enter/exit nest
	 * properly.
	 *
	 * Arguably we should visit all archs and update all handlers,
	 * however a fair share of IPIs are still resched only so this would
	 * somewhat pessimize the simple resched case.
	 */
	irq_enter();
	sched_ttwu_pending();

	/*
	 * Check if someone kicked us for doing the nohz idle load balance.
	 */
	if (unlikely(got_nohz_idle_kick())) {
		this_rq()->idle_balance = 1;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
	irq_exit();
}

static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);

	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
		if (!set_nr_if_polling(rq->idle))
			smp_send_reschedule(cpu);
		else
			trace_sched_wake_idle_without_ipi(cpu);
	}
}

void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	rcu_read_lock();

	if (!is_idle_task(rcu_dereference(rq->curr)))
		goto out;

	if (set_nr_if_polling(rq->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
	} else {
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (is_idle_task(rq->curr))
			smp_send_reschedule(cpu);
		/* Else CPU is not idle, do nothing here: */
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

out:
	rcu_read_unlock();
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

#if defined(CONFIG_SMP)
	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
		ttwu_queue_remote(p, cpu, wake_flags);
		return;
	}
#endif

	raw_spin_lock(&rq->lock);
	rq_pin_lock(rq, &rf);
	ttwu_do_activate(rq, p, wake_flags, &rf);
	rq_unpin_lock(rq, &rf);
	raw_spin_unlock(&rq->lock);
}

/*
 * Notes on Program-Order guarantees on SMP systems.
 *
 *   MIGRATION
 *
 * The basic program-order guarantee on SMP systems is that when a task [t]
 * migrates, all its activity on its old CPU [c0] happens-before any subsequent
 * execution on its new CPU [c1].
 *
 * For migration (of runnable tasks) this is provided by the following means:
 *
 *  A) UNLOCK of the rq(c0)->lock scheduling out task t
 *  B) migration for t is required to synchronize *both* rq(c0)->lock and
 *     rq(c1)->lock (if not at the same time, then in that order).
 *  C) LOCK of the rq(c1)->lock scheduling in task
 *
 * Transitivity guarantees that B happens after A and C after B.
 * Note: we only require RCpc transitivity.
 * Note: the CPU doing B need not be c0 or c1
 *
 * Example:
 *
 *   CPU0            CPU1            CPU2
 *
 *   LOCK rq(0)->lock
 *   sched-out X
 *   sched-in Y
 *   UNLOCK rq(0)->lock
 *
 *                                   LOCK rq(0)->lock // orders against CPU0
 *                                   dequeue X
 *                                   UNLOCK rq(0)->lock
 *
 *                                   LOCK rq(1)->lock
 *                                   enqueue X
 *                                   UNLOCK rq(1)->lock
 *
 *                   LOCK rq(1)->lock // orders against CPU2
 *                   sched-out Z
 *                   sched-in X
 *                   UNLOCK rq(1)->lock
 *
 *
 *  BLOCKING -- aka. SLEEP + WAKEUP
 *
 * For blocking we (obviously) need to provide the same guarantee as for
 * migration. However the means are completely different as there is no lock
 * chain to provide order.
 * Instead we do:
 *
 *   1) smp_store_release(X->on_cpu, 0)
 *   2) smp_cond_load_acquire(!X->on_cpu)
 *
 * Example:
 *
 *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
 *
 *   LOCK rq(0)->lock LOCK X->pi_lock
 *   dequeue X
 *   sched-out X
 *   smp_store_release(X->on_cpu, 0);
 *
 *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
 *                    X->state = WAKING
 *                    set_task_cpu(X,2)
 *
 *                    LOCK rq(2)->lock
 *                    enqueue X
 *                    X->state = RUNNING
 *                    UNLOCK rq(2)->lock
 *
 *                                          LOCK rq(2)->lock // orders against CPU1
 *                                          sched-out Z
 *                                          sched-in X
 *                                          UNLOCK rq(2)->lock
 *
 *                    UNLOCK X->pi_lock
 *   UNLOCK rq(0)->lock
 *
 *
 * However, for wakeups there is a second guarantee we must provide, namely we
 * must observe the state that led to our wakeup. That is, not only must our
 * task observe its own prior state, it must also observe the stores prior to
 * its wakeup.
 *
 * This means that any means of doing remote wakeups must order the CPU doing
 * the wakeup against the CPU the task is going to end up running on. This,
 * however, is already required for the regular Program-Order guarantee above,
 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
 *
 */

/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * If (@state & @p->state) @p->state = TASK_RUNNING.
 *
 * If the task was not queued/runnable, also place it back on a runqueue.
 *
 * Atomic against schedule() which would dequeue a task, also see
 * set_current_state().
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *	   %false otherwise.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;

	/*
	 * If we are going to wake up a thread waiting for CONDITION we
	 * need to ensure that CONDITION=1 done by the caller can not be
	 * reordered with p->state check below. This pairs with mb() in
	 * set_current_state() the waiting thread does.
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	if (!(p->state & state))
		goto out;

	trace_sched_waking(p);

	/* We're going to change ->state: */
	success = 1;
	cpu = task_cpu(p);

	/*
	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
	 * in smp_cond_load_acquire() below.
	 *
	 * sched_ttwu_pending()			try_to_wake_up()
	 *   [S] p->on_rq = 1;			[L] P->state
	 *       UNLOCK rq->lock  -----.
	 *                              \
	 *				 +--- RMB
	 * schedule()                   /
	 *       LOCK rq->lock    -----'
	 *       UNLOCK rq->lock
	 *
	 * [task p]
	 *   [S] p->state = UNINTERRUPTIBLE	[L] p->on_rq
	 *
	 * Pairs with the UNLOCK+LOCK on rq->lock from the
	 * last wakeup of our task and the schedule that got our task
	 * current.
	 */
	smp_rmb();
	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	/*
	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
	 * possible to, falsely, observe p->on_cpu == 0.
	 *
	 * One must be running (->on_cpu == 1) in order to remove oneself
	 * from the runqueue.
	 *
	 *  [S] ->on_cpu = 1;	[L] ->on_rq
	 *      UNLOCK rq->lock
	 *			RMB
	 *			LOCK   rq->lock
	 *  [S] ->on_rq = 0;	[L] ->on_cpu
	 *
	 * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
	 * from the consecutive calls to schedule(); the first switching to our
	 * task, the second putting it to sleep.
	 */
	smp_rmb();

	/*
	 * If the owning (remote) CPU is still in the middle of schedule() with
	 * this task as prev, wait until it's done referencing the task.
	 *
	 * Pairs with the smp_store_release() in finish_lock_switch().
	 *
	 * This ensures that tasks getting woken will be fully ordered against
	 * their previous state and preserve Program Order.
	 */
	smp_cond_load_acquire(&p->on_cpu, !VAL);

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->in_iowait) {
		delayacct_blkio_end();
		atomic_dec(&task_rq(p)->nr_iowait);
	}

	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}

#else /* CONFIG_SMP */

	if (p->in_iowait) {
		delayacct_blkio_end();
		atomic_dec(&task_rq(p)->nr_iowait);
	}

#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu, wake_flags);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

/**
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 * @rf: rq_flags used for pinning the rq lock
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
 * the current task.
 */
static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
{
	struct rq *rq = task_rq(p);

	if (WARN_ON_ONCE(rq != this_rq()) ||
	    WARN_ON_ONCE(p == current))
		return;

	lockdep_assert_held(&rq->lock);

	if (!raw_spin_trylock(&p->pi_lock)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet picked a replacement task.
		 */
		rq_unpin_lock(rq, rf);
		raw_spin_unlock(&rq->lock);
		raw_spin_lock(&p->pi_lock);
		raw_spin_lock(&rq->lock);
		rq_repin_lock(rq, rf);
	}

	if (!(p->state & TASK_NORMAL))
		goto out;

	trace_sched_waking(p);

	if (!task_on_rq_queued(p)) {
		if (p->in_iowait) {
			delayacct_blkio_end();
			atomic_dec(&rq->nr_iowait);
		}
		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
	}

	ttwu_do_wakeup(rq, p, 0, rf);
	ttwu_stat(p, smp_processor_id(), 0);
out:
	raw_spin_unlock(&p->pi_lock);
}

/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
2131 */ 2132 int wake_up_process(struct task_struct *p) 2133 { 2134 return try_to_wake_up(p, TASK_NORMAL, 0); 2135 } 2136 EXPORT_SYMBOL(wake_up_process); 2137 2138 int wake_up_state(struct task_struct *p, unsigned int state) 2139 { 2140 return try_to_wake_up(p, state, 0); 2141 } 2142 2143 /* 2144 * This function clears the sched_dl_entity static params. 2145 */ 2146 void __dl_clear_params(struct task_struct *p) 2147 { 2148 struct sched_dl_entity *dl_se = &p->dl; 2149 2150 dl_se->dl_runtime = 0; 2151 dl_se->dl_deadline = 0; 2152 dl_se->dl_period = 0; 2153 dl_se->flags = 0; 2154 dl_se->dl_bw = 0; 2155 2156 dl_se->dl_throttled = 0; 2157 dl_se->dl_yielded = 0; 2158 } 2159 2160 /* 2161 * Perform scheduler related setup for a newly forked process p. 2162 * p is forked by current. 2163 * 2164 * __sched_fork() is basic setup used by init_idle() too: 2165 */ 2166 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 2167 { 2168 p->on_rq = 0; 2169 2170 p->se.on_rq = 0; 2171 p->se.exec_start = 0; 2172 p->se.sum_exec_runtime = 0; 2173 p->se.prev_sum_exec_runtime = 0; 2174 p->se.nr_migrations = 0; 2175 p->se.vruntime = 0; 2176 INIT_LIST_HEAD(&p->se.group_node); 2177 2178 #ifdef CONFIG_FAIR_GROUP_SCHED 2179 p->se.cfs_rq = NULL; 2180 #endif 2181 2182 #ifdef CONFIG_SCHEDSTATS 2183 /* Even if schedstat is disabled, there should not be garbage */ 2184 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2185 #endif 2186 2187 RB_CLEAR_NODE(&p->dl.rb_node); 2188 init_dl_task_timer(&p->dl); 2189 __dl_clear_params(p); 2190 2191 INIT_LIST_HEAD(&p->rt.run_list); 2192 p->rt.timeout = 0; 2193 p->rt.time_slice = sched_rr_timeslice; 2194 p->rt.on_rq = 0; 2195 p->rt.on_list = 0; 2196 2197 #ifdef CONFIG_PREEMPT_NOTIFIERS 2198 INIT_HLIST_HEAD(&p->preempt_notifiers); 2199 #endif 2200 2201 #ifdef CONFIG_NUMA_BALANCING 2202 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 2203 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 2204 p->mm->numa_scan_seq = 0; 2205 } 2206 2207 if (clone_flags & CLONE_VM) 2208 p->numa_preferred_nid = current->numa_preferred_nid; 2209 else 2210 p->numa_preferred_nid = -1; 2211 2212 p->node_stamp = 0ULL; 2213 p->numa_scan_seq = p->mm ? 
p->mm->numa_scan_seq : 0; 2214 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 2215 p->numa_work.next = &p->numa_work; 2216 p->numa_faults = NULL; 2217 p->last_task_numa_placement = 0; 2218 p->last_sum_exec_runtime = 0; 2219 2220 p->numa_group = NULL; 2221 #endif /* CONFIG_NUMA_BALANCING */ 2222 } 2223 2224 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); 2225 2226 #ifdef CONFIG_NUMA_BALANCING 2227 2228 void set_numabalancing_state(bool enabled) 2229 { 2230 if (enabled) 2231 static_branch_enable(&sched_numa_balancing); 2232 else 2233 static_branch_disable(&sched_numa_balancing); 2234 } 2235 2236 #ifdef CONFIG_PROC_SYSCTL 2237 int sysctl_numa_balancing(struct ctl_table *table, int write, 2238 void __user *buffer, size_t *lenp, loff_t *ppos) 2239 { 2240 struct ctl_table t; 2241 int err; 2242 int state = static_branch_likely(&sched_numa_balancing); 2243 2244 if (write && !capable(CAP_SYS_ADMIN)) 2245 return -EPERM; 2246 2247 t = *table; 2248 t.data = &state; 2249 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2250 if (err < 0) 2251 return err; 2252 if (write) 2253 set_numabalancing_state(state); 2254 return err; 2255 } 2256 #endif 2257 #endif 2258 2259 #ifdef CONFIG_SCHEDSTATS 2260 2261 DEFINE_STATIC_KEY_FALSE(sched_schedstats); 2262 static bool __initdata __sched_schedstats = false; 2263 2264 static void set_schedstats(bool enabled) 2265 { 2266 if (enabled) 2267 static_branch_enable(&sched_schedstats); 2268 else 2269 static_branch_disable(&sched_schedstats); 2270 } 2271 2272 void force_schedstat_enabled(void) 2273 { 2274 if (!schedstat_enabled()) { 2275 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); 2276 static_branch_enable(&sched_schedstats); 2277 } 2278 } 2279 2280 static int __init setup_schedstats(char *str) 2281 { 2282 int ret = 0; 2283 if (!str) 2284 goto out; 2285 2286 /* 2287 * This code is called before jump labels have been set up, so we can't 2288 * change the static branch directly just yet. Instead set a temporary 2289 * variable so init_schedstats() can do it later. 2290 */ 2291 if (!strcmp(str, "enable")) { 2292 __sched_schedstats = true; 2293 ret = 1; 2294 } else if (!strcmp(str, "disable")) { 2295 __sched_schedstats = false; 2296 ret = 1; 2297 } 2298 out: 2299 if (!ret) 2300 pr_warn("Unable to parse schedstats=\n"); 2301 2302 return ret; 2303 } 2304 __setup("schedstats=", setup_schedstats); 2305 2306 static void __init init_schedstats(void) 2307 { 2308 set_schedstats(__sched_schedstats); 2309 } 2310 2311 #ifdef CONFIG_PROC_SYSCTL 2312 int sysctl_schedstats(struct ctl_table *table, int write, 2313 void __user *buffer, size_t *lenp, loff_t *ppos) 2314 { 2315 struct ctl_table t; 2316 int err; 2317 int state = static_branch_likely(&sched_schedstats); 2318 2319 if (write && !capable(CAP_SYS_ADMIN)) 2320 return -EPERM; 2321 2322 t = *table; 2323 t.data = &state; 2324 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 2325 if (err < 0) 2326 return err; 2327 if (write) 2328 set_schedstats(state); 2329 return err; 2330 } 2331 #endif /* CONFIG_PROC_SYSCTL */ 2332 #else /* !CONFIG_SCHEDSTATS */ 2333 static inline void init_schedstats(void) {} 2334 #endif /* CONFIG_SCHEDSTATS */ 2335 2336 /* 2337 * fork()/clone()-time setup: 2338 */ 2339 int sched_fork(unsigned long clone_flags, struct task_struct *p) 2340 { 2341 unsigned long flags; 2342 int cpu = get_cpu(); 2343 2344 __sched_fork(clone_flags, p); 2345 /* 2346 * We mark the process as NEW here. 
This guarantees that 2347 * nobody will actually run it, and a signal or other external 2348 * event cannot wake it up and insert it on the runqueue either. 2349 */ 2350 p->state = TASK_NEW; 2351 2352 /* 2353 * Make sure we do not leak PI boosting priority to the child. 2354 */ 2355 p->prio = current->normal_prio; 2356 2357 /* 2358 * Revert to default priority/policy on fork if requested. 2359 */ 2360 if (unlikely(p->sched_reset_on_fork)) { 2361 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2362 p->policy = SCHED_NORMAL; 2363 p->static_prio = NICE_TO_PRIO(0); 2364 p->rt_priority = 0; 2365 } else if (PRIO_TO_NICE(p->static_prio) < 0) 2366 p->static_prio = NICE_TO_PRIO(0); 2367 2368 p->prio = p->normal_prio = __normal_prio(p); 2369 set_load_weight(p); 2370 2371 /* 2372 * We don't need the reset flag anymore after the fork. It has 2373 * fulfilled its duty: 2374 */ 2375 p->sched_reset_on_fork = 0; 2376 } 2377 2378 if (dl_prio(p->prio)) { 2379 put_cpu(); 2380 return -EAGAIN; 2381 } else if (rt_prio(p->prio)) { 2382 p->sched_class = &rt_sched_class; 2383 } else { 2384 p->sched_class = &fair_sched_class; 2385 } 2386 2387 init_entity_runnable_average(&p->se); 2388 2389 /* 2390 * The child is not yet in the pid-hash so no cgroup attach races, 2391 * and the cgroup is pinned to this child due to cgroup_fork() 2392 * is ran before sched_fork(). 2393 * 2394 * Silence PROVE_RCU. 2395 */ 2396 raw_spin_lock_irqsave(&p->pi_lock, flags); 2397 /* 2398 * We're setting the CPU for the first time, we don't migrate, 2399 * so use __set_task_cpu(). 2400 */ 2401 __set_task_cpu(p, cpu); 2402 if (p->sched_class->task_fork) 2403 p->sched_class->task_fork(p); 2404 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2405 2406 #ifdef CONFIG_SCHED_INFO 2407 if (likely(sched_info_on())) 2408 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2409 #endif 2410 #if defined(CONFIG_SMP) 2411 p->on_cpu = 0; 2412 #endif 2413 init_task_preempt_count(p); 2414 #ifdef CONFIG_SMP 2415 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2416 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2417 #endif 2418 2419 put_cpu(); 2420 return 0; 2421 } 2422 2423 unsigned long to_ratio(u64 period, u64 runtime) 2424 { 2425 if (runtime == RUNTIME_INF) 2426 return 1ULL << 20; 2427 2428 /* 2429 * Doing this here saves a lot of checks in all 2430 * the calling paths, and returning zero seems 2431 * safe for them anyway. 2432 */ 2433 if (period == 0) 2434 return 0; 2435 2436 return div64_u64(runtime << 20, period); 2437 } 2438 2439 #ifdef CONFIG_SMP 2440 inline struct dl_bw *dl_bw_of(int i) 2441 { 2442 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 2443 "sched RCU must be held"); 2444 return &cpu_rq(i)->rd->dl_bw; 2445 } 2446 2447 static inline int dl_bw_cpus(int i) 2448 { 2449 struct root_domain *rd = cpu_rq(i)->rd; 2450 int cpus = 0; 2451 2452 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 2453 "sched RCU must be held"); 2454 for_each_cpu_and(i, rd->span, cpu_active_mask) 2455 cpus++; 2456 2457 return cpus; 2458 } 2459 #else 2460 inline struct dl_bw *dl_bw_of(int i) 2461 { 2462 return &cpu_rq(i)->dl.dl_bw; 2463 } 2464 2465 static inline int dl_bw_cpus(int i) 2466 { 2467 return 1; 2468 } 2469 #endif 2470 2471 /* 2472 * We must be sure that accepting a new task (or allowing changing the 2473 * parameters of an existing one) is consistent with the bandwidth 2474 * constraints. If yes, this function also accordingly updates the currently 2475 * allocated bandwidth to reflect the new situation. 2476 * 2477 * This function is called while holding p's rq->lock. 
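 *
 * Illustrative numbers (not taken from the source): with to_ratio()
 * above, sched_runtime = 10ms and sched_period = 100ms give
 * new_bw = (10^7 << 20) / 10^8, i.e. roughly 0.1 * 2^20 (~104857).
 * The request is then admitted as long as the summed bandwidth of all
 * -deadline tasks in the root domain stays within dl_b->bw * cpus,
 * where dl_b->bw defaults to the RT throttling ratio (about
 * 0.95 * 2^20 per CPU).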
2478 * 2479 * XXX we should delay bw change until the task's 0-lag point, see 2480 * __setparam_dl(). 2481 */ 2482 static int dl_overflow(struct task_struct *p, int policy, 2483 const struct sched_attr *attr) 2484 { 2485 2486 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 2487 u64 period = attr->sched_period ?: attr->sched_deadline; 2488 u64 runtime = attr->sched_runtime; 2489 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 2490 int cpus, err = -1; 2491 2492 /* !deadline task may carry old deadline bandwidth */ 2493 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) 2494 return 0; 2495 2496 /* 2497 * Either if a task, enters, leave, or stays -deadline but changes 2498 * its parameters, we may need to update accordingly the total 2499 * allocated bandwidth of the container. 2500 */ 2501 raw_spin_lock(&dl_b->lock); 2502 cpus = dl_bw_cpus(task_cpu(p)); 2503 if (dl_policy(policy) && !task_has_dl_policy(p) && 2504 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2505 __dl_add(dl_b, new_bw); 2506 err = 0; 2507 } else if (dl_policy(policy) && task_has_dl_policy(p) && 2508 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 2509 __dl_clear(dl_b, p->dl.dl_bw); 2510 __dl_add(dl_b, new_bw); 2511 err = 0; 2512 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 2513 __dl_clear(dl_b, p->dl.dl_bw); 2514 err = 0; 2515 } 2516 raw_spin_unlock(&dl_b->lock); 2517 2518 return err; 2519 } 2520 2521 extern void init_dl_bw(struct dl_bw *dl_b); 2522 2523 /* 2524 * wake_up_new_task - wake up a newly created task for the first time. 2525 * 2526 * This function will do some initial scheduler statistics housekeeping 2527 * that must be done for every newly created context, then puts the task 2528 * on the runqueue and wakes it. 2529 */ 2530 void wake_up_new_task(struct task_struct *p) 2531 { 2532 struct rq_flags rf; 2533 struct rq *rq; 2534 2535 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); 2536 p->state = TASK_RUNNING; 2537 #ifdef CONFIG_SMP 2538 /* 2539 * Fork balancing, do it here and not earlier because: 2540 * - cpus_allowed can change in the fork path 2541 * - any previously selected CPU might disappear through hotplug 2542 * 2543 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2544 * as we're not fully set-up yet. 2545 */ 2546 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2547 #endif 2548 rq = __task_rq_lock(p, &rf); 2549 update_rq_clock(rq); 2550 post_init_entity_util_avg(&p->se); 2551 2552 activate_task(rq, p, 0); 2553 p->on_rq = TASK_ON_RQ_QUEUED; 2554 trace_sched_wakeup_new(p); 2555 check_preempt_curr(rq, p, WF_FORK); 2556 #ifdef CONFIG_SMP 2557 if (p->sched_class->task_woken) { 2558 /* 2559 * Nothing relies on rq->lock after this, so its fine to 2560 * drop it. 
2561 */ 2562 rq_unpin_lock(rq, &rf); 2563 p->sched_class->task_woken(rq, p); 2564 rq_repin_lock(rq, &rf); 2565 } 2566 #endif 2567 task_rq_unlock(rq, p, &rf); 2568 } 2569 2570 #ifdef CONFIG_PREEMPT_NOTIFIERS 2571 2572 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; 2573 2574 void preempt_notifier_inc(void) 2575 { 2576 static_key_slow_inc(&preempt_notifier_key); 2577 } 2578 EXPORT_SYMBOL_GPL(preempt_notifier_inc); 2579 2580 void preempt_notifier_dec(void) 2581 { 2582 static_key_slow_dec(&preempt_notifier_key); 2583 } 2584 EXPORT_SYMBOL_GPL(preempt_notifier_dec); 2585 2586 /** 2587 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2588 * @notifier: notifier struct to register 2589 */ 2590 void preempt_notifier_register(struct preempt_notifier *notifier) 2591 { 2592 if (!static_key_false(&preempt_notifier_key)) 2593 WARN(1, "registering preempt_notifier while notifiers disabled\n"); 2594 2595 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2596 } 2597 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2598 2599 /** 2600 * preempt_notifier_unregister - no longer interested in preemption notifications 2601 * @notifier: notifier struct to unregister 2602 * 2603 * This is *not* safe to call from within a preemption notifier. 2604 */ 2605 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2606 { 2607 hlist_del(¬ifier->link); 2608 } 2609 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2610 2611 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) 2612 { 2613 struct preempt_notifier *notifier; 2614 2615 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2616 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2617 } 2618 2619 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2620 { 2621 if (static_key_false(&preempt_notifier_key)) 2622 __fire_sched_in_preempt_notifiers(curr); 2623 } 2624 2625 static void 2626 __fire_sched_out_preempt_notifiers(struct task_struct *curr, 2627 struct task_struct *next) 2628 { 2629 struct preempt_notifier *notifier; 2630 2631 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2632 notifier->ops->sched_out(notifier, next); 2633 } 2634 2635 static __always_inline void 2636 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2637 struct task_struct *next) 2638 { 2639 if (static_key_false(&preempt_notifier_key)) 2640 __fire_sched_out_preempt_notifiers(curr, next); 2641 } 2642 2643 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2644 2645 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2646 { 2647 } 2648 2649 static inline void 2650 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2651 struct task_struct *next) 2652 { 2653 } 2654 2655 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2656 2657 /** 2658 * prepare_task_switch - prepare to switch tasks 2659 * @rq: the runqueue preparing to switch 2660 * @prev: the current task that is being switched out 2661 * @next: the task we are going to switch to. 2662 * 2663 * This is called with the rq lock held and interrupts off. It must 2664 * be paired with a subsequent finish_task_switch after the context 2665 * switch. 2666 * 2667 * prepare_task_switch sets up locking and calls architecture specific 2668 * hooks. 
2669 */ 2670 static inline void 2671 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2672 struct task_struct *next) 2673 { 2674 sched_info_switch(rq, prev, next); 2675 perf_event_task_sched_out(prev, next); 2676 fire_sched_out_preempt_notifiers(prev, next); 2677 prepare_lock_switch(rq, next); 2678 prepare_arch_switch(next); 2679 } 2680 2681 /** 2682 * finish_task_switch - clean up after a task-switch 2683 * @prev: the thread we just switched away from. 2684 * 2685 * finish_task_switch must be called after the context switch, paired 2686 * with a prepare_task_switch call before the context switch. 2687 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2688 * and do any other architecture-specific cleanup actions. 2689 * 2690 * Note that we may have delayed dropping an mm in context_switch(). If 2691 * so, we finish that here outside of the runqueue lock. (Doing it 2692 * with the lock held can cause deadlocks; see schedule() for 2693 * details.) 2694 * 2695 * The context switch have flipped the stack from under us and restored the 2696 * local variables which were saved when this task called schedule() in the 2697 * past. prev == current is still correct but we need to recalculate this_rq 2698 * because prev may have moved to another CPU. 2699 */ 2700 static struct rq *finish_task_switch(struct task_struct *prev) 2701 __releases(rq->lock) 2702 { 2703 struct rq *rq = this_rq(); 2704 struct mm_struct *mm = rq->prev_mm; 2705 long prev_state; 2706 2707 /* 2708 * The previous task will have left us with a preempt_count of 2 2709 * because it left us after: 2710 * 2711 * schedule() 2712 * preempt_disable(); // 1 2713 * __schedule() 2714 * raw_spin_lock_irq(&rq->lock) // 2 2715 * 2716 * Also, see FORK_PREEMPT_COUNT. 2717 */ 2718 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, 2719 "corrupted preempt_count: %s/%d/0x%x\n", 2720 current->comm, current->pid, preempt_count())) 2721 preempt_count_set(FORK_PREEMPT_COUNT); 2722 2723 rq->prev_mm = NULL; 2724 2725 /* 2726 * A task struct has one reference for the use as "current". 2727 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2728 * schedule one last time. The schedule call will never return, and 2729 * the scheduled task must drop that reference. 2730 * 2731 * We must observe prev->state before clearing prev->on_cpu (in 2732 * finish_lock_switch), otherwise a concurrent wakeup can get prev 2733 * running on another CPU and we could rave with its RUNNING -> DEAD 2734 * transition, resulting in a double drop. 2735 */ 2736 prev_state = prev->state; 2737 vtime_task_switch(prev); 2738 perf_event_task_sched_in(prev, current); 2739 finish_lock_switch(rq, prev); 2740 finish_arch_post_lock_switch(); 2741 2742 fire_sched_in_preempt_notifiers(current); 2743 if (mm) 2744 mmdrop(mm); 2745 if (unlikely(prev_state == TASK_DEAD)) { 2746 if (prev->sched_class->task_dead) 2747 prev->sched_class->task_dead(prev); 2748 2749 /* 2750 * Remove function-return probe instances associated with this 2751 * task and put them back on the free list. 2752 */ 2753 kprobe_flush_task(prev); 2754 2755 /* Task is done with its stack. 
*/ 2756 put_task_stack(prev); 2757 2758 put_task_struct(prev); 2759 } 2760 2761 tick_nohz_task_switch(); 2762 return rq; 2763 } 2764 2765 #ifdef CONFIG_SMP 2766 2767 /* rq->lock is NOT held, but preemption is disabled */ 2768 static void __balance_callback(struct rq *rq) 2769 { 2770 struct callback_head *head, *next; 2771 void (*func)(struct rq *rq); 2772 unsigned long flags; 2773 2774 raw_spin_lock_irqsave(&rq->lock, flags); 2775 head = rq->balance_callback; 2776 rq->balance_callback = NULL; 2777 while (head) { 2778 func = (void (*)(struct rq *))head->func; 2779 next = head->next; 2780 head->next = NULL; 2781 head = next; 2782 2783 func(rq); 2784 } 2785 raw_spin_unlock_irqrestore(&rq->lock, flags); 2786 } 2787 2788 static inline void balance_callback(struct rq *rq) 2789 { 2790 if (unlikely(rq->balance_callback)) 2791 __balance_callback(rq); 2792 } 2793 2794 #else 2795 2796 static inline void balance_callback(struct rq *rq) 2797 { 2798 } 2799 2800 #endif 2801 2802 /** 2803 * schedule_tail - first thing a freshly forked thread must call. 2804 * @prev: the thread we just switched away from. 2805 */ 2806 asmlinkage __visible void schedule_tail(struct task_struct *prev) 2807 __releases(rq->lock) 2808 { 2809 struct rq *rq; 2810 2811 /* 2812 * New tasks start with FORK_PREEMPT_COUNT, see there and 2813 * finish_task_switch() for details. 2814 * 2815 * finish_task_switch() will drop rq->lock() and lower preempt_count 2816 * and the preempt_enable() will end up enabling preemption (on 2817 * PREEMPT_COUNT kernels). 2818 */ 2819 2820 rq = finish_task_switch(prev); 2821 balance_callback(rq); 2822 preempt_enable(); 2823 2824 if (current->set_child_tid) 2825 put_user(task_pid_vnr(current), current->set_child_tid); 2826 } 2827 2828 /* 2829 * context_switch - switch to the new MM and the new thread's register state. 2830 */ 2831 static __always_inline struct rq * 2832 context_switch(struct rq *rq, struct task_struct *prev, 2833 struct task_struct *next, struct rq_flags *rf) 2834 { 2835 struct mm_struct *mm, *oldmm; 2836 2837 prepare_task_switch(rq, prev, next); 2838 2839 mm = next->mm; 2840 oldmm = prev->active_mm; 2841 /* 2842 * For paravirt, this is coupled with an exit in switch_to to 2843 * combine the page table reload and the switch backend into 2844 * one hypercall. 2845 */ 2846 arch_start_context_switch(prev); 2847 2848 if (!mm) { 2849 next->active_mm = oldmm; 2850 atomic_inc(&oldmm->mm_count); 2851 enter_lazy_tlb(oldmm, next); 2852 } else 2853 switch_mm_irqs_off(oldmm, mm, next); 2854 2855 if (!prev->mm) { 2856 prev->active_mm = NULL; 2857 rq->prev_mm = oldmm; 2858 } 2859 2860 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 2861 2862 /* 2863 * Since the runqueue lock will be released by the next 2864 * task (which is an invalid locking op but in the case 2865 * of the scheduler it's an obvious special-case), so we 2866 * do an early lockdep release here: 2867 */ 2868 rq_unpin_lock(rq, rf); 2869 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2870 2871 /* Here we just switch the register state and the stack. */ 2872 switch_to(prev, next, prev); 2873 barrier(); 2874 2875 return finish_task_switch(prev); 2876 } 2877 2878 /* 2879 * nr_running and nr_context_switches: 2880 * 2881 * externally visible scheduler statistics: current number of runnable 2882 * threads, total number of context switches performed since bootup. 
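 *
 * (These sums feed user-visible interfaces; /proc/loadavg and the
 * "ctxt" line of /proc/stat are typical consumers, mentioned here only
 * as illustration.)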
2883 */ 2884 unsigned long nr_running(void) 2885 { 2886 unsigned long i, sum = 0; 2887 2888 for_each_online_cpu(i) 2889 sum += cpu_rq(i)->nr_running; 2890 2891 return sum; 2892 } 2893 2894 /* 2895 * Check if only the current task is running on the CPU. 2896 * 2897 * Caution: this function does not check that the caller has disabled 2898 * preemption, thus the result might have a time-of-check-to-time-of-use 2899 * race. The caller is responsible to use it correctly, for example: 2900 * 2901 * - from a non-preemptable section (of course) 2902 * 2903 * - from a thread that is bound to a single CPU 2904 * 2905 * - in a loop with very short iterations (e.g. a polling loop) 2906 */ 2907 bool single_task_running(void) 2908 { 2909 return raw_rq()->nr_running == 1; 2910 } 2911 EXPORT_SYMBOL(single_task_running); 2912 2913 unsigned long long nr_context_switches(void) 2914 { 2915 int i; 2916 unsigned long long sum = 0; 2917 2918 for_each_possible_cpu(i) 2919 sum += cpu_rq(i)->nr_switches; 2920 2921 return sum; 2922 } 2923 2924 /* 2925 * IO-wait accounting, and how its mostly bollocks (on SMP). 2926 * 2927 * The idea behind IO-wait account is to account the idle time that we could 2928 * have spend running if it were not for IO. That is, if we were to improve the 2929 * storage performance, we'd have a proportional reduction in IO-wait time. 2930 * 2931 * This all works nicely on UP, where, when a task blocks on IO, we account 2932 * idle time as IO-wait, because if the storage were faster, it could've been 2933 * running and we'd not be idle. 2934 * 2935 * This has been extended to SMP, by doing the same for each CPU. This however 2936 * is broken. 2937 * 2938 * Imagine for instance the case where two tasks block on one CPU, only the one 2939 * CPU will have IO-wait accounted, while the other has regular idle. Even 2940 * though, if the storage were faster, both could've ran at the same time, 2941 * utilising both CPUs. 2942 * 2943 * This means, that when looking globally, the current IO-wait accounting on 2944 * SMP is a lower bound, by reason of under accounting. 2945 * 2946 * Worse, since the numbers are provided per CPU, they are sometimes 2947 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly 2948 * associated with any one particular CPU, it can wake to another CPU than it 2949 * blocked on. This means the per CPU IO-wait number is meaningless. 2950 * 2951 * Task CPU affinities can make all that even more 'interesting'. 2952 */ 2953 2954 unsigned long nr_iowait(void) 2955 { 2956 unsigned long i, sum = 0; 2957 2958 for_each_possible_cpu(i) 2959 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2960 2961 return sum; 2962 } 2963 2964 /* 2965 * Consumers of these two interfaces, like for example the cpufreq menu 2966 * governor are using nonsensical data. Boosting frequency for a CPU that has 2967 * IO-wait which might not even end up running the task when it does become 2968 * runnable. 2969 */ 2970 2971 unsigned long nr_iowait_cpu(int cpu) 2972 { 2973 struct rq *this = cpu_rq(cpu); 2974 return atomic_read(&this->nr_iowait); 2975 } 2976 2977 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2978 { 2979 struct rq *rq = this_rq(); 2980 *nr_waiters = atomic_read(&rq->nr_iowait); 2981 *load = rq->load.weight; 2982 } 2983 2984 #ifdef CONFIG_SMP 2985 2986 /* 2987 * sched_exec - execve() is a valuable balancing opportunity, because at 2988 * this point the task has the smallest effective memory and cache footprint. 
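 *
 * Rough call flow, as a sketch of the usual exec path: execve() ends up
 * calling sched_exec(), which asks select_task_rq() for an
 * SD_BALANCE_EXEC target and, if that differs from the current CPU,
 * migrates via stop_one_cpu(migration_cpu_stop).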
2989 */ 2990 void sched_exec(void) 2991 { 2992 struct task_struct *p = current; 2993 unsigned long flags; 2994 int dest_cpu; 2995 2996 raw_spin_lock_irqsave(&p->pi_lock, flags); 2997 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2998 if (dest_cpu == smp_processor_id()) 2999 goto unlock; 3000 3001 if (likely(cpu_active(dest_cpu))) { 3002 struct migration_arg arg = { p, dest_cpu }; 3003 3004 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3005 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 3006 return; 3007 } 3008 unlock: 3009 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3010 } 3011 3012 #endif 3013 3014 DEFINE_PER_CPU(struct kernel_stat, kstat); 3015 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 3016 3017 EXPORT_PER_CPU_SYMBOL(kstat); 3018 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 3019 3020 /* 3021 * The function fair_sched_class.update_curr accesses the struct curr 3022 * and its field curr->exec_start; when called from task_sched_runtime(), 3023 * we observe a high rate of cache misses in practice. 3024 * Prefetching this data results in improved performance. 3025 */ 3026 static inline void prefetch_curr_exec_start(struct task_struct *p) 3027 { 3028 #ifdef CONFIG_FAIR_GROUP_SCHED 3029 struct sched_entity *curr = (&p->se)->cfs_rq->curr; 3030 #else 3031 struct sched_entity *curr = (&task_rq(p)->cfs)->curr; 3032 #endif 3033 prefetch(curr); 3034 prefetch(&curr->exec_start); 3035 } 3036 3037 /* 3038 * Return accounted runtime for the task. 3039 * In case the task is currently running, return the runtime plus current's 3040 * pending runtime that have not been accounted yet. 3041 */ 3042 unsigned long long task_sched_runtime(struct task_struct *p) 3043 { 3044 struct rq_flags rf; 3045 struct rq *rq; 3046 u64 ns; 3047 3048 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3049 /* 3050 * 64-bit doesn't need locks to atomically read a 64bit value. 3051 * So we have a optimization chance when the task's delta_exec is 0. 3052 * Reading ->on_cpu is racy, but this is ok. 3053 * 3054 * If we race with it leaving CPU, we'll take a lock. So we're correct. 3055 * If we race with it entering CPU, unaccounted time is 0. This is 3056 * indistinguishable from the read occurring a few cycles earlier. 3057 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 3058 * been accounted, so we're correct here as well. 3059 */ 3060 if (!p->on_cpu || !task_on_rq_queued(p)) 3061 return p->se.sum_exec_runtime; 3062 #endif 3063 3064 rq = task_rq_lock(p, &rf); 3065 /* 3066 * Must be ->curr _and_ ->on_rq. If dequeued, we would 3067 * project cycles that may never be accounted to this 3068 * thread, breaking clock_gettime(). 3069 */ 3070 if (task_current(rq, p) && task_on_rq_queued(p)) { 3071 prefetch_curr_exec_start(p); 3072 update_rq_clock(rq); 3073 p->sched_class->update_curr(rq); 3074 } 3075 ns = p->se.sum_exec_runtime; 3076 task_rq_unlock(rq, p, &rf); 3077 3078 return ns; 3079 } 3080 3081 /* 3082 * This function gets called by the timer code, with HZ frequency. 3083 * We call it with interrupts disabled. 
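 *
 * (Typically reached as timer interrupt -> update_process_times() ->
 * scheduler_tick(); the exact entry path is timer-code detail and is
 * only sketched here.)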
3084 */ 3085 void scheduler_tick(void) 3086 { 3087 int cpu = smp_processor_id(); 3088 struct rq *rq = cpu_rq(cpu); 3089 struct task_struct *curr = rq->curr; 3090 3091 sched_clock_tick(); 3092 3093 raw_spin_lock(&rq->lock); 3094 update_rq_clock(rq); 3095 curr->sched_class->task_tick(rq, curr, 0); 3096 cpu_load_update_active(rq); 3097 calc_global_load_tick(rq); 3098 raw_spin_unlock(&rq->lock); 3099 3100 perf_event_task_tick(); 3101 3102 #ifdef CONFIG_SMP 3103 rq->idle_balance = idle_cpu(cpu); 3104 trigger_load_balance(rq); 3105 #endif 3106 rq_last_tick_reset(rq); 3107 } 3108 3109 #ifdef CONFIG_NO_HZ_FULL 3110 /** 3111 * scheduler_tick_max_deferment 3112 * 3113 * Keep at least one tick per second when a single 3114 * active task is running because the scheduler doesn't 3115 * yet completely support full dynticks environment. 3116 * 3117 * This makes sure that uptime, CFS vruntime, load 3118 * balancing, etc... continue to move forward, even 3119 * with a very low granularity. 3120 * 3121 * Return: Maximum deferment in nanoseconds. 3122 */ 3123 u64 scheduler_tick_max_deferment(void) 3124 { 3125 struct rq *rq = this_rq(); 3126 unsigned long next, now = READ_ONCE(jiffies); 3127 3128 next = rq->last_sched_tick + HZ; 3129 3130 if (time_before_eq(next, now)) 3131 return 0; 3132 3133 return jiffies_to_nsecs(next - now); 3134 } 3135 #endif 3136 3137 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3138 defined(CONFIG_PREEMPT_TRACER)) 3139 /* 3140 * If the value passed in is equal to the current preempt count 3141 * then we just disabled preemption. Start timing the latency. 3142 */ 3143 static inline void preempt_latency_start(int val) 3144 { 3145 if (preempt_count() == val) { 3146 unsigned long ip = get_lock_parent_ip(); 3147 #ifdef CONFIG_DEBUG_PREEMPT 3148 current->preempt_disable_ip = ip; 3149 #endif 3150 trace_preempt_off(CALLER_ADDR0, ip); 3151 } 3152 } 3153 3154 void preempt_count_add(int val) 3155 { 3156 #ifdef CONFIG_DEBUG_PREEMPT 3157 /* 3158 * Underflow? 3159 */ 3160 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3161 return; 3162 #endif 3163 __preempt_count_add(val); 3164 #ifdef CONFIG_DEBUG_PREEMPT 3165 /* 3166 * Spinlock count overflowing soon? 3167 */ 3168 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3169 PREEMPT_MASK - 10); 3170 #endif 3171 preempt_latency_start(val); 3172 } 3173 EXPORT_SYMBOL(preempt_count_add); 3174 NOKPROBE_SYMBOL(preempt_count_add); 3175 3176 /* 3177 * If the value passed in equals to the current preempt count 3178 * then we just enabled preemption. Stop timing the latency. 3179 */ 3180 static inline void preempt_latency_stop(int val) 3181 { 3182 if (preempt_count() == val) 3183 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); 3184 } 3185 3186 void preempt_count_sub(int val) 3187 { 3188 #ifdef CONFIG_DEBUG_PREEMPT 3189 /* 3190 * Underflow? 3191 */ 3192 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3193 return; 3194 /* 3195 * Is the spinlock portion underflowing? 
3196 */ 3197 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3198 !(preempt_count() & PREEMPT_MASK))) 3199 return; 3200 #endif 3201 3202 preempt_latency_stop(val); 3203 __preempt_count_sub(val); 3204 } 3205 EXPORT_SYMBOL(preempt_count_sub); 3206 NOKPROBE_SYMBOL(preempt_count_sub); 3207 3208 #else 3209 static inline void preempt_latency_start(int val) { } 3210 static inline void preempt_latency_stop(int val) { } 3211 #endif 3212 3213 /* 3214 * Print scheduling while atomic bug: 3215 */ 3216 static noinline void __schedule_bug(struct task_struct *prev) 3217 { 3218 /* Save this before calling printk(), since that will clobber it */ 3219 unsigned long preempt_disable_ip = get_preempt_disable_ip(current); 3220 3221 if (oops_in_progress) 3222 return; 3223 3224 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3225 prev->comm, prev->pid, preempt_count()); 3226 3227 debug_show_held_locks(prev); 3228 print_modules(); 3229 if (irqs_disabled()) 3230 print_irqtrace_events(prev); 3231 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) 3232 && in_atomic_preempt_off()) { 3233 pr_err("Preemption disabled at:"); 3234 print_ip_sym(preempt_disable_ip); 3235 pr_cont("\n"); 3236 } 3237 if (panic_on_warn) 3238 panic("scheduling while atomic\n"); 3239 3240 dump_stack(); 3241 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3242 } 3243 3244 /* 3245 * Various schedule()-time debugging checks and statistics: 3246 */ 3247 static inline void schedule_debug(struct task_struct *prev) 3248 { 3249 #ifdef CONFIG_SCHED_STACK_END_CHECK 3250 if (task_stack_end_corrupted(prev)) 3251 panic("corrupted stack end detected inside scheduler\n"); 3252 #endif 3253 3254 if (unlikely(in_atomic_preempt_off())) { 3255 __schedule_bug(prev); 3256 preempt_count_set(PREEMPT_DISABLED); 3257 } 3258 rcu_sleep_check(); 3259 3260 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3261 3262 schedstat_inc(this_rq()->sched_count); 3263 } 3264 3265 /* 3266 * Pick up the highest-prio task: 3267 */ 3268 static inline struct task_struct * 3269 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 3270 { 3271 const struct sched_class *class; 3272 struct task_struct *p; 3273 3274 /* 3275 * Optimization: we know that if all tasks are in 3276 * the fair class we can call that function directly: 3277 */ 3278 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 3279 p = fair_sched_class.pick_next_task(rq, prev, rf); 3280 if (unlikely(p == RETRY_TASK)) 3281 goto again; 3282 3283 /* Assumes fair_sched_class->next == idle_sched_class */ 3284 if (unlikely(!p)) 3285 p = idle_sched_class.pick_next_task(rq, prev, rf); 3286 3287 return p; 3288 } 3289 3290 again: 3291 for_each_class(class) { 3292 p = class->pick_next_task(rq, prev, rf); 3293 if (p) { 3294 if (unlikely(p == RETRY_TASK)) 3295 goto again; 3296 return p; 3297 } 3298 } 3299 3300 /* The idle class should always have a runnable task: */ 3301 BUG(); 3302 } 3303 3304 /* 3305 * __schedule() is the main scheduler function. 3306 * 3307 * The main means of driving the scheduler and thus entering this function are: 3308 * 3309 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 3310 * 3311 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 3312 * paths. For example, see arch/x86/entry_64.S. 3313 * 3314 * To drive preemption between tasks, the scheduler sets the flag in timer 3315 * interrupt handler scheduler_tick(). 3316 * 3317 * 3. Wakeups don't really cause entry into schedule(). They add a 3318 * task to the run-queue and that's it. 
3319 * 3320 * Now, if the new task added to the run-queue preempts the current 3321 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 3322 * called on the nearest possible occasion: 3323 * 3324 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 3325 * 3326 * - in syscall or exception context, at the next outmost 3327 * preempt_enable(). (this might be as soon as the wake_up()'s 3328 * spin_unlock()!) 3329 * 3330 * - in IRQ context, return from interrupt-handler to 3331 * preemptible context 3332 * 3333 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 3334 * then at the next: 3335 * 3336 * - cond_resched() call 3337 * - explicit schedule() call 3338 * - return from syscall or exception to user-space 3339 * - return from interrupt-handler to user-space 3340 * 3341 * WARNING: must be called with preemption disabled! 3342 */ 3343 static void __sched notrace __schedule(bool preempt) 3344 { 3345 struct task_struct *prev, *next; 3346 unsigned long *switch_count; 3347 struct rq_flags rf; 3348 struct rq *rq; 3349 int cpu; 3350 3351 cpu = smp_processor_id(); 3352 rq = cpu_rq(cpu); 3353 prev = rq->curr; 3354 3355 schedule_debug(prev); 3356 3357 if (sched_feat(HRTICK)) 3358 hrtick_clear(rq); 3359 3360 local_irq_disable(); 3361 rcu_note_context_switch(); 3362 3363 /* 3364 * Make sure that signal_pending_state()->signal_pending() below 3365 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 3366 * done by the caller to avoid the race with signal_wake_up(). 3367 */ 3368 smp_mb__before_spinlock(); 3369 raw_spin_lock(&rq->lock); 3370 rq_pin_lock(rq, &rf); 3371 3372 /* Promote REQ to ACT */ 3373 rq->clock_update_flags <<= 1; 3374 3375 switch_count = &prev->nivcsw; 3376 if (!preempt && prev->state) { 3377 if (unlikely(signal_pending_state(prev->state, prev))) { 3378 prev->state = TASK_RUNNING; 3379 } else { 3380 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3381 prev->on_rq = 0; 3382 3383 if (prev->in_iowait) { 3384 atomic_inc(&rq->nr_iowait); 3385 delayacct_blkio_start(); 3386 } 3387 3388 /* 3389 * If a worker went to sleep, notify and ask workqueue 3390 * whether it wants to wake up a task to maintain 3391 * concurrency. 3392 */ 3393 if (prev->flags & PF_WQ_WORKER) { 3394 struct task_struct *to_wakeup; 3395 3396 to_wakeup = wq_worker_sleeping(prev); 3397 if (to_wakeup) 3398 try_to_wake_up_local(to_wakeup, &rf); 3399 } 3400 } 3401 switch_count = &prev->nvcsw; 3402 } 3403 3404 if (task_on_rq_queued(prev)) 3405 update_rq_clock(rq); 3406 3407 next = pick_next_task(rq, prev, &rf); 3408 clear_tsk_need_resched(prev); 3409 clear_preempt_need_resched(); 3410 3411 if (likely(prev != next)) { 3412 rq->nr_switches++; 3413 rq->curr = next; 3414 ++*switch_count; 3415 3416 trace_sched_switch(preempt, prev, next); 3417 3418 /* Also unlocks the rq: */ 3419 rq = context_switch(rq, prev, next, &rf); 3420 } else { 3421 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 3422 rq_unpin_lock(rq, &rf); 3423 raw_spin_unlock_irq(&rq->lock); 3424 } 3425 3426 balance_callback(rq); 3427 } 3428 3429 void __noreturn do_task_dead(void) 3430 { 3431 /* 3432 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed 3433 * when the following two conditions become true. 3434 * - There is race condition of mmap_sem (It is acquired by 3435 * exit_mm()), and 3436 * - SMI occurs before setting TASK_RUNINNG. 
3437 * (or hypervisor of virtual machine switches to other guest) 3438 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD 3439 * 3440 * To avoid it, we have to wait for releasing tsk->pi_lock which 3441 * is held by try_to_wake_up() 3442 */ 3443 smp_mb(); 3444 raw_spin_unlock_wait(¤t->pi_lock); 3445 3446 /* Causes final put_task_struct in finish_task_switch(): */ 3447 __set_current_state(TASK_DEAD); 3448 3449 /* Tell freezer to ignore us: */ 3450 current->flags |= PF_NOFREEZE; 3451 3452 __schedule(false); 3453 BUG(); 3454 3455 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ 3456 for (;;) 3457 cpu_relax(); 3458 } 3459 3460 static inline void sched_submit_work(struct task_struct *tsk) 3461 { 3462 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3463 return; 3464 /* 3465 * If we are going to sleep and we have plugged IO queued, 3466 * make sure to submit it to avoid deadlocks. 3467 */ 3468 if (blk_needs_flush_plug(tsk)) 3469 blk_schedule_flush_plug(tsk); 3470 } 3471 3472 asmlinkage __visible void __sched schedule(void) 3473 { 3474 struct task_struct *tsk = current; 3475 3476 sched_submit_work(tsk); 3477 do { 3478 preempt_disable(); 3479 __schedule(false); 3480 sched_preempt_enable_no_resched(); 3481 } while (need_resched()); 3482 } 3483 EXPORT_SYMBOL(schedule); 3484 3485 #ifdef CONFIG_CONTEXT_TRACKING 3486 asmlinkage __visible void __sched schedule_user(void) 3487 { 3488 /* 3489 * If we come here after a random call to set_need_resched(), 3490 * or we have been woken up remotely but the IPI has not yet arrived, 3491 * we haven't yet exited the RCU idle mode. Do it here manually until 3492 * we find a better solution. 3493 * 3494 * NB: There are buggy callers of this function. Ideally we 3495 * should warn if prev_state != CONTEXT_USER, but that will trigger 3496 * too frequently to make sense yet. 3497 */ 3498 enum ctx_state prev_state = exception_enter(); 3499 schedule(); 3500 exception_exit(prev_state); 3501 } 3502 #endif 3503 3504 /** 3505 * schedule_preempt_disabled - called with preemption disabled 3506 * 3507 * Returns with preemption disabled. Note: preempt_count must be 1 3508 */ 3509 void __sched schedule_preempt_disabled(void) 3510 { 3511 sched_preempt_enable_no_resched(); 3512 schedule(); 3513 preempt_disable(); 3514 } 3515 3516 static void __sched notrace preempt_schedule_common(void) 3517 { 3518 do { 3519 /* 3520 * Because the function tracer can trace preempt_count_sub() 3521 * and it also uses preempt_enable/disable_notrace(), if 3522 * NEED_RESCHED is set, the preempt_enable_notrace() called 3523 * by the function tracer will call this function again and 3524 * cause infinite recursion. 3525 * 3526 * Preemption must be disabled here before the function 3527 * tracer can trace. Break up preempt_disable() into two 3528 * calls. One to disable preemption without fear of being 3529 * traced. The other to still record the preemption latency, 3530 * which can also be traced by the function tracer. 3531 */ 3532 preempt_disable_notrace(); 3533 preempt_latency_start(1); 3534 __schedule(true); 3535 preempt_latency_stop(1); 3536 preempt_enable_no_resched_notrace(); 3537 3538 /* 3539 * Check again in case we missed a preemption opportunity 3540 * between schedule and now. 3541 */ 3542 } while (need_resched()); 3543 } 3544 3545 #ifdef CONFIG_PREEMPT 3546 /* 3547 * this is the entry point to schedule() from in-kernel preemption 3548 * off of preempt_enable. 
Kernel preemptions off return from interrupt 3549 * occur there and call schedule directly. 3550 */ 3551 asmlinkage __visible void __sched notrace preempt_schedule(void) 3552 { 3553 /* 3554 * If there is a non-zero preempt_count or interrupts are disabled, 3555 * we do not want to preempt the current task. Just return.. 3556 */ 3557 if (likely(!preemptible())) 3558 return; 3559 3560 preempt_schedule_common(); 3561 } 3562 NOKPROBE_SYMBOL(preempt_schedule); 3563 EXPORT_SYMBOL(preempt_schedule); 3564 3565 /** 3566 * preempt_schedule_notrace - preempt_schedule called by tracing 3567 * 3568 * The tracing infrastructure uses preempt_enable_notrace to prevent 3569 * recursion and tracing preempt enabling caused by the tracing 3570 * infrastructure itself. But as tracing can happen in areas coming 3571 * from userspace or just about to enter userspace, a preempt enable 3572 * can occur before user_exit() is called. This will cause the scheduler 3573 * to be called when the system is still in usermode. 3574 * 3575 * To prevent this, the preempt_enable_notrace will use this function 3576 * instead of preempt_schedule() to exit user context if needed before 3577 * calling the scheduler. 3578 */ 3579 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) 3580 { 3581 enum ctx_state prev_ctx; 3582 3583 if (likely(!preemptible())) 3584 return; 3585 3586 do { 3587 /* 3588 * Because the function tracer can trace preempt_count_sub() 3589 * and it also uses preempt_enable/disable_notrace(), if 3590 * NEED_RESCHED is set, the preempt_enable_notrace() called 3591 * by the function tracer will call this function again and 3592 * cause infinite recursion. 3593 * 3594 * Preemption must be disabled here before the function 3595 * tracer can trace. Break up preempt_disable() into two 3596 * calls. One to disable preemption without fear of being 3597 * traced. The other to still record the preemption latency, 3598 * which can also be traced by the function tracer. 3599 */ 3600 preempt_disable_notrace(); 3601 preempt_latency_start(1); 3602 /* 3603 * Needs preempt disabled in case user_exit() is traced 3604 * and the tracer calls preempt_enable_notrace() causing 3605 * an infinite recursion. 3606 */ 3607 prev_ctx = exception_enter(); 3608 __schedule(true); 3609 exception_exit(prev_ctx); 3610 3611 preempt_latency_stop(1); 3612 preempt_enable_no_resched_notrace(); 3613 } while (need_resched()); 3614 } 3615 EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3616 3617 #endif /* CONFIG_PREEMPT */ 3618 3619 /* 3620 * this is the entry point to schedule() from kernel preemption 3621 * off of irq context. 3622 * Note, that this is called and return with irqs disabled. This will 3623 * protect us against recursive calling from irq. 
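 *
 * (Typically invoked from the architecture's interrupt-return path,
 * e.g. the x86 entry code, when returning to preemptible kernel context
 * with TIF_NEED_RESCHED set; this is a sketch of the usual caller, not
 * a guarantee.)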
3624 */ 3625 asmlinkage __visible void __sched preempt_schedule_irq(void) 3626 { 3627 enum ctx_state prev_state; 3628 3629 /* Catch callers which need to be fixed */ 3630 BUG_ON(preempt_count() || !irqs_disabled()); 3631 3632 prev_state = exception_enter(); 3633 3634 do { 3635 preempt_disable(); 3636 local_irq_enable(); 3637 __schedule(true); 3638 local_irq_disable(); 3639 sched_preempt_enable_no_resched(); 3640 } while (need_resched()); 3641 3642 exception_exit(prev_state); 3643 } 3644 3645 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3646 void *key) 3647 { 3648 return try_to_wake_up(curr->private, mode, wake_flags); 3649 } 3650 EXPORT_SYMBOL(default_wake_function); 3651 3652 #ifdef CONFIG_RT_MUTEXES 3653 3654 /* 3655 * rt_mutex_setprio - set the current priority of a task 3656 * @p: task 3657 * @prio: prio value (kernel-internal form) 3658 * 3659 * This function changes the 'effective' priority of a task. It does 3660 * not touch ->normal_prio like __setscheduler(). 3661 * 3662 * Used by the rt_mutex code to implement priority inheritance 3663 * logic. Call site only calls if the priority of the task changed. 3664 */ 3665 void rt_mutex_setprio(struct task_struct *p, int prio) 3666 { 3667 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; 3668 const struct sched_class *prev_class; 3669 struct rq_flags rf; 3670 struct rq *rq; 3671 3672 BUG_ON(prio > MAX_PRIO); 3673 3674 rq = __task_rq_lock(p, &rf); 3675 update_rq_clock(rq); 3676 3677 /* 3678 * Idle task boosting is a nono in general. There is one 3679 * exception, when PREEMPT_RT and NOHZ is active: 3680 * 3681 * The idle task calls get_next_timer_interrupt() and holds 3682 * the timer wheel base->lock on the CPU and another CPU wants 3683 * to access the timer (probably to cancel it). We can safely 3684 * ignore the boosting request, as the idle CPU runs this code 3685 * with interrupts disabled and will complete the lock 3686 * protected section without being interrupted. So there is no 3687 * real need to boost. 3688 */ 3689 if (unlikely(p == rq->idle)) { 3690 WARN_ON(p != rq->curr); 3691 WARN_ON(p->pi_blocked_on); 3692 goto out_unlock; 3693 } 3694 3695 trace_sched_pi_setprio(p, prio); 3696 oldprio = p->prio; 3697 3698 if (oldprio == prio) 3699 queue_flag &= ~DEQUEUE_MOVE; 3700 3701 prev_class = p->sched_class; 3702 queued = task_on_rq_queued(p); 3703 running = task_current(rq, p); 3704 if (queued) 3705 dequeue_task(rq, p, queue_flag); 3706 if (running) 3707 put_prev_task(rq, p); 3708 3709 /* 3710 * Boosting condition are: 3711 * 1. -rt task is running and holds mutex A 3712 * --> -dl task blocks on mutex A 3713 * 3714 * 2. 
-dl task is running and holds mutex A 3715 * --> -dl task blocks on mutex A and could preempt the 3716 * running task 3717 */ 3718 if (dl_prio(prio)) { 3719 struct task_struct *pi_task = rt_mutex_get_top_task(p); 3720 if (!dl_prio(p->normal_prio) || 3721 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3722 p->dl.dl_boosted = 1; 3723 queue_flag |= ENQUEUE_REPLENISH; 3724 } else 3725 p->dl.dl_boosted = 0; 3726 p->sched_class = &dl_sched_class; 3727 } else if (rt_prio(prio)) { 3728 if (dl_prio(oldprio)) 3729 p->dl.dl_boosted = 0; 3730 if (oldprio < prio) 3731 queue_flag |= ENQUEUE_HEAD; 3732 p->sched_class = &rt_sched_class; 3733 } else { 3734 if (dl_prio(oldprio)) 3735 p->dl.dl_boosted = 0; 3736 if (rt_prio(oldprio)) 3737 p->rt.timeout = 0; 3738 p->sched_class = &fair_sched_class; 3739 } 3740 3741 p->prio = prio; 3742 3743 if (queued) 3744 enqueue_task(rq, p, queue_flag); 3745 if (running) 3746 set_curr_task(rq, p); 3747 3748 check_class_changed(rq, p, prev_class, oldprio); 3749 out_unlock: 3750 /* Avoid rq from going away on us: */ 3751 preempt_disable(); 3752 __task_rq_unlock(rq, &rf); 3753 3754 balance_callback(rq); 3755 preempt_enable(); 3756 } 3757 #endif 3758 3759 void set_user_nice(struct task_struct *p, long nice) 3760 { 3761 bool queued, running; 3762 int old_prio, delta; 3763 struct rq_flags rf; 3764 struct rq *rq; 3765 3766 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 3767 return; 3768 /* 3769 * We have to be careful, if called from sys_setpriority(), 3770 * the task might be in the middle of scheduling on another CPU. 3771 */ 3772 rq = task_rq_lock(p, &rf); 3773 update_rq_clock(rq); 3774 3775 /* 3776 * The RT priorities are set via sched_setscheduler(), but we still 3777 * allow the 'normal' nice value to be set - but as expected 3778 * it wont have any effect on scheduling until the task is 3779 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3780 */ 3781 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3782 p->static_prio = NICE_TO_PRIO(nice); 3783 goto out_unlock; 3784 } 3785 queued = task_on_rq_queued(p); 3786 running = task_current(rq, p); 3787 if (queued) 3788 dequeue_task(rq, p, DEQUEUE_SAVE); 3789 if (running) 3790 put_prev_task(rq, p); 3791 3792 p->static_prio = NICE_TO_PRIO(nice); 3793 set_load_weight(p); 3794 old_prio = p->prio; 3795 p->prio = effective_prio(p); 3796 delta = p->prio - old_prio; 3797 3798 if (queued) { 3799 enqueue_task(rq, p, ENQUEUE_RESTORE); 3800 /* 3801 * If the task increased its priority or is running and 3802 * lowered its priority, then reschedule its CPU: 3803 */ 3804 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3805 resched_curr(rq); 3806 } 3807 if (running) 3808 set_curr_task(rq, p); 3809 out_unlock: 3810 task_rq_unlock(rq, p, &rf); 3811 } 3812 EXPORT_SYMBOL(set_user_nice); 3813 3814 /* 3815 * can_nice - check if a task can reduce its nice value 3816 * @p: task 3817 * @nice: nice value 3818 */ 3819 int can_nice(const struct task_struct *p, const int nice) 3820 { 3821 /* Convert nice value [19,-20] to rlimit style value [1,40]: */ 3822 int nice_rlim = nice_to_rlimit(nice); 3823 3824 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3825 capable(CAP_SYS_NICE)); 3826 } 3827 3828 #ifdef __ARCH_WANT_SYS_NICE 3829 3830 /* 3831 * sys_nice - change the priority of the current process. 3832 * @increment: priority increment 3833 * 3834 * sys_setpriority is a more generic, but much slower function that 3835 * does similar things. 
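 *
 * Worked example (illustrative): a task at nice 0 calling nice(4) ends
 * up at nice 4, while nice(-5) additionally needs CAP_SYS_NICE or
 * RLIMIT_NICE >= 25, since can_nice() maps nice -5 to the rlimit-style
 * value 25.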
3836 */ 3837 SYSCALL_DEFINE1(nice, int, increment) 3838 { 3839 long nice, retval; 3840 3841 /* 3842 * Setpriority might change our priority at the same moment. 3843 * We don't have to worry. Conceptually one call occurs first 3844 * and we have a single winner. 3845 */ 3846 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3847 nice = task_nice(current) + increment; 3848 3849 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3850 if (increment < 0 && !can_nice(current, nice)) 3851 return -EPERM; 3852 3853 retval = security_task_setnice(current, nice); 3854 if (retval) 3855 return retval; 3856 3857 set_user_nice(current, nice); 3858 return 0; 3859 } 3860 3861 #endif 3862 3863 /** 3864 * task_prio - return the priority value of a given task. 3865 * @p: the task in question. 3866 * 3867 * Return: The priority value as seen by users in /proc. 3868 * RT tasks are offset by -200. Normal tasks are centered 3869 * around 0, value goes from -16 to +15. 3870 */ 3871 int task_prio(const struct task_struct *p) 3872 { 3873 return p->prio - MAX_RT_PRIO; 3874 } 3875 3876 /** 3877 * idle_cpu - is a given CPU idle currently? 3878 * @cpu: the processor in question. 3879 * 3880 * Return: 1 if the CPU is currently idle. 0 otherwise. 3881 */ 3882 int idle_cpu(int cpu) 3883 { 3884 struct rq *rq = cpu_rq(cpu); 3885 3886 if (rq->curr != rq->idle) 3887 return 0; 3888 3889 if (rq->nr_running) 3890 return 0; 3891 3892 #ifdef CONFIG_SMP 3893 if (!llist_empty(&rq->wake_list)) 3894 return 0; 3895 #endif 3896 3897 return 1; 3898 } 3899 3900 /** 3901 * idle_task - return the idle task for a given CPU. 3902 * @cpu: the processor in question. 3903 * 3904 * Return: The idle task for the CPU @cpu. 3905 */ 3906 struct task_struct *idle_task(int cpu) 3907 { 3908 return cpu_rq(cpu)->idle; 3909 } 3910 3911 /** 3912 * find_process_by_pid - find a process with a matching PID value. 3913 * @pid: the pid in question. 3914 * 3915 * The task of @pid, if found. %NULL otherwise. 3916 */ 3917 static struct task_struct *find_process_by_pid(pid_t pid) 3918 { 3919 return pid ? find_task_by_vpid(pid) : current; 3920 } 3921 3922 /* 3923 * This function initializes the sched_dl_entity of a newly becoming 3924 * SCHED_DEADLINE task. 3925 * 3926 * Only the static values are considered here, the actual runtime and the 3927 * absolute deadline will be properly calculated when the task is enqueued 3928 * for the first time with its new policy. 3929 */ 3930 static void 3931 __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3932 { 3933 struct sched_dl_entity *dl_se = &p->dl; 3934 3935 dl_se->dl_runtime = attr->sched_runtime; 3936 dl_se->dl_deadline = attr->sched_deadline; 3937 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3938 dl_se->flags = attr->sched_flags; 3939 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3940 3941 /* 3942 * Changing the parameters of a task is 'tricky' and we're not doing 3943 * the correct thing -- also see task_dead_dl() and switched_from_dl(). 3944 * 3945 * What we SHOULD do is delay the bandwidth release until the 0-lag 3946 * point. This would include retaining the task_struct until that time 3947 * and change dl_overflow() to not immediately decrement the current 3948 * amount. 3949 * 3950 * Instead we retain the current runtime/deadline and let the new 3951 * parameters take effect after the current reservation period lapses. 3952 * This is safe (albeit pessimistic) because the 0-lag point is always 3953 * before the current scheduling deadline. 
3954 * 3955 * We can still have temporary overloads because we do not delay the 3956 * change in bandwidth until that time; so admission control is 3957 * not on the safe side. It does however guarantee tasks will never 3958 * consume more than promised. 3959 */ 3960 } 3961 3962 /* 3963 * sched_setparam() passes in -1 for its policy, to let the functions 3964 * it calls know not to change it. 3965 */ 3966 #define SETPARAM_POLICY -1 3967 3968 static void __setscheduler_params(struct task_struct *p, 3969 const struct sched_attr *attr) 3970 { 3971 int policy = attr->sched_policy; 3972 3973 if (policy == SETPARAM_POLICY) 3974 policy = p->policy; 3975 3976 p->policy = policy; 3977 3978 if (dl_policy(policy)) 3979 __setparam_dl(p, attr); 3980 else if (fair_policy(policy)) 3981 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3982 3983 /* 3984 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3985 * !rt_policy. Always setting this ensures that things like 3986 * getparam()/getattr() don't report silly values for !rt tasks. 3987 */ 3988 p->rt_priority = attr->sched_priority; 3989 p->normal_prio = normal_prio(p); 3990 set_load_weight(p); 3991 } 3992 3993 /* Actually do priority change: must hold pi & rq lock. */ 3994 static void __setscheduler(struct rq *rq, struct task_struct *p, 3995 const struct sched_attr *attr, bool keep_boost) 3996 { 3997 __setscheduler_params(p, attr); 3998 3999 /* 4000 * Keep a potential priority boosting if called from 4001 * sched_setscheduler(). 4002 */ 4003 if (keep_boost) 4004 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 4005 else 4006 p->prio = normal_prio(p); 4007 4008 if (dl_prio(p->prio)) 4009 p->sched_class = &dl_sched_class; 4010 else if (rt_prio(p->prio)) 4011 p->sched_class = &rt_sched_class; 4012 else 4013 p->sched_class = &fair_sched_class; 4014 } 4015 4016 static void 4017 __getparam_dl(struct task_struct *p, struct sched_attr *attr) 4018 { 4019 struct sched_dl_entity *dl_se = &p->dl; 4020 4021 attr->sched_priority = p->rt_priority; 4022 attr->sched_runtime = dl_se->dl_runtime; 4023 attr->sched_deadline = dl_se->dl_deadline; 4024 attr->sched_period = dl_se->dl_period; 4025 attr->sched_flags = dl_se->flags; 4026 } 4027 4028 /* 4029 * This function validates the new parameters of a -deadline task. 4030 * We ask for the deadline not being zero, and greater or equal 4031 * than the runtime, as well as the period of being zero or 4032 * greater than deadline. Furthermore, we have to be sure that 4033 * user parameters are above the internal resolution of 1us (we 4034 * check sched_runtime only since it is always the smaller one) and 4035 * below 2^63 ns (we have to check both sched_deadline and 4036 * sched_period, as the latter can be zero). 4037 */ 4038 static bool 4039 __checkparam_dl(const struct sched_attr *attr) 4040 { 4041 /* deadline != 0 */ 4042 if (attr->sched_deadline == 0) 4043 return false; 4044 4045 /* 4046 * Since we truncate DL_SCALE bits, make sure we're at least 4047 * that big. 4048 */ 4049 if (attr->sched_runtime < (1ULL << DL_SCALE)) 4050 return false; 4051 4052 /* 4053 * Since we use the MSB for wrap-around and sign issues, make 4054 * sure it's not set (mind that period can be equal to zero). 
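 * In other words both values must stay below 2^63 ns, which is roughly
 * 292 years, so this is not a practical limitation.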
4055 */ 4056 if (attr->sched_deadline & (1ULL << 63) || 4057 attr->sched_period & (1ULL << 63)) 4058 return false; 4059 4060 /* runtime <= deadline <= period (if period != 0) */ 4061 if ((attr->sched_period != 0 && 4062 attr->sched_period < attr->sched_deadline) || 4063 attr->sched_deadline < attr->sched_runtime) 4064 return false; 4065 4066 return true; 4067 } 4068 4069 /* 4070 * Check the target process has a UID that matches the current process's: 4071 */ 4072 static bool check_same_owner(struct task_struct *p) 4073 { 4074 const struct cred *cred = current_cred(), *pcred; 4075 bool match; 4076 4077 rcu_read_lock(); 4078 pcred = __task_cred(p); 4079 match = (uid_eq(cred->euid, pcred->euid) || 4080 uid_eq(cred->euid, pcred->uid)); 4081 rcu_read_unlock(); 4082 return match; 4083 } 4084 4085 static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 4086 { 4087 struct sched_dl_entity *dl_se = &p->dl; 4088 4089 if (dl_se->dl_runtime != attr->sched_runtime || 4090 dl_se->dl_deadline != attr->sched_deadline || 4091 dl_se->dl_period != attr->sched_period || 4092 dl_se->flags != attr->sched_flags) 4093 return true; 4094 4095 return false; 4096 } 4097 4098 static int __sched_setscheduler(struct task_struct *p, 4099 const struct sched_attr *attr, 4100 bool user, bool pi) 4101 { 4102 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 4103 MAX_RT_PRIO - 1 - attr->sched_priority; 4104 int retval, oldprio, oldpolicy = -1, queued, running; 4105 int new_effective_prio, policy = attr->sched_policy; 4106 const struct sched_class *prev_class; 4107 struct rq_flags rf; 4108 int reset_on_fork; 4109 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 4110 struct rq *rq; 4111 4112 /* May grab non-irq protected spin_locks: */ 4113 BUG_ON(in_interrupt()); 4114 recheck: 4115 /* Double check policy once rq lock held: */ 4116 if (policy < 0) { 4117 reset_on_fork = p->sched_reset_on_fork; 4118 policy = oldpolicy = p->policy; 4119 } else { 4120 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 4121 4122 if (!valid_policy(policy)) 4123 return -EINVAL; 4124 } 4125 4126 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 4127 return -EINVAL; 4128 4129 /* 4130 * Valid priorities for SCHED_FIFO and SCHED_RR are 4131 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4132 * SCHED_BATCH and SCHED_IDLE is 0. 
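* * For instance (illustrative only): with MAX_USER_RT_PRIO at its usual value of 100, a SCHED_FIFO or SCHED_RR request from a task with an mm may carry sched_priority 1..99, while a SCHED_NORMAL request must carry sched_priority 0 and express its weight through sched_nice instead.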
4133 */ 4134 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 4135 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 4136 return -EINVAL; 4137 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 4138 (rt_policy(policy) != (attr->sched_priority != 0))) 4139 return -EINVAL; 4140 4141 /* 4142 * Allow unprivileged RT tasks to decrease priority: 4143 */ 4144 if (user && !capable(CAP_SYS_NICE)) { 4145 if (fair_policy(policy)) { 4146 if (attr->sched_nice < task_nice(p) && 4147 !can_nice(p, attr->sched_nice)) 4148 return -EPERM; 4149 } 4150 4151 if (rt_policy(policy)) { 4152 unsigned long rlim_rtprio = 4153 task_rlimit(p, RLIMIT_RTPRIO); 4154 4155 /* Can't set/change the rt policy: */ 4156 if (policy != p->policy && !rlim_rtprio) 4157 return -EPERM; 4158 4159 /* Can't increase priority: */ 4160 if (attr->sched_priority > p->rt_priority && 4161 attr->sched_priority > rlim_rtprio) 4162 return -EPERM; 4163 } 4164 4165 /* 4166 * Can't set/change SCHED_DEADLINE policy at all for now 4167 * (safest behavior); in the future we would like to allow 4168 * unprivileged DL tasks to increase their relative deadline 4169 * or reduce their runtime (both ways reducing utilization) 4170 */ 4171 if (dl_policy(policy)) 4172 return -EPERM; 4173 4174 /* 4175 * Treat SCHED_IDLE as nice 20. Only allow a switch to 4176 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 4177 */ 4178 if (idle_policy(p->policy) && !idle_policy(policy)) { 4179 if (!can_nice(p, task_nice(p))) 4180 return -EPERM; 4181 } 4182 4183 /* Can't change other user's priorities: */ 4184 if (!check_same_owner(p)) 4185 return -EPERM; 4186 4187 /* Normal users shall not reset the sched_reset_on_fork flag: */ 4188 if (p->sched_reset_on_fork && !reset_on_fork) 4189 return -EPERM; 4190 } 4191 4192 if (user) { 4193 retval = security_task_setscheduler(p); 4194 if (retval) 4195 return retval; 4196 } 4197 4198 /* 4199 * Make sure no PI-waiters arrive (or leave) while we are 4200 * changing the priority of the task: 4201 * 4202 * To be able to change p->policy safely, the appropriate 4203 * runqueue lock must be held. 4204 */ 4205 rq = task_rq_lock(p, &rf); 4206 update_rq_clock(rq); 4207 4208 /* 4209 * Changing the policy of the stop threads its a very bad idea: 4210 */ 4211 if (p == rq->stop) { 4212 task_rq_unlock(rq, p, &rf); 4213 return -EINVAL; 4214 } 4215 4216 /* 4217 * If not changing anything there's no need to proceed further, 4218 * but store a possible modification of reset_on_fork. 4219 */ 4220 if (unlikely(policy == p->policy)) { 4221 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 4222 goto change; 4223 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 4224 goto change; 4225 if (dl_policy(policy) && dl_param_changed(p, attr)) 4226 goto change; 4227 4228 p->sched_reset_on_fork = reset_on_fork; 4229 task_rq_unlock(rq, p, &rf); 4230 return 0; 4231 } 4232 change: 4233 4234 if (user) { 4235 #ifdef CONFIG_RT_GROUP_SCHED 4236 /* 4237 * Do not allow realtime tasks into groups that have no runtime 4238 * assigned. 4239 */ 4240 if (rt_bandwidth_enabled() && rt_policy(policy) && 4241 task_group(p)->rt_bandwidth.rt_runtime == 0 && 4242 !task_group_is_autogroup(task_group(p))) { 4243 task_rq_unlock(rq, p, &rf); 4244 return -EPERM; 4245 } 4246 #endif 4247 #ifdef CONFIG_SMP 4248 if (dl_bandwidth_enabled() && dl_policy(policy)) { 4249 cpumask_t *span = rq->rd->span; 4250 4251 /* 4252 * Don't allow tasks with an affinity mask smaller than 4253 * the entire root_domain to become SCHED_DEADLINE. 
We 4254 * will also fail if there's no bandwidth available. 4255 */ 4256 if (!cpumask_subset(span, &p->cpus_allowed) || 4257 rq->rd->dl_bw.bw == 0) { 4258 task_rq_unlock(rq, p, &rf); 4259 return -EPERM; 4260 } 4261 } 4262 #endif 4263 } 4264 4265 /* Re-check policy now with rq lock held: */ 4266 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4267 policy = oldpolicy = -1; 4268 task_rq_unlock(rq, p, &rf); 4269 goto recheck; 4270 } 4271 4272 /* 4273 * If setscheduling to SCHED_DEADLINE (or changing the parameters 4274 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 4275 * is available. 4276 */ 4277 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 4278 task_rq_unlock(rq, p, &rf); 4279 return -EBUSY; 4280 } 4281 4282 p->sched_reset_on_fork = reset_on_fork; 4283 oldprio = p->prio; 4284 4285 if (pi) { 4286 /* 4287 * Take priority boosted tasks into account. If the new 4288 * effective priority is unchanged, we just store the new 4289 * normal parameters and do not touch the scheduler class and 4290 * the runqueue. This will be done when the task deboost 4291 * itself. 4292 */ 4293 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 4294 if (new_effective_prio == oldprio) 4295 queue_flags &= ~DEQUEUE_MOVE; 4296 } 4297 4298 queued = task_on_rq_queued(p); 4299 running = task_current(rq, p); 4300 if (queued) 4301 dequeue_task(rq, p, queue_flags); 4302 if (running) 4303 put_prev_task(rq, p); 4304 4305 prev_class = p->sched_class; 4306 __setscheduler(rq, p, attr, pi); 4307 4308 if (queued) { 4309 /* 4310 * We enqueue to tail when the priority of a task is 4311 * increased (user space view). 4312 */ 4313 if (oldprio < p->prio) 4314 queue_flags |= ENQUEUE_HEAD; 4315 4316 enqueue_task(rq, p, queue_flags); 4317 } 4318 if (running) 4319 set_curr_task(rq, p); 4320 4321 check_class_changed(rq, p, prev_class, oldprio); 4322 4323 /* Avoid rq from going away on us: */ 4324 preempt_disable(); 4325 task_rq_unlock(rq, p, &rf); 4326 4327 if (pi) 4328 rt_mutex_adjust_pi(p); 4329 4330 /* Run balance callbacks after we've adjusted the PI chain: */ 4331 balance_callback(rq); 4332 preempt_enable(); 4333 4334 return 0; 4335 } 4336 4337 static int _sched_setscheduler(struct task_struct *p, int policy, 4338 const struct sched_param *param, bool check) 4339 { 4340 struct sched_attr attr = { 4341 .sched_policy = policy, 4342 .sched_priority = param->sched_priority, 4343 .sched_nice = PRIO_TO_NICE(p->static_prio), 4344 }; 4345 4346 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 4347 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 4348 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4349 policy &= ~SCHED_RESET_ON_FORK; 4350 attr.sched_policy = policy; 4351 } 4352 4353 return __sched_setscheduler(p, &attr, check, true); 4354 } 4355 /** 4356 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4357 * @p: the task in question. 4358 * @policy: new policy. 4359 * @param: structure containing the new RT priority. 4360 * 4361 * Return: 0 on success. An error code otherwise. 4362 * 4363 * NOTE that the task may be already dead. 
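* * A minimal in-kernel usage sketch (illustrative only, not taken from this file; 'tsk' is assumed to be a task_struct the caller holds a reference on): * * struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; * int ret = sched_setscheduler(tsk, SCHED_FIFO, &sp); * * Kernel callers whose current context lacks the required permission should use sched_setscheduler_nocheck() below instead.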
4364 */ 4365 int sched_setscheduler(struct task_struct *p, int policy, 4366 const struct sched_param *param) 4367 { 4368 return _sched_setscheduler(p, policy, param, true); 4369 } 4370 EXPORT_SYMBOL_GPL(sched_setscheduler); 4371 4372 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 4373 { 4374 return __sched_setscheduler(p, attr, true, true); 4375 } 4376 EXPORT_SYMBOL_GPL(sched_setattr); 4377 4378 /** 4379 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4380 * @p: the task in question. 4381 * @policy: new policy. 4382 * @param: structure containing the new RT priority. 4383 * 4384 * Just like sched_setscheduler, only don't bother checking if the 4385 * current context has permission. For example, this is needed in 4386 * stop_machine(): we create temporary high priority worker threads, 4387 * but our caller might not have that capability. 4388 * 4389 * Return: 0 on success. An error code otherwise. 4390 */ 4391 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4392 const struct sched_param *param) 4393 { 4394 return _sched_setscheduler(p, policy, param, false); 4395 } 4396 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); 4397 4398 static int 4399 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4400 { 4401 struct sched_param lparam; 4402 struct task_struct *p; 4403 int retval; 4404 4405 if (!param || pid < 0) 4406 return -EINVAL; 4407 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4408 return -EFAULT; 4409 4410 rcu_read_lock(); 4411 retval = -ESRCH; 4412 p = find_process_by_pid(pid); 4413 if (p != NULL) 4414 retval = sched_setscheduler(p, policy, &lparam); 4415 rcu_read_unlock(); 4416 4417 return retval; 4418 } 4419 4420 /* 4421 * Mimics kernel/events/core.c perf_copy_attr(). 4422 */ 4423 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) 4424 { 4425 u32 size; 4426 int ret; 4427 4428 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 4429 return -EFAULT; 4430 4431 /* Zero the full structure, so that a short copy will be nice: */ 4432 memset(attr, 0, sizeof(*attr)); 4433 4434 ret = get_user(size, &uattr->size); 4435 if (ret) 4436 return ret; 4437 4438 /* Bail out on silly large: */ 4439 if (size > PAGE_SIZE) 4440 goto err_size; 4441 4442 /* ABI compatibility quirk: */ 4443 if (!size) 4444 size = SCHED_ATTR_SIZE_VER0; 4445 4446 if (size < SCHED_ATTR_SIZE_VER0) 4447 goto err_size; 4448 4449 /* 4450 * If we're handed a bigger struct than we know of, 4451 * ensure all the unknown bits are 0 - i.e. new 4452 * user-space does not rely on any kernel feature 4453 * extensions we dont know about yet. 4454 */ 4455 if (size > sizeof(*attr)) { 4456 unsigned char __user *addr; 4457 unsigned char __user *end; 4458 unsigned char val; 4459 4460 addr = (void __user *)uattr + sizeof(*attr); 4461 end = (void __user *)uattr + size; 4462 4463 for (; addr < end; addr++) { 4464 ret = get_user(val, addr); 4465 if (ret) 4466 return ret; 4467 if (val) 4468 goto err_size; 4469 } 4470 size = sizeof(*attr); 4471 } 4472 4473 ret = copy_from_user(attr, uattr, size); 4474 if (ret) 4475 return -EFAULT; 4476 4477 /* 4478 * XXX: Do we want to be lenient like existing syscalls; or do we want 4479 * to be strict and return an error on out-of-bounds values? 
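* * With the clamp below we are lenient: an out-of-range request such as sched_nice = 100 is silently reduced to MAX_NICE (19) rather than rejected with -EINVAL.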
4480 */ 4481 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 4482 4483 return 0; 4484 4485 err_size: 4486 put_user(sizeof(*attr), &uattr->size); 4487 return -E2BIG; 4488 } 4489 4490 /** 4491 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4492 * @pid: the pid in question. 4493 * @policy: new policy. 4494 * @param: structure containing the new RT priority. 4495 * 4496 * Return: 0 on success. An error code otherwise. 4497 */ 4498 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) 4499 { 4500 if (policy < 0) 4501 return -EINVAL; 4502 4503 return do_sched_setscheduler(pid, policy, param); 4504 } 4505 4506 /** 4507 * sys_sched_setparam - set/change the RT priority of a thread 4508 * @pid: the pid in question. 4509 * @param: structure containing the new RT priority. 4510 * 4511 * Return: 0 on success. An error code otherwise. 4512 */ 4513 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4514 { 4515 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 4516 } 4517 4518 /** 4519 * sys_sched_setattr - same as above, but with extended sched_attr 4520 * @pid: the pid in question. 4521 * @uattr: structure containing the extended parameters. 4522 * @flags: for future extension. 4523 */ 4524 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 4525 unsigned int, flags) 4526 { 4527 struct sched_attr attr; 4528 struct task_struct *p; 4529 int retval; 4530 4531 if (!uattr || pid < 0 || flags) 4532 return -EINVAL; 4533 4534 retval = sched_copy_attr(uattr, &attr); 4535 if (retval) 4536 return retval; 4537 4538 if ((int)attr.sched_policy < 0) 4539 return -EINVAL; 4540 4541 rcu_read_lock(); 4542 retval = -ESRCH; 4543 p = find_process_by_pid(pid); 4544 if (p != NULL) 4545 retval = sched_setattr(p, &attr); 4546 rcu_read_unlock(); 4547 4548 return retval; 4549 } 4550 4551 /** 4552 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4553 * @pid: the pid in question. 4554 * 4555 * Return: On success, the policy of the thread. Otherwise, a negative error 4556 * code. 4557 */ 4558 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4559 { 4560 struct task_struct *p; 4561 int retval; 4562 4563 if (pid < 0) 4564 return -EINVAL; 4565 4566 retval = -ESRCH; 4567 rcu_read_lock(); 4568 p = find_process_by_pid(pid); 4569 if (p) { 4570 retval = security_task_getscheduler(p); 4571 if (!retval) 4572 retval = p->policy 4573 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4574 } 4575 rcu_read_unlock(); 4576 return retval; 4577 } 4578 4579 /** 4580 * sys_sched_getparam - get the RT priority of a thread 4581 * @pid: the pid in question. 4582 * @param: structure containing the RT priority. 4583 * 4584 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 4585 * code. 4586 */ 4587 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4588 { 4589 struct sched_param lp = { .sched_priority = 0 }; 4590 struct task_struct *p; 4591 int retval; 4592 4593 if (!param || pid < 0) 4594 return -EINVAL; 4595 4596 rcu_read_lock(); 4597 p = find_process_by_pid(pid); 4598 retval = -ESRCH; 4599 if (!p) 4600 goto out_unlock; 4601 4602 retval = security_task_getscheduler(p); 4603 if (retval) 4604 goto out_unlock; 4605 4606 if (task_has_rt_policy(p)) 4607 lp.sched_priority = p->rt_priority; 4608 rcu_read_unlock(); 4609 4610 /* 4611 * This one might sleep, we cannot do it with a spinlock held ... 
4612 */ 4613 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4614 4615 return retval; 4616 4617 out_unlock: 4618 rcu_read_unlock(); 4619 return retval; 4620 } 4621 4622 static int sched_read_attr(struct sched_attr __user *uattr, 4623 struct sched_attr *attr, 4624 unsigned int usize) 4625 { 4626 int ret; 4627 4628 if (!access_ok(VERIFY_WRITE, uattr, usize)) 4629 return -EFAULT; 4630 4631 /* 4632 * If we're handed a smaller struct than we know of, 4633 * ensure all the unknown bits are 0 - i.e. old 4634 * user-space does not get uncomplete information. 4635 */ 4636 if (usize < sizeof(*attr)) { 4637 unsigned char *addr; 4638 unsigned char *end; 4639 4640 addr = (void *)attr + usize; 4641 end = (void *)attr + sizeof(*attr); 4642 4643 for (; addr < end; addr++) { 4644 if (*addr) 4645 return -EFBIG; 4646 } 4647 4648 attr->size = usize; 4649 } 4650 4651 ret = copy_to_user(uattr, attr, attr->size); 4652 if (ret) 4653 return -EFAULT; 4654 4655 return 0; 4656 } 4657 4658 /** 4659 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 4660 * @pid: the pid in question. 4661 * @uattr: structure containing the extended parameters. 4662 * @size: sizeof(attr) for fwd/bwd comp. 4663 * @flags: for future extension. 4664 */ 4665 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 4666 unsigned int, size, unsigned int, flags) 4667 { 4668 struct sched_attr attr = { 4669 .size = sizeof(struct sched_attr), 4670 }; 4671 struct task_struct *p; 4672 int retval; 4673 4674 if (!uattr || pid < 0 || size > PAGE_SIZE || 4675 size < SCHED_ATTR_SIZE_VER0 || flags) 4676 return -EINVAL; 4677 4678 rcu_read_lock(); 4679 p = find_process_by_pid(pid); 4680 retval = -ESRCH; 4681 if (!p) 4682 goto out_unlock; 4683 4684 retval = security_task_getscheduler(p); 4685 if (retval) 4686 goto out_unlock; 4687 4688 attr.sched_policy = p->policy; 4689 if (p->sched_reset_on_fork) 4690 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4691 if (task_has_dl_policy(p)) 4692 __getparam_dl(p, &attr); 4693 else if (task_has_rt_policy(p)) 4694 attr.sched_priority = p->rt_priority; 4695 else 4696 attr.sched_nice = task_nice(p); 4697 4698 rcu_read_unlock(); 4699 4700 retval = sched_read_attr(uattr, &attr, size); 4701 return retval; 4702 4703 out_unlock: 4704 rcu_read_unlock(); 4705 return retval; 4706 } 4707 4708 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4709 { 4710 cpumask_var_t cpus_allowed, new_mask; 4711 struct task_struct *p; 4712 int retval; 4713 4714 rcu_read_lock(); 4715 4716 p = find_process_by_pid(pid); 4717 if (!p) { 4718 rcu_read_unlock(); 4719 return -ESRCH; 4720 } 4721 4722 /* Prevent p going away */ 4723 get_task_struct(p); 4724 rcu_read_unlock(); 4725 4726 if (p->flags & PF_NO_SETAFFINITY) { 4727 retval = -EINVAL; 4728 goto out_put_task; 4729 } 4730 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4731 retval = -ENOMEM; 4732 goto out_put_task; 4733 } 4734 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4735 retval = -ENOMEM; 4736 goto out_free_cpus_allowed; 4737 } 4738 retval = -EPERM; 4739 if (!check_same_owner(p)) { 4740 rcu_read_lock(); 4741 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4742 rcu_read_unlock(); 4743 goto out_free_new_mask; 4744 } 4745 rcu_read_unlock(); 4746 } 4747 4748 retval = security_task_setscheduler(p); 4749 if (retval) 4750 goto out_free_new_mask; 4751 4752 4753 cpuset_cpus_allowed(p, cpus_allowed); 4754 cpumask_and(new_mask, in_mask, cpus_allowed); 4755 4756 /* 4757 * Since bandwidth control happens on root_domain 
basis, 4758 * if admission test is enabled, we only admit -deadline 4759 * tasks allowed to run on all the CPUs in the task's 4760 * root_domain. 4761 */ 4762 #ifdef CONFIG_SMP 4763 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 4764 rcu_read_lock(); 4765 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 4766 retval = -EBUSY; 4767 rcu_read_unlock(); 4768 goto out_free_new_mask; 4769 } 4770 rcu_read_unlock(); 4771 } 4772 #endif 4773 again: 4774 retval = __set_cpus_allowed_ptr(p, new_mask, true); 4775 4776 if (!retval) { 4777 cpuset_cpus_allowed(p, cpus_allowed); 4778 if (!cpumask_subset(new_mask, cpus_allowed)) { 4779 /* 4780 * We must have raced with a concurrent cpuset 4781 * update. Just reset the cpus_allowed to the 4782 * cpuset's cpus_allowed 4783 */ 4784 cpumask_copy(new_mask, cpus_allowed); 4785 goto again; 4786 } 4787 } 4788 out_free_new_mask: 4789 free_cpumask_var(new_mask); 4790 out_free_cpus_allowed: 4791 free_cpumask_var(cpus_allowed); 4792 out_put_task: 4793 put_task_struct(p); 4794 return retval; 4795 } 4796 4797 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4798 struct cpumask *new_mask) 4799 { 4800 if (len < cpumask_size()) 4801 cpumask_clear(new_mask); 4802 else if (len > cpumask_size()) 4803 len = cpumask_size(); 4804 4805 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4806 } 4807 4808 /** 4809 * sys_sched_setaffinity - set the CPU affinity of a process 4810 * @pid: pid of the process 4811 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4812 * @user_mask_ptr: user-space pointer to the new CPU mask 4813 * 4814 * Return: 0 on success. An error code otherwise. 4815 */ 4816 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4817 unsigned long __user *, user_mask_ptr) 4818 { 4819 cpumask_var_t new_mask; 4820 int retval; 4821 4822 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4823 return -ENOMEM; 4824 4825 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4826 if (retval == 0) 4827 retval = sched_setaffinity(pid, new_mask); 4828 free_cpumask_var(new_mask); 4829 return retval; 4830 } 4831 4832 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4833 { 4834 struct task_struct *p; 4835 unsigned long flags; 4836 int retval; 4837 4838 rcu_read_lock(); 4839 4840 retval = -ESRCH; 4841 p = find_process_by_pid(pid); 4842 if (!p) 4843 goto out_unlock; 4844 4845 retval = security_task_getscheduler(p); 4846 if (retval) 4847 goto out_unlock; 4848 4849 raw_spin_lock_irqsave(&p->pi_lock, flags); 4850 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4851 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4852 4853 out_unlock: 4854 rcu_read_unlock(); 4855 4856 return retval; 4857 } 4858 4859 /** 4860 * sys_sched_getaffinity - get the CPU affinity of a process 4861 * @pid: pid of the process 4862 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4863 * @user_mask_ptr: user-space pointer to hold the current CPU mask 4864 * 4865 * Return: size of CPU mask copied to user_mask_ptr on success. An 4866 * error code otherwise. 
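* * Worked example (assuming a 64-bit kernel with NR_CPUS=64 and CONFIG_CPUMASK_OFFSTACK disabled): cpumask_size() is 8, so a caller passing len = 128 gets a return value of 8 and only the first 8 bytes of its buffer are written.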
4867 */ 4868 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4869 unsigned long __user *, user_mask_ptr) 4870 { 4871 int ret; 4872 cpumask_var_t mask; 4873 4874 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4875 return -EINVAL; 4876 if (len & (sizeof(unsigned long)-1)) 4877 return -EINVAL; 4878 4879 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4880 return -ENOMEM; 4881 4882 ret = sched_getaffinity(pid, mask); 4883 if (ret == 0) { 4884 size_t retlen = min_t(size_t, len, cpumask_size()); 4885 4886 if (copy_to_user(user_mask_ptr, mask, retlen)) 4887 ret = -EFAULT; 4888 else 4889 ret = retlen; 4890 } 4891 free_cpumask_var(mask); 4892 4893 return ret; 4894 } 4895 4896 /** 4897 * sys_sched_yield - yield the current processor to other threads. 4898 * 4899 * This function yields the current CPU to other tasks. If there are no 4900 * other threads running on this CPU then this function will return. 4901 * 4902 * Return: 0. 4903 */ 4904 SYSCALL_DEFINE0(sched_yield) 4905 { 4906 struct rq *rq = this_rq_lock(); 4907 4908 schedstat_inc(rq->yld_count); 4909 current->sched_class->yield_task(rq); 4910 4911 /* 4912 * Since we are going to call schedule() anyway, there's 4913 * no need to preempt or enable interrupts: 4914 */ 4915 __release(rq->lock); 4916 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4917 do_raw_spin_unlock(&rq->lock); 4918 sched_preempt_enable_no_resched(); 4919 4920 schedule(); 4921 4922 return 0; 4923 } 4924 4925 #ifndef CONFIG_PREEMPT 4926 int __sched _cond_resched(void) 4927 { 4928 if (should_resched(0)) { 4929 preempt_schedule_common(); 4930 return 1; 4931 } 4932 return 0; 4933 } 4934 EXPORT_SYMBOL(_cond_resched); 4935 #endif 4936 4937 /* 4938 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * call schedule, and on return reacquire the lock. 4940 * 4941 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4942 * operations here to prevent schedule() from being called twice (once via 4943 * spin_unlock(), once by hand). 4944 */ 4945 int __cond_resched_lock(spinlock_t *lock) 4946 { 4947 int resched = should_resched(PREEMPT_LOCK_OFFSET); 4948 int ret = 0; 4949 4950 lockdep_assert_held(lock); 4951 4952 if (spin_needbreak(lock) || resched) { 4953 spin_unlock(lock); 4954 if (resched) 4955 preempt_schedule_common(); 4956 else 4957 cpu_relax(); 4958 ret = 1; 4959 spin_lock(lock); 4960 } 4961 return ret; 4962 } 4963 EXPORT_SYMBOL(__cond_resched_lock); 4964 4965 int __sched __cond_resched_softirq(void) 4966 { 4967 BUG_ON(!in_softirq()); 4968 4969 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { 4970 local_bh_enable(); 4971 preempt_schedule_common(); 4972 local_bh_disable(); 4973 return 1; 4974 } 4975 return 0; 4976 } 4977 EXPORT_SYMBOL(__cond_resched_softirq); 4978 4979 /** 4980 * yield - yield the current processor to other threads. 4981 * 4982 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4983 * 4984 * The scheduler is at all times free to pick the calling task as the most 4985 * eligible task to run, if removing the yield() call from your code breaks 4986 * it, its already broken. 4987 * 4988 * Typical broken usage is: 4989 * 4990 * while (!event) 4991 * yield(); 4992 * 4993 * where one assumes that yield() will let 'the other' process run that will 4994 * make event true. If the current task is a SCHED_FIFO task that will never 4995 * happen. Never use yield() as a progress guarantee!! 4996 * 4997 * If you want to use yield() to wait for something, use wait_event(). 
4998 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4999 * If you still want to use yield(), do not! 5000 */ 5001 void __sched yield(void) 5002 { 5003 set_current_state(TASK_RUNNING); 5004 sys_sched_yield(); 5005 } 5006 EXPORT_SYMBOL(yield); 5007 5008 /** 5009 * yield_to - yield the current processor to another thread in 5010 * your thread group, or accelerate that thread toward the 5011 * processor it's on. 5012 * @p: target task 5013 * @preempt: whether task preemption is allowed or not 5014 * 5015 * It's the caller's job to ensure that the target task struct 5016 * can't go away on us before we can do any checks. 5017 * 5018 * Return: 5019 * true (>0) if we indeed boosted the target task. 5020 * false (0) if we failed to boost the target. 5021 * -ESRCH if there's no task to yield to. 5022 */ 5023 int __sched yield_to(struct task_struct *p, bool preempt) 5024 { 5025 struct task_struct *curr = current; 5026 struct rq *rq, *p_rq; 5027 unsigned long flags; 5028 int yielded = 0; 5029 5030 local_irq_save(flags); 5031 rq = this_rq(); 5032 5033 again: 5034 p_rq = task_rq(p); 5035 /* 5036 * If we're the only runnable task on the rq and target rq also 5037 * has only one task, there's absolutely no point in yielding. 5038 */ 5039 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 5040 yielded = -ESRCH; 5041 goto out_irq; 5042 } 5043 5044 double_rq_lock(rq, p_rq); 5045 if (task_rq(p) != p_rq) { 5046 double_rq_unlock(rq, p_rq); 5047 goto again; 5048 } 5049 5050 if (!curr->sched_class->yield_to_task) 5051 goto out_unlock; 5052 5053 if (curr->sched_class != p->sched_class) 5054 goto out_unlock; 5055 5056 if (task_running(p_rq, p) || p->state) 5057 goto out_unlock; 5058 5059 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5060 if (yielded) { 5061 schedstat_inc(rq->yld_count); 5062 /* 5063 * Make p's CPU reschedule; pick_next_entity takes care of 5064 * fairness. 5065 */ 5066 if (preempt && rq != p_rq) 5067 resched_curr(p_rq); 5068 } 5069 5070 out_unlock: 5071 double_rq_unlock(rq, p_rq); 5072 out_irq: 5073 local_irq_restore(flags); 5074 5075 if (yielded > 0) 5076 schedule(); 5077 5078 return yielded; 5079 } 5080 EXPORT_SYMBOL_GPL(yield_to); 5081 5082 int io_schedule_prepare(void) 5083 { 5084 int old_iowait = current->in_iowait; 5085 5086 current->in_iowait = 1; 5087 blk_schedule_flush_plug(current); 5088 5089 return old_iowait; 5090 } 5091 5092 void io_schedule_finish(int token) 5093 { 5094 current->in_iowait = token; 5095 } 5096 5097 /* 5098 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5099 * that process accounting knows that this is a task in IO wait state. 5100 */ 5101 long __sched io_schedule_timeout(long timeout) 5102 { 5103 int token; 5104 long ret; 5105 5106 token = io_schedule_prepare(); 5107 ret = schedule_timeout(timeout); 5108 io_schedule_finish(token); 5109 5110 return ret; 5111 } 5112 EXPORT_SYMBOL(io_schedule_timeout); 5113 5114 void io_schedule(void) 5115 { 5116 int token; 5117 5118 token = io_schedule_prepare(); 5119 schedule(); 5120 io_schedule_finish(token); 5121 } 5122 EXPORT_SYMBOL(io_schedule); 5123 5124 /** 5125 * sys_sched_get_priority_max - return maximum RT priority. 5126 * @policy: scheduling class. 5127 * 5128 * Return: On success, this syscall returns the maximum 5129 * rt_priority that can be used by a given scheduling class. 5130 * On failure, a negative error code is returned. 
5131 */ 5132 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5133 { 5134 int ret = -EINVAL; 5135 5136 switch (policy) { 5137 case SCHED_FIFO: 5138 case SCHED_RR: 5139 ret = MAX_USER_RT_PRIO-1; 5140 break; 5141 case SCHED_DEADLINE: 5142 case SCHED_NORMAL: 5143 case SCHED_BATCH: 5144 case SCHED_IDLE: 5145 ret = 0; 5146 break; 5147 } 5148 return ret; 5149 } 5150 5151 /** 5152 * sys_sched_get_priority_min - return minimum RT priority. 5153 * @policy: scheduling class. 5154 * 5155 * Return: On success, this syscall returns the minimum 5156 * rt_priority that can be used by a given scheduling class. 5157 * On failure, a negative error code is returned. 5158 */ 5159 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5160 { 5161 int ret = -EINVAL; 5162 5163 switch (policy) { 5164 case SCHED_FIFO: 5165 case SCHED_RR: 5166 ret = 1; 5167 break; 5168 case SCHED_DEADLINE: 5169 case SCHED_NORMAL: 5170 case SCHED_BATCH: 5171 case SCHED_IDLE: 5172 ret = 0; 5173 } 5174 return ret; 5175 } 5176 5177 /** 5178 * sys_sched_rr_get_interval - return the default timeslice of a process. 5179 * @pid: pid of the process. 5180 * @interval: userspace pointer to the timeslice value. 5181 * 5182 * this syscall writes the default timeslice value of a given process 5183 * into the user-space timespec buffer. A value of '0' means infinity. 5184 * 5185 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 5186 * an error code. 5187 */ 5188 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5189 struct timespec __user *, interval) 5190 { 5191 struct task_struct *p; 5192 unsigned int time_slice; 5193 struct rq_flags rf; 5194 struct timespec t; 5195 struct rq *rq; 5196 int retval; 5197 5198 if (pid < 0) 5199 return -EINVAL; 5200 5201 retval = -ESRCH; 5202 rcu_read_lock(); 5203 p = find_process_by_pid(pid); 5204 if (!p) 5205 goto out_unlock; 5206 5207 retval = security_task_getscheduler(p); 5208 if (retval) 5209 goto out_unlock; 5210 5211 rq = task_rq_lock(p, &rf); 5212 time_slice = 0; 5213 if (p->sched_class->get_rr_interval) 5214 time_slice = p->sched_class->get_rr_interval(rq, p); 5215 task_rq_unlock(rq, p, &rf); 5216 5217 rcu_read_unlock(); 5218 jiffies_to_timespec(time_slice, &t); 5219 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5220 return retval; 5221 5222 out_unlock: 5223 rcu_read_unlock(); 5224 return retval; 5225 } 5226 5227 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5228 5229 void sched_show_task(struct task_struct *p) 5230 { 5231 unsigned long free = 0; 5232 int ppid; 5233 unsigned long state = p->state; 5234 5235 if (!try_get_task_stack(p)) 5236 return; 5237 if (state) 5238 state = __ffs(state) + 1; 5239 printk(KERN_INFO "%-15.15s %c", p->comm, 5240 state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); 5241 if (state == TASK_RUNNING) 5242 printk(KERN_CONT " running task "); 5243 #ifdef CONFIG_DEBUG_STACK_USAGE 5244 free = stack_not_used(p); 5245 #endif 5246 ppid = 0; 5247 rcu_read_lock(); 5248 if (pid_alive(p)) 5249 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 5250 rcu_read_unlock(); 5251 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5252 task_pid_nr(p), ppid, 5253 (unsigned long)task_thread_info(p)->flags); 5254 5255 print_worker_info(KERN_INFO, p); 5256 show_stack(p, NULL); 5257 put_task_stack(p); 5258 } 5259 5260 void show_state_filter(unsigned long state_filter) 5261 { 5262 struct task_struct *g, *p; 5263 5264 #if BITS_PER_LONG == 32 5265 printk(KERN_INFO 5266 " task PC stack pid father\n"); 5267 #else 5268 printk(KERN_INFO 5269 " task PC stack pid father\n"); 5270 #endif 5271 rcu_read_lock(); 5272 for_each_process_thread(g, p) { 5273 /* 5274 * reset the NMI-timeout, listing all files on a slow 5275 * console might take a lot of time: 5276 * Also, reset softlockup watchdogs on all CPUs, because 5277 * another CPU might be blocked waiting for us to process 5278 * an IPI. 5279 */ 5280 touch_nmi_watchdog(); 5281 touch_all_softlockup_watchdogs(); 5282 if (!state_filter || (p->state & state_filter)) 5283 sched_show_task(p); 5284 } 5285 5286 #ifdef CONFIG_SCHED_DEBUG 5287 if (!state_filter) 5288 sysrq_sched_debug_show(); 5289 #endif 5290 rcu_read_unlock(); 5291 /* 5292 * Only show locks if all tasks are dumped: 5293 */ 5294 if (!state_filter) 5295 debug_show_all_locks(); 5296 } 5297 5298 void init_idle_bootup_task(struct task_struct *idle) 5299 { 5300 idle->sched_class = &idle_sched_class; 5301 } 5302 5303 /** 5304 * init_idle - set up an idle thread for a given CPU 5305 * @idle: task in question 5306 * @cpu: CPU the idle task belongs to 5307 * 5308 * NOTE: this function does not set the idle thread's NEED_RESCHED 5309 * flag, to make booting more robust. 5310 */ 5311 void init_idle(struct task_struct *idle, int cpu) 5312 { 5313 struct rq *rq = cpu_rq(cpu); 5314 unsigned long flags; 5315 5316 raw_spin_lock_irqsave(&idle->pi_lock, flags); 5317 raw_spin_lock(&rq->lock); 5318 5319 __sched_fork(0, idle); 5320 idle->state = TASK_RUNNING; 5321 idle->se.exec_start = sched_clock(); 5322 idle->flags |= PF_IDLE; 5323 5324 kasan_unpoison_task_stack(idle); 5325 5326 #ifdef CONFIG_SMP 5327 /* 5328 * Its possible that init_idle() gets called multiple times on a task, 5329 * in that case do_set_cpus_allowed() will not do the right thing. 5330 * 5331 * And since this is boot we can forgo the serialization. 5332 */ 5333 set_cpus_allowed_common(idle, cpumask_of(cpu)); 5334 #endif 5335 /* 5336 * We're having a chicken and egg problem, even though we are 5337 * holding rq->lock, the CPU isn't yet set to this CPU so the 5338 * lockdep check in task_group() will fail. 5339 * 5340 * Similar case to sched_fork(). / Alternatively we could 5341 * use task_rq_lock() here and obtain the other rq->lock. 5342 * 5343 * Silence PROVE_RCU 5344 */ 5345 rcu_read_lock(); 5346 __set_task_cpu(idle, cpu); 5347 rcu_read_unlock(); 5348 5349 rq->curr = rq->idle = idle; 5350 idle->on_rq = TASK_ON_RQ_QUEUED; 5351 #ifdef CONFIG_SMP 5352 idle->on_cpu = 1; 5353 #endif 5354 raw_spin_unlock(&rq->lock); 5355 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); 5356 5357 /* Set the preempt count _outside_ the spinlocks! 
*/ 5358 init_idle_preempt_count(idle, cpu); 5359 5360 /* 5361 * The idle tasks have their own, simple scheduling class: 5362 */ 5363 idle->sched_class = &idle_sched_class; 5364 ftrace_graph_init_idle_task(idle, cpu); 5365 vtime_init_idle(idle, cpu); 5366 #ifdef CONFIG_SMP 5367 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 5368 #endif 5369 } 5370 5371 int cpuset_cpumask_can_shrink(const struct cpumask *cur, 5372 const struct cpumask *trial) 5373 { 5374 int ret = 1, trial_cpus; 5375 struct dl_bw *cur_dl_b; 5376 unsigned long flags; 5377 5378 if (!cpumask_weight(cur)) 5379 return ret; 5380 5381 rcu_read_lock_sched(); 5382 cur_dl_b = dl_bw_of(cpumask_any(cur)); 5383 trial_cpus = cpumask_weight(trial); 5384 5385 raw_spin_lock_irqsave(&cur_dl_b->lock, flags); 5386 if (cur_dl_b->bw != -1 && 5387 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) 5388 ret = 0; 5389 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 5390 rcu_read_unlock_sched(); 5391 5392 return ret; 5393 } 5394 5395 int task_can_attach(struct task_struct *p, 5396 const struct cpumask *cs_cpus_allowed) 5397 { 5398 int ret = 0; 5399 5400 /* 5401 * Kthreads which disallow setaffinity shouldn't be moved 5402 * to a new cpuset; we don't want to change their CPU 5403 * affinity and isolating such threads by their set of 5404 * allowed nodes is unnecessary. Thus, cpusets are not 5405 * applicable for such threads. This prevents checking for 5406 * success of set_cpus_allowed_ptr() on all attached tasks 5407 * before cpus_allowed may be changed. 5408 */ 5409 if (p->flags & PF_NO_SETAFFINITY) { 5410 ret = -EINVAL; 5411 goto out; 5412 } 5413 5414 #ifdef CONFIG_SMP 5415 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 5416 cs_cpus_allowed)) { 5417 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 5418 cs_cpus_allowed); 5419 struct dl_bw *dl_b; 5420 bool overflow; 5421 int cpus; 5422 unsigned long flags; 5423 5424 rcu_read_lock_sched(); 5425 dl_b = dl_bw_of(dest_cpu); 5426 raw_spin_lock_irqsave(&dl_b->lock, flags); 5427 cpus = dl_bw_cpus(dest_cpu); 5428 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 5429 if (overflow) 5430 ret = -EBUSY; 5431 else { 5432 /* 5433 * We reserve space for this task in the destination 5434 * root_domain, as we can't fail after this point. 5435 * We will free resources in the source root_domain 5436 * later on (see set_cpus_allowed_dl()). 
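* * Illustrative numbers only: a task with dl_runtime = 10ms and dl_period = 100ms carries roughly 10% of one CPU's bandwidth (dl_bw = to_ratio(period, runtime)); that 10% is what gets reserved here, and the __dl_overflow() check above has already refused the move with -EBUSY if the destination root_domain could not absorb it.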
5437 */ 5438 __dl_add(dl_b, p->dl.dl_bw); 5439 } 5440 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5441 rcu_read_unlock_sched(); 5442 5443 } 5444 #endif 5445 out: 5446 return ret; 5447 } 5448 5449 #ifdef CONFIG_SMP 5450 5451 bool sched_smp_initialized __read_mostly; 5452 5453 #ifdef CONFIG_NUMA_BALANCING 5454 /* Migrate current task p to target_cpu */ 5455 int migrate_task_to(struct task_struct *p, int target_cpu) 5456 { 5457 struct migration_arg arg = { p, target_cpu }; 5458 int curr_cpu = task_cpu(p); 5459 5460 if (curr_cpu == target_cpu) 5461 return 0; 5462 5463 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 5464 return -EINVAL; 5465 5466 /* TODO: This is not properly updating schedstats */ 5467 5468 trace_sched_move_numa(p, curr_cpu, target_cpu); 5469 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 5470 } 5471 5472 /* 5473 * Requeue a task on a given node and accurately track the number of NUMA 5474 * tasks on the runqueues 5475 */ 5476 void sched_setnuma(struct task_struct *p, int nid) 5477 { 5478 bool queued, running; 5479 struct rq_flags rf; 5480 struct rq *rq; 5481 5482 rq = task_rq_lock(p, &rf); 5483 queued = task_on_rq_queued(p); 5484 running = task_current(rq, p); 5485 5486 if (queued) 5487 dequeue_task(rq, p, DEQUEUE_SAVE); 5488 if (running) 5489 put_prev_task(rq, p); 5490 5491 p->numa_preferred_nid = nid; 5492 5493 if (queued) 5494 enqueue_task(rq, p, ENQUEUE_RESTORE); 5495 if (running) 5496 set_curr_task(rq, p); 5497 task_rq_unlock(rq, p, &rf); 5498 } 5499 #endif /* CONFIG_NUMA_BALANCING */ 5500 5501 #ifdef CONFIG_HOTPLUG_CPU 5502 /* 5503 * Ensure that the idle task is using init_mm right before its CPU goes 5504 * offline. 5505 */ 5506 void idle_task_exit(void) 5507 { 5508 struct mm_struct *mm = current->active_mm; 5509 5510 BUG_ON(cpu_online(smp_processor_id())); 5511 5512 if (mm != &init_mm) { 5513 switch_mm_irqs_off(mm, &init_mm, current); 5514 finish_arch_post_lock_switch(); 5515 } 5516 mmdrop(mm); 5517 } 5518 5519 /* 5520 * Since this CPU is going 'away' for a while, fold any nr_active delta 5521 * we might have. Assumes we're called after migrate_tasks() so that the 5522 * nr_active count is stable. We need to take the teardown thread which 5523 * is calling this into account, so we hand in adjust = 1 to the load 5524 * calculation. 5525 * 5526 * Also see the comment "Global load-average calculations". 5527 */ 5528 static void calc_load_migrate(struct rq *rq) 5529 { 5530 long delta = calc_load_fold_active(rq, 1); 5531 if (delta) 5532 atomic_long_add(delta, &calc_load_tasks); 5533 } 5534 5535 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 5536 { 5537 } 5538 5539 static const struct sched_class fake_sched_class = { 5540 .put_prev_task = put_prev_task_fake, 5541 }; 5542 5543 static struct task_struct fake_task = { 5544 /* 5545 * Avoid pull_{rt,dl}_task() 5546 */ 5547 .prio = MAX_PRIO + 1, 5548 .sched_class = &fake_sched_class, 5549 }; 5550 5551 /* 5552 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5553 * try_to_wake_up()->select_task_rq(). 5554 * 5555 * Called with rq->lock held even though we'er in stop_machine() and 5556 * there's no concurrency possible, we hold the required locks anyway 5557 * because of lock validation efforts. 
5558 */ 5559 static void migrate_tasks(struct rq *dead_rq) 5560 { 5561 struct rq *rq = dead_rq; 5562 struct task_struct *next, *stop = rq->stop; 5563 struct rq_flags rf, old_rf; 5564 int dest_cpu; 5565 5566 /* 5567 * Fudge the rq selection such that the below task selection loop 5568 * doesn't get stuck on the currently eligible stop task. 5569 * 5570 * We're currently inside stop_machine() and the rq is either stuck 5571 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5572 * either way we should never end up calling schedule() until we're 5573 * done here. 5574 */ 5575 rq->stop = NULL; 5576 5577 /* 5578 * The put_prev_task() and pick_next_task() sched 5579 * class methods both need an up-to-date 5580 * value of rq->clock[_task]. 5581 */ 5582 update_rq_clock(rq); 5583 5584 for (;;) { 5585 /* 5586 * There's this thread running, bail when that's the only 5587 * remaining thread: 5588 */ 5589 if (rq->nr_running == 1) 5590 break; 5591 5592 /* 5593 * pick_next_task() assumes pinned rq->lock: 5594 */ 5595 rq_pin_lock(rq, &rf); 5596 next = pick_next_task(rq, &fake_task, &rf); 5597 BUG_ON(!next); 5598 next->sched_class->put_prev_task(rq, next); 5599 5600 /* 5601 * The rule for changing task_struct::cpus_allowed is to hold 5602 * both pi_lock and rq->lock, such that holding either 5603 * stabilizes the mask. 5604 * 5605 * Dropping rq->lock is not quite as disastrous as it usually is 5606 * because !cpu_active at this point means load-balancing 5607 * will not interfere. Also, we are inside stop-machine. 5608 */ 5609 rq_unpin_lock(rq, &rf); 5610 raw_spin_unlock(&rq->lock); 5611 raw_spin_lock(&next->pi_lock); 5612 raw_spin_lock(&rq->lock); 5613 5614 /* 5615 * Since we're inside stop-machine, _nothing_ should have 5616 * changed the task; WARN if weird stuff happened, because in 5617 * that case the above rq->lock drop is a fail too. 5618 */ 5619 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { 5620 raw_spin_unlock(&next->pi_lock); 5621 continue; 5622 } 5623 5624 /* 5625 * __migrate_task() may return with a different 5626 * rq->lock held and a new cookie in 'rf', but we need 5627 * to preserve rf::clock_update_flags for 'dead_rq'. 5628 */ 5629 old_rf = rf; 5630 5631 /* Find suitable destination for @next, with force if needed.
*/ 5632 dest_cpu = select_fallback_rq(dead_rq->cpu, next); 5633 5634 rq = __migrate_task(rq, next, dest_cpu); 5635 if (rq != dead_rq) { 5636 raw_spin_unlock(&rq->lock); 5637 rq = dead_rq; 5638 raw_spin_lock(&rq->lock); 5639 rf = old_rf; 5640 } 5641 raw_spin_unlock(&next->pi_lock); 5642 } 5643 5644 rq->stop = stop; 5645 } 5646 #endif /* CONFIG_HOTPLUG_CPU */ 5647 5648 void set_rq_online(struct rq *rq) 5649 { 5650 if (!rq->online) { 5651 const struct sched_class *class; 5652 5653 cpumask_set_cpu(rq->cpu, rq->rd->online); 5654 rq->online = 1; 5655 5656 for_each_class(class) { 5657 if (class->rq_online) 5658 class->rq_online(rq); 5659 } 5660 } 5661 } 5662 5663 void set_rq_offline(struct rq *rq) 5664 { 5665 if (rq->online) { 5666 const struct sched_class *class; 5667 5668 for_each_class(class) { 5669 if (class->rq_offline) 5670 class->rq_offline(rq); 5671 } 5672 5673 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5674 rq->online = 0; 5675 } 5676 } 5677 5678 static void set_cpu_rq_start_time(unsigned int cpu) 5679 { 5680 struct rq *rq = cpu_rq(cpu); 5681 5682 rq->age_stamp = sched_clock_cpu(cpu); 5683 } 5684 5685 /* 5686 * used to mark begin/end of suspend/resume: 5687 */ 5688 static int num_cpus_frozen; 5689 5690 /* 5691 * Update cpusets according to cpu_active mask. If cpusets are 5692 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 5693 * around partition_sched_domains(). 5694 * 5695 * If we come here as part of a suspend/resume, don't touch cpusets because we 5696 * want to restore it back to its original state upon resume anyway. 5697 */ 5698 static void cpuset_cpu_active(void) 5699 { 5700 if (cpuhp_tasks_frozen) { 5701 /* 5702 * num_cpus_frozen tracks how many CPUs are involved in suspend 5703 * resume sequence. As long as this is not the last online 5704 * operation in the resume sequence, just build a single sched 5705 * domain, ignoring cpusets. 5706 */ 5707 num_cpus_frozen--; 5708 if (likely(num_cpus_frozen)) { 5709 partition_sched_domains(1, NULL, NULL); 5710 return; 5711 } 5712 /* 5713 * This is the last CPU online operation. So fall through and 5714 * restore the original sched domains by considering the 5715 * cpuset configurations. 5716 */ 5717 } 5718 cpuset_update_active_cpus(true); 5719 } 5720 5721 static int cpuset_cpu_inactive(unsigned int cpu) 5722 { 5723 unsigned long flags; 5724 struct dl_bw *dl_b; 5725 bool overflow; 5726 int cpus; 5727 5728 if (!cpuhp_tasks_frozen) { 5729 rcu_read_lock_sched(); 5730 dl_b = dl_bw_of(cpu); 5731 5732 raw_spin_lock_irqsave(&dl_b->lock, flags); 5733 cpus = dl_bw_cpus(cpu); 5734 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5735 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5736 5737 rcu_read_unlock_sched(); 5738 5739 if (overflow) 5740 return -EBUSY; 5741 cpuset_update_active_cpus(false); 5742 } else { 5743 num_cpus_frozen++; 5744 partition_sched_domains(1, NULL, NULL); 5745 } 5746 return 0; 5747 } 5748 5749 int sched_cpu_activate(unsigned int cpu) 5750 { 5751 struct rq *rq = cpu_rq(cpu); 5752 unsigned long flags; 5753 5754 set_cpu_active(cpu, true); 5755 5756 if (sched_smp_initialized) { 5757 sched_domains_numa_masks_set(cpu); 5758 cpuset_cpu_active(); 5759 } 5760 5761 /* 5762 * Put the rq online, if not already. This happens: 5763 * 5764 * 1) In the early boot process, because we build the real domains 5765 * after all CPUs have been brought up. 5766 * 5767 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the 5768 * domains. 
5769 */ 5770 raw_spin_lock_irqsave(&rq->lock, flags); 5771 if (rq->rd) { 5772 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5773 set_rq_online(rq); 5774 } 5775 raw_spin_unlock_irqrestore(&rq->lock, flags); 5776 5777 update_max_interval(); 5778 5779 return 0; 5780 } 5781 5782 int sched_cpu_deactivate(unsigned int cpu) 5783 { 5784 int ret; 5785 5786 set_cpu_active(cpu, false); 5787 /* 5788 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU 5789 * users of this state to go away such that all new such users will 5790 * observe it. 5791 * 5792 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might 5793 * not imply sync_sched(), so wait for both. 5794 * 5795 * Do sync before park smpboot threads to take care the rcu boost case. 5796 */ 5797 if (IS_ENABLED(CONFIG_PREEMPT)) 5798 synchronize_rcu_mult(call_rcu, call_rcu_sched); 5799 else 5800 synchronize_rcu(); 5801 5802 if (!sched_smp_initialized) 5803 return 0; 5804 5805 ret = cpuset_cpu_inactive(cpu); 5806 if (ret) { 5807 set_cpu_active(cpu, true); 5808 return ret; 5809 } 5810 sched_domains_numa_masks_clear(cpu); 5811 return 0; 5812 } 5813 5814 static void sched_rq_cpu_starting(unsigned int cpu) 5815 { 5816 struct rq *rq = cpu_rq(cpu); 5817 5818 rq->calc_load_update = calc_load_update; 5819 update_max_interval(); 5820 } 5821 5822 int sched_cpu_starting(unsigned int cpu) 5823 { 5824 set_cpu_rq_start_time(cpu); 5825 sched_rq_cpu_starting(cpu); 5826 return 0; 5827 } 5828 5829 #ifdef CONFIG_HOTPLUG_CPU 5830 int sched_cpu_dying(unsigned int cpu) 5831 { 5832 struct rq *rq = cpu_rq(cpu); 5833 unsigned long flags; 5834 5835 /* Handle pending wakeups and then migrate everything off */ 5836 sched_ttwu_pending(); 5837 raw_spin_lock_irqsave(&rq->lock, flags); 5838 if (rq->rd) { 5839 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5840 set_rq_offline(rq); 5841 } 5842 migrate_tasks(rq); 5843 BUG_ON(rq->nr_running != 1); 5844 raw_spin_unlock_irqrestore(&rq->lock, flags); 5845 calc_load_migrate(rq); 5846 update_max_interval(); 5847 nohz_balance_exit_idle(cpu); 5848 hrtick_clear(rq); 5849 return 0; 5850 } 5851 #endif 5852 5853 #ifdef CONFIG_SCHED_SMT 5854 DEFINE_STATIC_KEY_FALSE(sched_smt_present); 5855 5856 static void sched_init_smt(void) 5857 { 5858 /* 5859 * We've enumerated all CPUs and will assume that if any CPU 5860 * has SMT siblings, CPU0 will too. 5861 */ 5862 if (cpumask_weight(cpu_smt_mask(0)) > 1) 5863 static_branch_enable(&sched_smt_present); 5864 } 5865 #else 5866 static inline void sched_init_smt(void) { } 5867 #endif 5868 5869 void __init sched_init_smp(void) 5870 { 5871 cpumask_var_t non_isolated_cpus; 5872 5873 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 5874 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 5875 5876 sched_init_numa(); 5877 5878 /* 5879 * There's no userspace yet to cause hotplug operations; hence all the 5880 * CPU masks are stable and all blatant races in the below code cannot 5881 * happen. 
5882 */ 5883 mutex_lock(&sched_domains_mutex); 5884 init_sched_domains(cpu_active_mask); 5885 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 5886 if (cpumask_empty(non_isolated_cpus)) 5887 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 5888 mutex_unlock(&sched_domains_mutex); 5889 5890 /* Move init over to a non-isolated CPU */ 5891 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 5892 BUG(); 5893 sched_init_granularity(); 5894 free_cpumask_var(non_isolated_cpus); 5895 5896 init_sched_rt_class(); 5897 init_sched_dl_class(); 5898 5899 sched_init_smt(); 5900 sched_clock_init_late(); 5901 5902 sched_smp_initialized = true; 5903 } 5904 5905 static int __init migration_init(void) 5906 { 5907 sched_rq_cpu_starting(smp_processor_id()); 5908 return 0; 5909 } 5910 early_initcall(migration_init); 5911 5912 #else 5913 void __init sched_init_smp(void) 5914 { 5915 sched_init_granularity(); 5916 sched_clock_init_late(); 5917 } 5918 #endif /* CONFIG_SMP */ 5919 5920 int in_sched_functions(unsigned long addr) 5921 { 5922 return in_lock_functions(addr) || 5923 (addr >= (unsigned long)__sched_text_start 5924 && addr < (unsigned long)__sched_text_end); 5925 } 5926 5927 #ifdef CONFIG_CGROUP_SCHED 5928 /* 5929 * Default task group. 5930 * Every task in system belongs to this group at bootup. 5931 */ 5932 struct task_group root_task_group; 5933 LIST_HEAD(task_groups); 5934 5935 /* Cacheline aligned slab cache for task_group */ 5936 static struct kmem_cache *task_group_cache __read_mostly; 5937 #endif 5938 5939 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 5940 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); 5941 5942 #define WAIT_TABLE_BITS 8 5943 #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) 5944 static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; 5945 5946 wait_queue_head_t *bit_waitqueue(void *word, int bit) 5947 { 5948 const int shift = BITS_PER_LONG == 32 ? 
5 : 6; 5949 unsigned long val = (unsigned long)word << shift | bit; 5950 5951 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); 5952 } 5953 EXPORT_SYMBOL(bit_waitqueue); 5954 5955 void __init sched_init(void) 5956 { 5957 int i, j; 5958 unsigned long alloc_size = 0, ptr; 5959 5960 sched_clock_init(); 5961 5962 for (i = 0; i < WAIT_TABLE_SIZE; i++) 5963 init_waitqueue_head(bit_wait_table + i); 5964 5965 #ifdef CONFIG_FAIR_GROUP_SCHED 5966 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 5967 #endif 5968 #ifdef CONFIG_RT_GROUP_SCHED 5969 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 5970 #endif 5971 if (alloc_size) { 5972 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 5973 5974 #ifdef CONFIG_FAIR_GROUP_SCHED 5975 root_task_group.se = (struct sched_entity **)ptr; 5976 ptr += nr_cpu_ids * sizeof(void **); 5977 5978 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 5979 ptr += nr_cpu_ids * sizeof(void **); 5980 5981 #endif /* CONFIG_FAIR_GROUP_SCHED */ 5982 #ifdef CONFIG_RT_GROUP_SCHED 5983 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 5984 ptr += nr_cpu_ids * sizeof(void **); 5985 5986 root_task_group.rt_rq = (struct rt_rq **)ptr; 5987 ptr += nr_cpu_ids * sizeof(void **); 5988 5989 #endif /* CONFIG_RT_GROUP_SCHED */ 5990 } 5991 #ifdef CONFIG_CPUMASK_OFFSTACK 5992 for_each_possible_cpu(i) { 5993 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 5994 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 5995 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node( 5996 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 5997 } 5998 #endif /* CONFIG_CPUMASK_OFFSTACK */ 5999 6000 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); 6001 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); 6002 6003 #ifdef CONFIG_SMP 6004 init_defrootdomain(); 6005 #endif 6006 6007 #ifdef CONFIG_RT_GROUP_SCHED 6008 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6009 global_rt_period(), global_rt_runtime()); 6010 #endif /* CONFIG_RT_GROUP_SCHED */ 6011 6012 #ifdef CONFIG_CGROUP_SCHED 6013 task_group_cache = KMEM_CACHE(task_group, 0); 6014 6015 list_add(&root_task_group.list, &task_groups); 6016 INIT_LIST_HEAD(&root_task_group.children); 6017 INIT_LIST_HEAD(&root_task_group.siblings); 6018 autogroup_init(&init_task); 6019 #endif /* CONFIG_CGROUP_SCHED */ 6020 6021 for_each_possible_cpu(i) { 6022 struct rq *rq; 6023 6024 rq = cpu_rq(i); 6025 raw_spin_lock_init(&rq->lock); 6026 rq->nr_running = 0; 6027 rq->calc_load_active = 0; 6028 rq->calc_load_update = jiffies + LOAD_FREQ; 6029 init_cfs_rq(&rq->cfs); 6030 init_rt_rq(&rq->rt); 6031 init_dl_rq(&rq->dl); 6032 #ifdef CONFIG_FAIR_GROUP_SCHED 6033 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6034 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6035 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; 6036 /* 6037 * How much CPU bandwidth does root_task_group get? 6038 * 6039 * In case of task-groups formed thr' the cgroup filesystem, it 6040 * gets 100% of the CPU resources in the system. This overall 6041 * system CPU resource is divided among the tasks of 6042 * root_task_group and its child task-groups in a fair manner, 6043 * based on each entity's (task or task-group's) weight 6044 * (se->load.weight). 
6045 * 6046 * In other words, if root_task_group has 10 tasks (of weight 6047 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6048 * then A0's share of the CPU resource is: 6049 * 6050 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6051 * 6052 * We achieve this by letting root_task_group's tasks sit 6053 * directly in rq->cfs (i.e. root_task_group->se[] = NULL). 6054 */ 6055 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6056 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6057 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6058 6059 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6060 #ifdef CONFIG_RT_GROUP_SCHED 6061 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6062 #endif 6063 6064 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6065 rq->cpu_load[j] = 0; 6066 6067 #ifdef CONFIG_SMP 6068 rq->sd = NULL; 6069 rq->rd = NULL; 6070 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 6071 rq->balance_callback = NULL; 6072 rq->active_balance = 0; 6073 rq->next_balance = jiffies; 6074 rq->push_cpu = 0; 6075 rq->cpu = i; 6076 rq->online = 0; 6077 rq->idle_stamp = 0; 6078 rq->avg_idle = 2*sysctl_sched_migration_cost; 6079 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6080 6081 INIT_LIST_HEAD(&rq->cfs_tasks); 6082 6083 rq_attach_root(rq, &def_root_domain); 6084 #ifdef CONFIG_NO_HZ_COMMON 6085 rq->last_load_update_tick = jiffies; 6086 rq->nohz_flags = 0; 6087 #endif 6088 #ifdef CONFIG_NO_HZ_FULL 6089 rq->last_sched_tick = 0; 6090 #endif 6091 #endif /* CONFIG_SMP */ 6092 init_rq_hrtick(rq); 6093 atomic_set(&rq->nr_iowait, 0); 6094 } 6095 6096 set_load_weight(&init_task); 6097 6098 /* 6099 * The boot idle thread does lazy MMU switching as well: 6100 */ 6101 atomic_inc(&init_mm.mm_count); 6102 enter_lazy_tlb(&init_mm, current); 6103 6104 /* 6105 * Make us the idle thread. Technically, schedule() should not be 6106 * called from this thread, however somewhere below it might be, 6107 * but because we are the idle thread, we just pick up running again 6108 * when this runqueue becomes "idle". 6109 */ 6110 init_idle(current, smp_processor_id()); 6111 6112 calc_load_update = jiffies + LOAD_FREQ; 6113 6114 #ifdef CONFIG_SMP 6115 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6116 /* May be allocated at isolcpus cmdline parse time */ 6117 if (cpu_isolated_map == NULL) 6118 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6119 idle_thread_set_boot_cpu(); 6120 set_cpu_rq_start_time(smp_processor_id()); 6121 #endif 6122 init_sched_fair_class(); 6123 6124 init_schedstats(); 6125 6126 scheduler_running = 1; 6127 } 6128 6129 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6130 static inline int preempt_count_equals(int preempt_offset) 6131 { 6132 int nested = preempt_count() + rcu_preempt_depth(); 6133 6134 return (nested == preempt_offset); 6135 } 6136 6137 void __might_sleep(const char *file, int line, int preempt_offset) 6138 { 6139 /* 6140 * Blocking primitives will set (and therefore destroy) current->state; 6141 * since we will exit with TASK_RUNNING, make sure we enter with it, 6142 * otherwise we will destroy state.
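* * A classic way to trip this warning (illustrative): calling a function that may sleep, e.g. mutex_lock(), after set_current_state(TASK_UNINTERRUPTIBLE) but before schedule(); the sleep inside mutex_lock() puts us back to TASK_RUNNING, so the caller's own wait condition would be silently lost.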
6143 */ 6144 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 6145 "do not call blocking ops when !TASK_RUNNING; " 6146 "state=%lx set at [<%p>] %pS\n", 6147 current->state, 6148 (void *)current->task_state_change, 6149 (void *)current->task_state_change); 6150 6151 ___might_sleep(file, line, preempt_offset); 6152 } 6153 EXPORT_SYMBOL(__might_sleep); 6154 6155 void ___might_sleep(const char *file, int line, int preempt_offset) 6156 { 6157 /* Ratelimiting timestamp: */ 6158 static unsigned long prev_jiffy; 6159 6160 unsigned long preempt_disable_ip; 6161 6162 /* WARN_ON_ONCE() by default, no rate limit required: */ 6163 rcu_sleep_check(); 6164 6165 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6166 !is_idle_task(current)) || 6167 system_state != SYSTEM_RUNNING || oops_in_progress) 6168 return; 6169 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6170 return; 6171 prev_jiffy = jiffies; 6172 6173 /* Save this before calling printk(), since that will clobber it: */ 6174 preempt_disable_ip = get_preempt_disable_ip(current); 6175 6176 printk(KERN_ERR 6177 "BUG: sleeping function called from invalid context at %s:%d\n", 6178 file, line); 6179 printk(KERN_ERR 6180 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6181 in_atomic(), irqs_disabled(), 6182 current->pid, current->comm); 6183 6184 if (task_stack_end_corrupted(current)) 6185 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 6186 6187 debug_show_held_locks(current); 6188 if (irqs_disabled()) 6189 print_irqtrace_events(current); 6190 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) 6191 && !preempt_count_equals(preempt_offset)) { 6192 pr_err("Preemption disabled at:"); 6193 print_ip_sym(preempt_disable_ip); 6194 pr_cont("\n"); 6195 } 6196 dump_stack(); 6197 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 6198 } 6199 EXPORT_SYMBOL(___might_sleep); 6200 #endif 6201 6202 #ifdef CONFIG_MAGIC_SYSRQ 6203 void normalize_rt_tasks(void) 6204 { 6205 struct task_struct *g, *p; 6206 struct sched_attr attr = { 6207 .sched_policy = SCHED_NORMAL, 6208 }; 6209 6210 read_lock(&tasklist_lock); 6211 for_each_process_thread(g, p) { 6212 /* 6213 * Only normalize user tasks: 6214 */ 6215 if (p->flags & PF_KTHREAD) 6216 continue; 6217 6218 p->se.exec_start = 0; 6219 schedstat_set(p->se.statistics.wait_start, 0); 6220 schedstat_set(p->se.statistics.sleep_start, 0); 6221 schedstat_set(p->se.statistics.block_start, 0); 6222 6223 if (!dl_task(p) && !rt_task(p)) { 6224 /* 6225 * Renice negative nice level userspace 6226 * tasks back to 0: 6227 */ 6228 if (task_nice(p) < 0) 6229 set_user_nice(p, 0); 6230 continue; 6231 } 6232 6233 __sched_setscheduler(p, &attr, false, false); 6234 } 6235 read_unlock(&tasklist_lock); 6236 } 6237 6238 #endif /* CONFIG_MAGIC_SYSRQ */ 6239 6240 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 6241 /* 6242 * These functions are only useful for the IA64 MCA handling, or kdb. 6243 * 6244 * They can only be called when the whole system has been 6245 * stopped - every CPU needs to be quiescent, and no scheduling 6246 * activity can take place. Using them for anything else would 6247 * be a serious bug, and as a result, they aren't even visible 6248 * under any other configuration. 6249 */ 6250 6251 /** 6252 * curr_task - return the current task for a given CPU. 6253 * @cpu: the processor in question. 6254 * 6255 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6256 * 6257 * Return: The current task for @cpu. 
6258 */ 6259 struct task_struct *curr_task(int cpu) 6260 { 6261 return cpu_curr(cpu); 6262 } 6263 6264 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 6265 6266 #ifdef CONFIG_IA64 6267 /** 6268 * set_curr_task - set the current task for a given CPU. 6269 * @cpu: the processor in question. 6270 * @p: the task pointer to set. 6271 * 6272 * Description: This function must only be used when non-maskable interrupts 6273 * are serviced on a separate stack. It allows the architecture to switch the 6274 * notion of the current task on a CPU in a non-blocking manner. This function 6275 * must be called with all CPUs synchronized, and interrupts disabled; the 6276 * caller must save the original value of the current task (see 6277 * curr_task() above) and restore that value before reenabling interrupts and 6278 * re-starting the system. 6279 * 6280 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6281 */ 6282 void ia64_set_curr_task(int cpu, struct task_struct *p) 6283 { 6284 cpu_curr(cpu) = p; 6285 } 6286 6287 #endif 6288 6289 #ifdef CONFIG_CGROUP_SCHED 6290 /* task_group_lock serializes the addition/removal of task groups */ 6291 static DEFINE_SPINLOCK(task_group_lock); 6292 6293 static void sched_free_group(struct task_group *tg) 6294 { 6295 free_fair_sched_group(tg); 6296 free_rt_sched_group(tg); 6297 autogroup_free(tg); 6298 kmem_cache_free(task_group_cache, tg); 6299 } 6300 6301 /* Allocate runqueue etc. for a new task group */ 6302 struct task_group *sched_create_group(struct task_group *parent) 6303 { 6304 struct task_group *tg; 6305 6306 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); 6307 if (!tg) 6308 return ERR_PTR(-ENOMEM); 6309 6310 if (!alloc_fair_sched_group(tg, parent)) 6311 goto err; 6312 6313 if (!alloc_rt_sched_group(tg, parent)) 6314 goto err; 6315 6316 return tg; 6317 6318 err: 6319 sched_free_group(tg); 6320 return ERR_PTR(-ENOMEM); 6321 } 6322 6323 void sched_online_group(struct task_group *tg, struct task_group *parent) 6324 { 6325 unsigned long flags; 6326 6327 spin_lock_irqsave(&task_group_lock, flags); 6328 list_add_rcu(&tg->list, &task_groups); 6329 6330 /* Root should already exist: */ 6331 WARN_ON(!parent); 6332 6333 tg->parent = parent; 6334 INIT_LIST_HEAD(&tg->children); 6335 list_add_rcu(&tg->siblings, &parent->children); 6336 spin_unlock_irqrestore(&task_group_lock, flags); 6337 6338 online_fair_sched_group(tg); 6339 } 6340 6341 /* RCU callback to free various structures associated with a task group */ 6342 static void sched_free_group_rcu(struct rcu_head *rhp) 6343 { 6344 /* Now it should be safe to free those cfs_rqs: */ 6345 sched_free_group(container_of(rhp, struct task_group, rcu)); 6346 } 6347 6348 void sched_destroy_group(struct task_group *tg) 6349 { 6350 /* Wait for possible concurrent references to cfs_rqs to complete: */ 6351 call_rcu(&tg->rcu, sched_free_group_rcu); 6352 } 6353 6354 void sched_offline_group(struct task_group *tg) 6355 { 6356 unsigned long flags; 6357 6358 /* End participation in shares distribution: */ 6359 unregister_fair_sched_group(tg); 6360 6361 spin_lock_irqsave(&task_group_lock, flags); 6362 list_del_rcu(&tg->list); 6363 list_del_rcu(&tg->siblings); 6364 spin_unlock_irqrestore(&task_group_lock, flags); 6365 } 6366 6367 static void sched_change_group(struct task_struct *tsk, int type) 6368 { 6369 struct task_group *tg; 6370 6371 /* 6372 * All callers are synchronized by task_rq_lock(); we do not use RCU 6373 * which is pointless here.
Thus, we pass "true" to task_css_check() 6374 * to prevent lockdep warnings. 6375 */ 6376 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 6377 struct task_group, css); 6378 tg = autogroup_task_group(tsk, tg); 6379 tsk->sched_task_group = tg; 6380 6381 #ifdef CONFIG_FAIR_GROUP_SCHED 6382 if (tsk->sched_class->task_change_group) 6383 tsk->sched_class->task_change_group(tsk, type); 6384 else 6385 #endif 6386 set_task_rq(tsk, task_cpu(tsk)); 6387 } 6388 6389 /* 6390 * Change task's runqueue when it moves between groups. 6391 * 6392 * The caller of this function should have put the task in its new group by 6393 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect 6394 * its new group. 6395 */ 6396 void sched_move_task(struct task_struct *tsk) 6397 { 6398 int queued, running; 6399 struct rq_flags rf; 6400 struct rq *rq; 6401 6402 rq = task_rq_lock(tsk, &rf); 6403 update_rq_clock(rq); 6404 6405 running = task_current(rq, tsk); 6406 queued = task_on_rq_queued(tsk); 6407 6408 if (queued) 6409 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); 6410 if (running) 6411 put_prev_task(rq, tsk); 6412 6413 sched_change_group(tsk, TASK_MOVE_GROUP); 6414 6415 if (queued) 6416 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 6417 if (running) 6418 set_curr_task(rq, tsk); 6419 6420 task_rq_unlock(rq, tsk, &rf); 6421 } 6422 #endif /* CONFIG_CGROUP_SCHED */ 6423 6424 #ifdef CONFIG_RT_GROUP_SCHED 6425 /* 6426 * Ensure that the real time constraints are schedulable. 6427 */ 6428 static DEFINE_MUTEX(rt_constraints_mutex); 6429 6430 /* Must be called with tasklist_lock held */ 6431 static inline int tg_has_rt_tasks(struct task_group *tg) 6432 { 6433 struct task_struct *g, *p; 6434 6435 /* 6436 * Autogroups do not have RT tasks; see autogroup_create(). 6437 */ 6438 if (task_group_is_autogroup(tg)) 6439 return 0; 6440 6441 for_each_process_thread(g, p) { 6442 if (rt_task(p) && task_group(p) == tg) 6443 return 1; 6444 } 6445 6446 return 0; 6447 } 6448 6449 struct rt_schedulable_data { 6450 struct task_group *tg; 6451 u64 rt_period; 6452 u64 rt_runtime; 6453 }; 6454 6455 static int tg_rt_schedulable(struct task_group *tg, void *data) 6456 { 6457 struct rt_schedulable_data *d = data; 6458 struct task_group *child; 6459 unsigned long total, sum = 0; 6460 u64 period, runtime; 6461 6462 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 6463 runtime = tg->rt_bandwidth.rt_runtime; 6464 6465 if (tg == d->tg) { 6466 period = d->rt_period; 6467 runtime = d->rt_runtime; 6468 } 6469 6470 /* 6471 * Cannot have more runtime than the period. 6472 */ 6473 if (runtime > period && runtime != RUNTIME_INF) 6474 return -EINVAL; 6475 6476 /* 6477 * Ensure we don't starve existing RT tasks. 6478 */ 6479 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 6480 return -EBUSY; 6481 6482 total = to_ratio(period, runtime); 6483 6484 /* 6485 * Nobody can have more than the global setting allows. 6486 */ 6487 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 6488 return -EINVAL; 6489 6490 /* 6491 * The sum of our children's runtime should not exceed our own. 
6492 */ 6493 list_for_each_entry_rcu(child, &tg->children, siblings) { 6494 period = ktime_to_ns(child->rt_bandwidth.rt_period); 6495 runtime = child->rt_bandwidth.rt_runtime; 6496 6497 if (child == d->tg) { 6498 period = d->rt_period; 6499 runtime = d->rt_runtime; 6500 } 6501 6502 sum += to_ratio(period, runtime); 6503 } 6504 6505 if (sum > total) 6506 return -EINVAL; 6507 6508 return 0; 6509 } 6510 6511 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 6512 { 6513 int ret; 6514 6515 struct rt_schedulable_data data = { 6516 .tg = tg, 6517 .rt_period = period, 6518 .rt_runtime = runtime, 6519 }; 6520 6521 rcu_read_lock(); 6522 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 6523 rcu_read_unlock(); 6524 6525 return ret; 6526 } 6527 6528 static int tg_set_rt_bandwidth(struct task_group *tg, 6529 u64 rt_period, u64 rt_runtime) 6530 { 6531 int i, err = 0; 6532 6533 /* 6534 * Disallowing the root group RT runtime is BAD; it would disallow the 6535 * kernel creating (and/or operating) RT threads. 6536 */ 6537 if (tg == &root_task_group && rt_runtime == 0) 6538 return -EINVAL; 6539 6540 /* A zero period doesn't make any sense. */ 6541 if (rt_period == 0) 6542 return -EINVAL; 6543 6544 mutex_lock(&rt_constraints_mutex); 6545 read_lock(&tasklist_lock); 6546 err = __rt_schedulable(tg, rt_period, rt_runtime); 6547 if (err) 6548 goto unlock; 6549 6550 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 6551 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 6552 tg->rt_bandwidth.rt_runtime = rt_runtime; 6553 6554 for_each_possible_cpu(i) { 6555 struct rt_rq *rt_rq = tg->rt_rq[i]; 6556 6557 raw_spin_lock(&rt_rq->rt_runtime_lock); 6558 rt_rq->rt_runtime = rt_runtime; 6559 raw_spin_unlock(&rt_rq->rt_runtime_lock); 6560 } 6561 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 6562 unlock: 6563 read_unlock(&tasklist_lock); 6564 mutex_unlock(&rt_constraints_mutex); 6565 6566 return err; 6567 } 6568 6569 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 6570 { 6571 u64 rt_runtime, rt_period; 6572 6573 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 6574 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 6575 if (rt_runtime_us < 0) 6576 rt_runtime = RUNTIME_INF; 6577 6578 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 6579 } 6580 6581 static long sched_group_rt_runtime(struct task_group *tg) 6582 { 6583 u64 rt_runtime_us; 6584 6585 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 6586 return -1; 6587 6588 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 6589 do_div(rt_runtime_us, NSEC_PER_USEC); 6590 return rt_runtime_us; 6591 } 6592 6593 static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 6594 { 6595 u64 rt_runtime, rt_period; 6596 6597 rt_period = rt_period_us * NSEC_PER_USEC; 6598 rt_runtime = tg->rt_bandwidth.rt_runtime; 6599 6600 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 6601 } 6602 6603 static long sched_group_rt_period(struct task_group *tg) 6604 { 6605 u64 rt_period_us; 6606 6607 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 6608 do_div(rt_period_us, NSEC_PER_USEC); 6609 return rt_period_us; 6610 } 6611 #endif /* CONFIG_RT_GROUP_SCHED */ 6612 6613 #ifdef CONFIG_RT_GROUP_SCHED 6614 static int sched_rt_global_constraints(void) 6615 { 6616 int ret = 0; 6617 6618 mutex_lock(&rt_constraints_mutex); 6619 read_lock(&tasklist_lock); 6620 ret = __rt_schedulable(NULL, 0, 0); 6621 read_unlock(&tasklist_lock); 6622 mutex_unlock(&rt_constraints_mutex); 6623 6624 return ret; 6625 } 6626 6627
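/*
 * Illustrative sketch, not part of the original source: how the per-group
 * RT bandwidth setters above fit together. rt_runtime_us/rt_period_us are
 * written in microseconds and converted to nanoseconds before being handed
 * to tg_set_rt_bandwidth(); a negative runtime maps to RUNTIME_INF. The
 * walk in tg_rt_schedulable() compares to_ratio(period, runtime) values, so
 * a request that exceeds the global or parent allowance comes back as
 * -EINVAL/-EBUSY. The helper name below is hypothetical.
 */
#if 0	/* example only -- never compiled */
static void rt_bandwidth_example(struct task_group *tg)
{
	int err;

	/* Allow this group 500ms of RT runtime in every 1s period (50%): */
	err = sched_group_set_rt_period(tg, 1000000);		/* us */
	if (!err)
		err = sched_group_set_rt_runtime(tg, 500000);	/* us */

	/* A negative runtime lifts the limit again (RUNTIME_INF): */
	if (!err)
		err = sched_group_set_rt_runtime(tg, -1);
}
#endif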
static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 6628 { 6629 /* Don't accept realtime tasks when there is no way for them to run */ 6630 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 6631 return 0; 6632 6633 return 1; 6634 } 6635 6636 #else /* !CONFIG_RT_GROUP_SCHED */ 6637 static int sched_rt_global_constraints(void) 6638 { 6639 unsigned long flags; 6640 int i; 6641 6642 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 6643 for_each_possible_cpu(i) { 6644 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 6645 6646 raw_spin_lock(&rt_rq->rt_runtime_lock); 6647 rt_rq->rt_runtime = global_rt_runtime(); 6648 raw_spin_unlock(&rt_rq->rt_runtime_lock); 6649 } 6650 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 6651 6652 return 0; 6653 } 6654 #endif /* CONFIG_RT_GROUP_SCHED */ 6655 6656 static int sched_dl_global_validate(void) 6657 { 6658 u64 runtime = global_rt_runtime(); 6659 u64 period = global_rt_period(); 6660 u64 new_bw = to_ratio(period, runtime); 6661 struct dl_bw *dl_b; 6662 int cpu, ret = 0; 6663 unsigned long flags; 6664 6665 /* 6666 * Here we want to check that the new bandwidth is not being set to a 6667 * value smaller than the bandwidth currently allocated in 6668 * any of the root_domains. 6669 * 6670 * FIXME: Cycling on all the CPUs is overdoing it, but simpler than 6671 * cycling on root_domains... Discussion on different/better 6672 * solutions is welcome! 6673 */ 6674 for_each_possible_cpu(cpu) { 6675 rcu_read_lock_sched(); 6676 dl_b = dl_bw_of(cpu); 6677 6678 raw_spin_lock_irqsave(&dl_b->lock, flags); 6679 if (new_bw < dl_b->total_bw) 6680 ret = -EBUSY; 6681 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 6682 6683 rcu_read_unlock_sched(); 6684 6685 if (ret) 6686 break; 6687 } 6688 6689 return ret; 6690 } 6691 6692 static void sched_dl_do_global(void) 6693 { 6694 u64 new_bw = -1; 6695 struct dl_bw *dl_b; 6696 int cpu; 6697 unsigned long flags; 6698 6699 def_dl_bandwidth.dl_period = global_rt_period(); 6700 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 6701 6702 if (global_rt_runtime() != RUNTIME_INF) 6703 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 6704 6705 /* 6706 * FIXME: As above...
6707 */ 6708 for_each_possible_cpu(cpu) { 6709 rcu_read_lock_sched(); 6710 dl_b = dl_bw_of(cpu); 6711 6712 raw_spin_lock_irqsave(&dl_b->lock, flags); 6713 dl_b->bw = new_bw; 6714 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 6715 6716 rcu_read_unlock_sched(); 6717 } 6718 } 6719 6720 static int sched_rt_global_validate(void) 6721 { 6722 if (sysctl_sched_rt_period <= 0) 6723 return -EINVAL; 6724 6725 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 6726 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 6727 return -EINVAL; 6728 6729 return 0; 6730 } 6731 6732 static void sched_rt_do_global(void) 6733 { 6734 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 6735 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 6736 } 6737 6738 int sched_rt_handler(struct ctl_table *table, int write, 6739 void __user *buffer, size_t *lenp, 6740 loff_t *ppos) 6741 { 6742 int old_period, old_runtime; 6743 static DEFINE_MUTEX(mutex); 6744 int ret; 6745 6746 mutex_lock(&mutex); 6747 old_period = sysctl_sched_rt_period; 6748 old_runtime = sysctl_sched_rt_runtime; 6749 6750 ret = proc_dointvec(table, write, buffer, lenp, ppos); 6751 6752 if (!ret && write) { 6753 ret = sched_rt_global_validate(); 6754 if (ret) 6755 goto undo; 6756 6757 ret = sched_dl_global_validate(); 6758 if (ret) 6759 goto undo; 6760 6761 ret = sched_rt_global_constraints(); 6762 if (ret) 6763 goto undo; 6764 6765 sched_rt_do_global(); 6766 sched_dl_do_global(); 6767 } 6768 if (0) { 6769 undo: 6770 sysctl_sched_rt_period = old_period; 6771 sysctl_sched_rt_runtime = old_runtime; 6772 } 6773 mutex_unlock(&mutex); 6774 6775 return ret; 6776 } 6777 6778 int sched_rr_handler(struct ctl_table *table, int write, 6779 void __user *buffer, size_t *lenp, 6780 loff_t *ppos) 6781 { 6782 int ret; 6783 static DEFINE_MUTEX(mutex); 6784 6785 mutex_lock(&mutex); 6786 ret = proc_dointvec(table, write, buffer, lenp, ppos); 6787 /* 6788 * Make sure that internally we keep jiffies. 6789 * Also, writing zero resets the timeslice to default: 6790 */ 6791 if (!ret && write) { 6792 sched_rr_timeslice = 6793 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : 6794 msecs_to_jiffies(sysctl_sched_rr_timeslice); 6795 } 6796 mutex_unlock(&mutex); 6797 return ret; 6798 } 6799 6800 #ifdef CONFIG_CGROUP_SCHED 6801 6802 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 6803 { 6804 return css ? container_of(css, struct task_group, css) : NULL; 6805 } 6806 6807 static struct cgroup_subsys_state * 6808 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6809 { 6810 struct task_group *parent = css_tg(parent_css); 6811 struct task_group *tg; 6812 6813 if (!parent) { 6814 /* This is early initialization for the top cgroup */ 6815 return &root_task_group.css; 6816 } 6817 6818 tg = sched_create_group(parent); 6819 if (IS_ERR(tg)) 6820 return ERR_PTR(-ENOMEM); 6821 6822 sched_online_group(tg, parent); 6823 6824 return &tg->css; 6825 } 6826 6827 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) 6828 { 6829 struct task_group *tg = css_tg(css); 6830 6831 sched_offline_group(tg); 6832 } 6833 6834 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 6835 { 6836 struct task_group *tg = css_tg(css); 6837 6838 /* 6839 * Relies on the RCU grace period between css_released() and this. 6840 */ 6841 sched_free_group(tg); 6842 } 6843 6844 /* 6845 * This is called before wake_up_new_task(), therefore we really only 6846 * have to set its group bits, all the other stuff does not apply. 
6847 */ 6848 static void cpu_cgroup_fork(struct task_struct *task) 6849 { 6850 struct rq_flags rf; 6851 struct rq *rq; 6852 6853 rq = task_rq_lock(task, &rf); 6854 6855 update_rq_clock(rq); 6856 sched_change_group(task, TASK_SET_GROUP); 6857 6858 task_rq_unlock(rq, task, &rf); 6859 } 6860 6861 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 6862 { 6863 struct task_struct *task; 6864 struct cgroup_subsys_state *css; 6865 int ret = 0; 6866 6867 cgroup_taskset_for_each(task, css, tset) { 6868 #ifdef CONFIG_RT_GROUP_SCHED 6869 if (!sched_rt_can_attach(css_tg(css), task)) 6870 return -EINVAL; 6871 #else 6872 /* We don't support RT-tasks being in separate groups */ 6873 if (task->sched_class != &fair_sched_class) 6874 return -EINVAL; 6875 #endif 6876 /* 6877 * Serialize against wake_up_new_task() such that if it's 6878 * running, we're sure to observe its full state. 6879 */ 6880 raw_spin_lock_irq(&task->pi_lock); 6881 /* 6882 * Avoid calling sched_move_task() before wake_up_new_task() 6883 * has happened. This would lead to problems with PELT, due to 6884 * move wanting to detach+attach while we're not attached yet. 6885 */ 6886 if (task->state == TASK_NEW) 6887 ret = -EINVAL; 6888 raw_spin_unlock_irq(&task->pi_lock); 6889 6890 if (ret) 6891 break; 6892 } 6893 return ret; 6894 } 6895 6896 static void cpu_cgroup_attach(struct cgroup_taskset *tset) 6897 { 6898 struct task_struct *task; 6899 struct cgroup_subsys_state *css; 6900 6901 cgroup_taskset_for_each(task, css, tset) 6902 sched_move_task(task); 6903 } 6904 6905 #ifdef CONFIG_FAIR_GROUP_SCHED 6906 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 6907 struct cftype *cftype, u64 shareval) 6908 { 6909 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 6910 } 6911 6912 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 6913 struct cftype *cft) 6914 { 6915 struct task_group *tg = css_tg(css); 6916 6917 return (u64) scale_load_down(tg->shares); 6918 } 6919 6920 #ifdef CONFIG_CFS_BANDWIDTH 6921 static DEFINE_MUTEX(cfs_constraints_mutex); 6922 6923 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 6924 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 6925 6926 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 6927 6928 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 6929 { 6930 int i, ret = 0, runtime_enabled, runtime_was_enabled; 6931 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 6932 6933 if (tg == &root_task_group) 6934 return -EINVAL; 6935 6936 /* 6937 * Ensure we have at least some amount of bandwidth every period. This is 6938 * to prevent reaching a state of large arrears when throttled via 6939 * entity_tick() resulting in prolonged exit starvation. 6940 */ 6941 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 6942 return -EINVAL; 6943 6944 /* 6945 * Likewise, bound things on the other side by preventing insane quota 6946 * periods. This also allows us to normalize in computing quota 6947 * feasibility. 6948 */ 6949 if (period > max_cfs_quota_period) 6950 return -EINVAL; 6951 6952 /* 6953 * Prevent race between setting of cfs_rq->runtime_enabled and 6954 * unthrottle_offline_cfs_rqs().
6955 */ 6956 get_online_cpus(); 6957 mutex_lock(&cfs_constraints_mutex); 6958 ret = __cfs_schedulable(tg, period, quota); 6959 if (ret) 6960 goto out_unlock; 6961 6962 runtime_enabled = quota != RUNTIME_INF; 6963 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 6964 /* 6965 * If we need to toggle cfs_bandwidth_used, off->on must occur 6966 * before making related changes, and on->off must occur afterwards 6967 */ 6968 if (runtime_enabled && !runtime_was_enabled) 6969 cfs_bandwidth_usage_inc(); 6970 raw_spin_lock_irq(&cfs_b->lock); 6971 cfs_b->period = ns_to_ktime(period); 6972 cfs_b->quota = quota; 6973 6974 __refill_cfs_bandwidth_runtime(cfs_b); 6975 6976 /* Restart the period timer (if active) to handle new period expiry: */ 6977 if (runtime_enabled) 6978 start_cfs_bandwidth(cfs_b); 6979 6980 raw_spin_unlock_irq(&cfs_b->lock); 6981 6982 for_each_online_cpu(i) { 6983 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 6984 struct rq *rq = cfs_rq->rq; 6985 6986 raw_spin_lock_irq(&rq->lock); 6987 cfs_rq->runtime_enabled = runtime_enabled; 6988 cfs_rq->runtime_remaining = 0; 6989 6990 if (cfs_rq->throttled) 6991 unthrottle_cfs_rq(cfs_rq); 6992 raw_spin_unlock_irq(&rq->lock); 6993 } 6994 if (runtime_was_enabled && !runtime_enabled) 6995 cfs_bandwidth_usage_dec(); 6996 out_unlock: 6997 mutex_unlock(&cfs_constraints_mutex); 6998 put_online_cpus(); 6999 7000 return ret; 7001 } 7002 7003 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7004 { 7005 u64 quota, period; 7006 7007 period = ktime_to_ns(tg->cfs_bandwidth.period); 7008 if (cfs_quota_us < 0) 7009 quota = RUNTIME_INF; 7010 else 7011 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7012 7013 return tg_set_cfs_bandwidth(tg, period, quota); 7014 } 7015 7016 long tg_get_cfs_quota(struct task_group *tg) 7017 { 7018 u64 quota_us; 7019 7020 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7021 return -1; 7022 7023 quota_us = tg->cfs_bandwidth.quota; 7024 do_div(quota_us, NSEC_PER_USEC); 7025 7026 return quota_us; 7027 } 7028 7029 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7030 { 7031 u64 quota, period; 7032 7033 period = (u64)cfs_period_us * NSEC_PER_USEC; 7034 quota = tg->cfs_bandwidth.quota; 7035 7036 return tg_set_cfs_bandwidth(tg, period, quota); 7037 } 7038 7039 long tg_get_cfs_period(struct task_group *tg) 7040 { 7041 u64 cfs_period_us; 7042 7043 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7044 do_div(cfs_period_us, NSEC_PER_USEC); 7045 7046 return cfs_period_us; 7047 } 7048 7049 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 7050 struct cftype *cft) 7051 { 7052 return tg_get_cfs_quota(css_tg(css)); 7053 } 7054 7055 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 7056 struct cftype *cftype, s64 cfs_quota_us) 7057 { 7058 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 7059 } 7060 7061 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 7062 struct cftype *cft) 7063 { 7064 return tg_get_cfs_period(css_tg(css)); 7065 } 7066 7067 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 7068 struct cftype *cftype, u64 cfs_period_us) 7069 { 7070 return tg_set_cfs_period(css_tg(css), cfs_period_us); 7071 } 7072 7073 struct cfs_schedulable_data { 7074 struct task_group *tg; 7075 u64 period, quota; 7076 }; 7077 7078 /* 7079 * normalize group quota/period to be quota/max_period 7080 * note: units are usecs 7081 */ 7082 static u64 normalize_cfs_quota(struct task_group *tg, 7083 struct cfs_schedulable_data *d) 7084 { 7085 u64 quota, period; 7086 7087 if (tg 
== d->tg) { 7088 period = d->period; 7089 quota = d->quota; 7090 } else { 7091 period = tg_get_cfs_period(tg); 7092 quota = tg_get_cfs_quota(tg); 7093 } 7094 7095 /* note: these should typically be equivalent */ 7096 if (quota == RUNTIME_INF || quota == -1) 7097 return RUNTIME_INF; 7098 7099 return to_ratio(period, quota); 7100 } 7101 7102 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7103 { 7104 struct cfs_schedulable_data *d = data; 7105 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7106 s64 quota = 0, parent_quota = -1; 7107 7108 if (!tg->parent) { 7109 quota = RUNTIME_INF; 7110 } else { 7111 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7112 7113 quota = normalize_cfs_quota(tg, d); 7114 parent_quota = parent_b->hierarchical_quota; 7115 7116 /* 7117 * Ensure max(child_quota) <= parent_quota, inherit when no 7118 * limit is set: 7119 */ 7120 if (quota == RUNTIME_INF) 7121 quota = parent_quota; 7122 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7123 return -EINVAL; 7124 } 7125 cfs_b->hierarchical_quota = quota; 7126 7127 return 0; 7128 } 7129 7130 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7131 { 7132 int ret; 7133 struct cfs_schedulable_data data = { 7134 .tg = tg, 7135 .period = period, 7136 .quota = quota, 7137 }; 7138 7139 if (quota != RUNTIME_INF) { 7140 do_div(data.period, NSEC_PER_USEC); 7141 do_div(data.quota, NSEC_PER_USEC); 7142 } 7143 7144 rcu_read_lock(); 7145 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7146 rcu_read_unlock(); 7147 7148 return ret; 7149 } 7150 7151 static int cpu_stats_show(struct seq_file *sf, void *v) 7152 { 7153 struct task_group *tg = css_tg(seq_css(sf)); 7154 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7155 7156 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 7157 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 7158 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 7159 7160 return 0; 7161 } 7162 #endif /* CONFIG_CFS_BANDWIDTH */ 7163 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7164 7165 #ifdef CONFIG_RT_GROUP_SCHED 7166 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 7167 struct cftype *cft, s64 val) 7168 { 7169 return sched_group_set_rt_runtime(css_tg(css), val); 7170 } 7171 7172 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 7173 struct cftype *cft) 7174 { 7175 return sched_group_rt_runtime(css_tg(css)); 7176 } 7177 7178 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 7179 struct cftype *cftype, u64 rt_period_us) 7180 { 7181 return sched_group_set_rt_period(css_tg(css), rt_period_us); 7182 } 7183 7184 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 7185 struct cftype *cft) 7186 { 7187 return sched_group_rt_period(css_tg(css)); 7188 } 7189 #endif /* CONFIG_RT_GROUP_SCHED */ 7190 7191 static struct cftype cpu_files[] = { 7192 #ifdef CONFIG_FAIR_GROUP_SCHED 7193 { 7194 .name = "shares", 7195 .read_u64 = cpu_shares_read_u64, 7196 .write_u64 = cpu_shares_write_u64, 7197 }, 7198 #endif 7199 #ifdef CONFIG_CFS_BANDWIDTH 7200 { 7201 .name = "cfs_quota_us", 7202 .read_s64 = cpu_cfs_quota_read_s64, 7203 .write_s64 = cpu_cfs_quota_write_s64, 7204 }, 7205 { 7206 .name = "cfs_period_us", 7207 .read_u64 = cpu_cfs_period_read_u64, 7208 .write_u64 = cpu_cfs_period_write_u64, 7209 }, 7210 { 7211 .name = "stat", 7212 .seq_show = cpu_stats_show, 7213 }, 7214 #endif 7215 #ifdef CONFIG_RT_GROUP_SCHED 7216 { 7217 .name = "rt_runtime_us", 7218 .read_s64 = 
cpu_rt_runtime_read, 7219 .write_s64 = cpu_rt_runtime_write, 7220 }, 7221 { 7222 .name = "rt_period_us", 7223 .read_u64 = cpu_rt_period_read_uint, 7224 .write_u64 = cpu_rt_period_write_uint, 7225 }, 7226 #endif 7227 { } /* Terminate */ 7228 }; 7229 7230 struct cgroup_subsys cpu_cgrp_subsys = { 7231 .css_alloc = cpu_cgroup_css_alloc, 7232 .css_released = cpu_cgroup_css_released, 7233 .css_free = cpu_cgroup_css_free, 7234 .fork = cpu_cgroup_fork, 7235 .can_attach = cpu_cgroup_can_attach, 7236 .attach = cpu_cgroup_attach, 7237 .legacy_cftypes = cpu_files, 7238 .early_init = true, 7239 }; 7240 7241 #endif /* CONFIG_CGROUP_SCHED */ 7242 7243 void dump_cpu_task(int cpu) 7244 { 7245 pr_info("Task dump for CPU %d:\n", cpu); 7246 sched_show_task(cpu_curr(cpu)); 7247 } 7248 7249 /* 7250 * Nice levels are multiplicative, with a gentle 10% change for every 7251 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 7252 * nice 1, it will get ~10% less CPU time than another CPU-bound task 7253 * that remained on nice 0. 7254 * 7255 * The "10% effect" is relative and cumulative: from _any_ nice level, 7256 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 7257 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 7258 * If a task goes up by ~10% and another task goes down by ~10% then 7259 * the relative distance between them is ~25%.) 7260 */ 7261 const int sched_prio_to_weight[40] = { 7262 /* -20 */ 88761, 71755, 56483, 46273, 36291, 7263 /* -15 */ 29154, 23254, 18705, 14949, 11916, 7264 /* -10 */ 9548, 7620, 6100, 4904, 3906, 7265 /* -5 */ 3121, 2501, 1991, 1586, 1277, 7266 /* 0 */ 1024, 820, 655, 526, 423, 7267 /* 5 */ 335, 272, 215, 172, 137, 7268 /* 10 */ 110, 87, 70, 56, 45, 7269 /* 15 */ 36, 29, 23, 18, 15, 7270 }; 7271 7272 /* 7273 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. 7274 * 7275 * In cases where the weight does not change often, we can use the 7276 * precalculated inverse to speed up arithmetics by turning divisions 7277 * into multiplications: 7278 */ 7279 const u32 sched_prio_to_wmult[40] = { 7280 /* -20 */ 48388, 59856, 76040, 92818, 118348, 7281 /* -15 */ 147320, 184698, 229616, 287308, 360437, 7282 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 7283 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 7284 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 7285 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 7286 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7287 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7288 }; 7289
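/*
 * Illustrative sketch, not part of the original source: the two tables
 * above are tied together by
 *
 *	sched_prio_to_wmult[i] ~= 2^32 / sched_prio_to_weight[i]
 *
 * (exactly so at nice 0: 4194304 * 1024 == 2^32), and consecutive weights
 * differ by roughly the 1.25 multiplier mentioned above (1024/820 ~= 1.25).
 * That lets hot paths replace a division by the weight with a multiply and
 * a 32-bit shift, roughly as below; the real __calc_delta() in
 * kernel/sched/fair.c additionally guards against multiplication overflow.
 * The helper name is hypothetical.
 */
#if 0	/* example only -- never compiled */
static u64 div_by_weight_example(u64 delta_exec, int nice)
{
	int idx = nice + 20;	/* table index for nice -20..19 */

	/* delta_exec / weight ~= (delta_exec * wmult) >> 32 */
	return (delta_exec * sched_prio_to_wmult[idx]) >> 32;
}
#endif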
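/*
 * Illustrative sketch, not part of the original source: the CFS bandwidth
 * setters defined earlier in this file (tg_set_cfs_period() and
 * tg_set_cfs_quota(), exposed through the cpu.cfs_period_us and
 * cpu.cfs_quota_us files above) also take microseconds; quota/period is the
 * fraction of a CPU the group may consume, and tg_cfs_schedulable_down()
 * rejects a child that asks for more than its parent. The helper name is
 * hypothetical.
 */
#if 0	/* example only -- never compiled */
static void cfs_bandwidth_example(struct task_group *tg)
{
	int err;

	/* Cap the group at half a CPU: 50ms of runtime every 100ms: */
	err = tg_set_cfs_period(tg, 100000);		/* us */
	if (!err)
		err = tg_set_cfs_quota(tg, 50000);	/* us */

	/* A negative quota removes the cap again (RUNTIME_INF): */
	if (!err)
		err = tg_set_cfs_quota(tg, -1);
}
#endif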